In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection, metrics
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.metrics import accuracy_score


from IPython.display import display # Allows the use of display() for DataFrames

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv('../input/santander-value-prediction-challenge/train.csv')
test_df = pd.read_csv('../input/santander-value-prediction-challenge/test.csv', nrows=100)

In [3]:
train_df.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [4]:
test_df.head()

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# print("All Features in Train data with NaN Values =", str(train_df.columns[train_df.isnull().sum() != 0].size) )
# print("All Features in Test data with NaN Values =", str(test_df.columns[train_df.isnull().sum() != 0].size) )

## Remove constant columns from data

In [6]:
const_columns_to_remove = []
for col in train_df.columns:
    if col != 'ID' and col != 'target':
        if train_df[col].std() == 0:
            const_columns_to_remove.append(col)

# Now remove that array of const columns from the data
train_df.drop(const_columns_to_remove, axis=1, inplace=True)
test_df.drop(const_columns_to_remove, axis=1, inplace=True)

## Remove Duplicate Columns

In [7]:
train_df = train_df.loc[:,~train_df.columns.duplicated()]

### handle sparse data

In [8]:
def drop_parse_from_df(df):
    column_list_to_drop_data_from = [i for i in df.columns if not i in ['ID', 'target'] ]
    for column in column_list_to_drop_data_from:
        if len(np.unique(df[column])) < 2:
            df.drop(column, axis=1, inplace=True)
            df.drop(column, axis=1, inplace=True)
    return df


train_df = drop_parse_from_df(train_df)
print('Rows and Columns in train_df after removing sparse ', format(train_df.shape))


Rows and Columns in train_df after removing sparse  (4459, 4737)


### Split data into Train and Test for Model Training

In [9]:
X_train = train_df.drop(['ID', 'target'], axis=1)

y_train = np.log1p(train_df['target'].values)

X_test_original = test_df.drop('ID', axis=1)

X_train_split, X_validation, y_train_split, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [10]:
## LightGBM

In [11]:
def light_gbm_model_run(train_x, train_y, validation_x, validation_y, test_x):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 100,
        "learning_rate" : 0.001,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }
    # Given its a regression case, I am using the RMSE as the metric.

    lg_train = lgb.Dataset(train_x, label=train_y)
    lg_validation = lgb.Dataset(validation_x, label=validation_y)
    evals_result_lgbm = {}

    model_light_gbm = lgb.train(params, lg_train, 5000,
                      valid_sets=[lg_train, lg_validation],
                      early_stopping_rounds=100,
                      verbose_eval=150,
                      evals_result=evals_result_lgbm )

    pred_test_light_gbm = np.expm1(model_light_gbm.predict(test_x, num_iteration=model_light_gbm.best_iteration ))

    return pred_test_light_gbm, model_light_gbm, evals_result_lgbm

In [12]:
# Training and output of LightGBM Model
predictions_test_y_light_gbm, model, evals_result = light_gbm_model_run(X_train_split, y_train_split, X_validation, y_validation, X_test_original)
print('Output of LightGBM Model training..')



Training until validation scores don't improve for 100 rounds
[150]	training's rmse: 1.66447	valid_1's rmse: 1.63996
[300]	training's rmse: 1.5765	valid_1's rmse: 1.5927
[450]	training's rmse: 1.49849	valid_1's rmse: 1.55466
[600]	training's rmse: 1.42919	valid_1's rmse: 1.52339
[750]	training's rmse: 1.36631	valid_1's rmse: 1.49837
[900]	training's rmse: 1.30931	valid_1's rmse: 1.47791
[1050]	training's rmse: 1.25734	valid_1's rmse: 1.46143
[1200]	training's rmse: 1.20984	valid_1's rmse: 1.44818
[1350]	training's rmse: 1.16678	valid_1's rmse: 1.43796
[1500]	training's rmse: 1.12698	valid_1's rmse: 1.42969
[1650]	training's rmse: 1.09049	valid_1's rmse: 1.42292
[1800]	training's rmse: 1.05661	valid_1's rmse: 1.41849
[1950]	training's rmse: 1.02528	valid_1's rmse: 1.41488
[2100]	training's rmse: 0.995869	valid_1's rmse: 1.41222
[2250]	training's rmse: 0.968211	valid_1's rmse: 1.40996
[2400]	training's rmse: 0.941985	valid_1's rmse: 1.40807
[2550]	training's rmse: 0.917269	valid_1's rmse

In [13]:
# print('predictions_test_y_light_gbm is ', predictions_test_y_light_gbm.shape)
# print('y_validation is ', y_validation.shape)

In [14]:
def xgb_model_run(train_x, train_y, validation_x, validation_y, test_x):
    params = {
        'objective': 'reg:squarederror',
          'eval_metric': 'rmse',
          'eta': 0.001,
          'max_depth': 10,
          'subsample': 0.6,
          'colsample_bytree': 0.6,
          'alpha':0.001,
          'random_state': 42

    }

    training_data = xgb.DMatrix(train_x, train_y)
    validation_data = xgb.DMatrix(validation_x, validation_y)

    watchlist = [(training_data, 'train'), (validation_data, 'valid')]

    model_xgb = xgb.train(params, training_data, 50, watchlist, maximize=False, early_stopping_rounds=100, verbose_eval=100 )

    data_test = xgb.DMatrix(test_x)
    predict_test_xgb = np.expm1(model_xgb.predict(data_test, ntree_limit=model_xgb.best_ntree_limit ) )

    return predict_test_xgb, model_xgb

### Training XGB
predictions_test_y_xgb, model_xgb = xgb_model_run(X_train_split, y_train_split, X_validation, y_validation, X_test_original)
print('Completion of XGB Training!!')

[0]	train-rmse:14.08765	valid-rmse:14.07678
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[49]	train-rmse:13.42470	valid-rmse:13.41331
Completion of XGB Training!!


In [15]:
model_catboost = CatBoostRegressor(iterations=500,
                                   learning_rate=0.01,
                                   depth=10,
                                   eval_metric='RMSE',
                                   random_seed = 42,
                                   bagging_temperature=0.2,
                                   od_type='Iter',
                                   metric_period=50,
                                   od_wait=20
                                   )

model_catboost.fit(X_train_split, y_train_split,
                   eval_set=(X_validation, y_validation),
                   use_best_model=True,
                   verbose=50
                   )

predictions_test_y_catboost = np.expm1(model_catboost.predict(X_test_original))



0:	learn: 1.7614936	test: 1.6943084	best: 1.6943084 (0)	total: 932ms	remaining: 7m 44s
50:	learn: 1.6747126	test: 1.6330977	best: 1.6330977 (50)	total: 43.8s	remaining: 6m 25s
100:	learn: 1.6096948	test: 1.5902474	best: 1.5902474 (100)	total: 1m 29s	remaining: 5m 52s
150:	learn: 1.5572985	test: 1.5589053	best: 1.5589053 (150)	total: 2m 17s	remaining: 5m 17s
200:	learn: 1.5167332	test: 1.5382647	best: 1.5382647 (200)	total: 3m 7s	remaining: 4m 39s
250:	learn: 1.4827385	test: 1.5217760	best: 1.5217760 (250)	total: 3m 58s	remaining: 3m 57s
300:	learn: 1.4545436	test: 1.5093004	best: 1.5093004 (300)	total: 4m 44s	remaining: 3m 7s
350:	learn: 1.4296056	test: 1.4994080	best: 1.4994080 (350)	total: 5m 32s	remaining: 2m 21s
400:	learn: 1.4078157	test: 1.4916401	best: 1.4916401 (400)	total: 6m 16s	remaining: 1m 32s
450:	learn: 1.3911783	test: 1.4861881	best: 1.4861881 (450)	total: 7m	remaining: 45.7s
499:	learn: 1.3751665	test: 1.4807567	best: 1.4807567 (499)	total: 7m 44s	remaining: 0us

bestT

In [16]:
submisstion_final = pd.read_csv('../input/santander-value-prediction-challenge/sample_submission.csv')

submission_lgb = pd.DataFrame()
submission_lgb['target'] = predictions_test_y_light_gbm

# sub['target'] = submission_lgb['target']

submission_xgb = pd.DataFrame()
submission_xgb['target'] = predictions_test_y_xgb

submission_catboost = pd.DataFrame()
submission_catboost['target'] = predictions_test_y_catboost

submisstion_final['target'] = (submission_lgb['target'] * 0.5 + submission_xgb['target'] * 0.3 + submission_catboost['target'] * 0.2)

KeyError: 'tartet'

In [None]:
print(submisstion_final.head())
submisstion_final.to_csv('submission_combined_lgb_xgb_catboost.csv', index=False)