#### Payment organization **[Elo](http://)**, which is operated widely in Brazil offer discounts for card-holders. Elo wants to know that is really these discounts helpful to keep the card-holders happy.

#### Based on target-score we are going to predict whether discounts helpful or not.

#### I will use train & historical_transactions for training and testing purpose.
 
#### As Dependent variable is continuous so I will use regression algorithms starting from basic Linear-Regression model to Gradient boosting models and RMSE as evaluation metric


#### Importing required Libraries

In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import *

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import lightgbm as lgb
import xgboost as xgb

#### Importing data files

In [None]:
train=pd.read_csv("../input/train.csv")
test=pd.read_csv("../input/test.csv")
hist_transactions=pd.read_csv("../input/historical_transactions.csv")

### Data preprocessing

In [42]:
# looking for dimensions of data
train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,month_lag,hist_transactions/card,purchase_amt,installments,first_active_year
0,6,C_ID_92a2005557,5,2,1,-0.820283,-4.0,260,-0.638341,0.0,2017
1,1,C_ID_3d0044924f,4,1,0,0.392913,-5.0,350,-0.600018,2.0,2017
2,8,C_ID_d639edf6cd,2,2,0,0.688056,-9.0,43,-0.678311,0.0,2016
3,9,C_ID_186d6a6901,4,3,0,0.142495,-3.0,77,-0.642745,1.0,2017
4,11,C_ID_cdbd2c0db2,1,3,0,-0.159749,-1.0,133,-0.366073,1.0,2017


In [None]:
test.head()

In [None]:
hist_transactions.head()

In [43]:
train.shape, test.shape, hist_transactions.shape

((201917, 11), (123623, 5), (29112361, 14))

In [None]:
train.info()

In [None]:
test.info()

In [None]:
hist_transactions.info()

In [None]:
train.describe(include='all')

In [None]:
hist_transactions.describe(include='all')

In [None]:
#Checking for NA values in train 
train.isna().sum().plot(kind='barh')
for i, v in enumerate(train.isna().sum()):
    plt.text( v,i, str(v))
plt.title('missing values count')

In [None]:
# Distribution of cards used first-time 

train['first_active_month']=pd.to_datetime(train['first_active_month'])
count = train['first_active_month'].dt.date.value_counts()
count= count.sort_index()
plt.figure(figsize=(14,6))
sns.barplot(count.index, count.values)
plt.xticks(rotation='vertical')
plt.xlabel('First active month')
plt.ylabel('Number of cards')
plt.title("First active month count in train set")
plt.show()

In [None]:
# Checking for the distributions of features using violin plot

# feature 1
plt.figure(figsize=(8,4))
sns.violinplot(x="feature_1", y='target', data=train)
plt.xlabel('Feature 1')
plt.ylabel('target score')
plt.title("Feature 1 distribution")
plt.show()

# feature 2
plt.figure(figsize=(8,4))
sns.violinplot(x="feature_2", y='target', data=train)
plt.xlabel('Feature 2')
plt.ylabel('target score')
plt.title("Feature 2 distribution")
plt.show()
 
# feature 3
plt.figure(figsize=(8,4))
sns.violinplot(x="feature_3", y='target', data=train)
plt.xlabel('Feature 3')
plt.ylabel('target score')
plt.title("Feature 3 distribution")
plt.show()

#### Hist_transactions dataset

month_lag

In [None]:
Avg_month_lag= np.round(hist_transactions.groupby('card_id')['month_lag'].agg('mean').reset_index())
train= pd.merge(train, Avg_month_lag, on="card_id")

Card-id

In [None]:
num_trans = hist_transactions.card_id.value_counts().reset_index()
num_trans.columns = ["card_id", "hist_transactions/card"]
train= pd.merge(train, num_trans, on="card_id")

In [None]:
plt.scatter('hist_transactions/card', 'target', data=train)
plt.xlabel('Number of hist_transactions/card')
plt.ylabel('target score')
plt.title('Number of hist_transactions/card  vs target score')

Purchase_amount

In [None]:
pur_amt = hist_transactions.groupby("card_id")
pur_amt = pur_amt["purchase_amount"].agg('mean').reset_index()
pur_amt.columns = ["card_id", "purchase_amt"]
train= pd.merge(train, pur_amt, on="card_id")

In [None]:
plt.scatter('hist_transactions/card', 'target', data=train)
plt.xlabel('Number of hist_transactions/card')
plt.ylabel('target score')
plt.title('Number of hist_transactions/card  Vs target score')

Installments

In [None]:
installments_percard = np.round(hist_transactions.groupby('card_id')['installments'].agg('mean').reset_index())
train= pd.merge(train, installments_percard, on="card_id")

In [None]:
plt.scatter('installments', 'target', data=train)
plt.xlabel('no.of installments')
plt.ylabel('target score')
plt.title('no.of installments Vs target scre')

first_active_month

In [None]:
train['first_active_month']=pd.to_datetime(train['first_active_month'])

train['first_active_year']=train['first_active_month'].dt.year
train['first_active_month']=train['first_active_month'].dt.month


In [None]:
plt.scatter(range(train.shape[0]), np.sort(train.target))
plt.ylabel('target Score')
plt.title('target-score distribution')

Checking for correlatioin between variables

In [None]:
sns.heatmap(train.corr(), annot=True)
plt.title('Correlation map')

Splitting dataset into train and test sets

In [44]:
train_x=train.drop(['target',  'card_id'], axis=1)
train_y=train['target']

In [45]:
x_train, x_test, y_train, y_test=train_test_split(train_x, train_y, test_size=0.33)

In [46]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((135284, 9), (66633, 9), (135284,), (66633,))

Linear regression model

In [47]:
model=LinearRegression()
model.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [48]:
predict=model.predict(x_test)
predict_train=model.predict(x_train)

In [49]:
print('RMSE test:', np.sqrt(np.mean((predict - y_test)**2)))
print('RMSE train:', np.sqrt(np.mean((predict_train - y_train)**2)))

RMSE test: 3.8402728637749637
RMSE train: 3.851667472259535


Randomforest Regresssor

In [50]:
model_rf=RandomForestRegressor()
model_rf.fit(x_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [51]:
predict_rf=model_rf.predict(x_test)
predict_rf_train=model_rf.predict(x_train)

In [52]:
print('Test RMSE RF:', np.sqrt(np.mean((predict_rf - y_test)**2)))
print('Train RMSE RF:', np.sqrt(np.mean((predict_rf_train - y_train)**2)))

Test RMSE RF: 4.152158818141261
Train RMSE RF: 1.7716493810726186


parameter_tuning in Randomforest Regresssor

In [53]:
Random_Search_Params ={
    'max_features':[1,2,3,4,5,6,7,8,9,10],
    "max_depth": list(range(1,train.shape[1])),
    'n_estimators' : [1, 2, 4, 8, 50, 100,150, 200, 250, 300],
    "min_samples_leaf": [5,10,15,20,25],
    'random_state' : [42] 
    }


random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions= Random_Search_Params, 
    cv=3,
    refit=True,
    verbose=True)

In [54]:
random_search.fit(x_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  5.2min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'n_estimators': [1, 2, 4, 8, 50, 100, 150, 200, 250, 300], 'min_samples_leaf': [5, 10, 15, 20, 25], 'random_state': [42]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=True)

In [55]:
random_search.best_params_

{'random_state': 42,
 'n_estimators': 250,
 'min_samples_leaf': 15,
 'max_features': 6,
 'max_depth': 7}

In [56]:


model_rf_tune=RandomForestRegressor( random_state=42, 
                                     n_estimators=250, min_samples_leaf=15,
                                     max_features=6, max_depth=7 )

In [57]:
model_rf_tune.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=15,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=None, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [58]:
predict_rf_tune=model_rf_tune.predict(x_test)

predict_rf_tune_train=model_rf_tune.predict(x_train)

In [59]:
print('Test RMSE RF_tune_:', np.sqrt(np.mean((predict_rf_tune - y_test)**2)))
print('Train RMSE RF_tune:', np.sqrt(np.mean((predict_rf_tune_train - y_train)**2)))

Test RMSE RF_tune_: 3.793012505695367
Train RMSE RF_tune: 3.8093516128689835


lgb model

In [62]:
params = {'num_leaves': 30,
         'min_data_in_leaf': 20,
         'objective': 'regression',
         'max_depth': 5,
         'learning_rate': 0.01,
         "boosting": "gbrt",
         "metric": 'rmse'}

lgb_model = lgb.LGBMRegressor(**params, n_estimators = 10000,  n_jobs = -1)
lgb_model.fit(x_train, y_train, 
        eval_set=[(x_train, y_train), (x_test, y_test)], eval_metric='rmse',
        verbose=1000, early_stopping_rounds=1000)

Training until validation scores don't improve for 1000 rounds.
[1000]	training's rmse: 3.79487	valid_1's rmse: 3.79283
Early stopping, best iteration is:
[527]	training's rmse: 3.8078	valid_1's rmse: 3.7916


LGBMRegressor(boosting='gbrt', boosting_type='gbdt', class_weight=None,
       colsample_bytree=1.0, importance_type='split', learning_rate=0.01,
       max_depth=5, metric='rmse', min_child_samples=20,
       min_child_weight=0.001, min_data_in_leaf=20, min_split_gain=0.0,
       n_estimators=10000, n_jobs=-1, num_leaves=30,
       objective='regression', random_state=None, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

XGBoost model

In [63]:
xgb_params = {'eta': 0.01,
              'objective': 'reg:linear',
              'max_depth': 6,
              'min_child_weight': 3,
              'subsample': 0.8,
              
              'eval_metric': 'rmse',
              'seed': 11,
              'silent': True}

model_xgb = xgb.XGBRegressor() 
model_xgb.fit(x_train, y_train)

  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [64]:
trainPredict_xgb = model_xgb.predict(x_train)
testPredict_xgb = model_xgb.predict(x_test)

print("xgb test RMSE:", np.sqrt(mean_squared_error(y_test, testPredict_xgb)))
print("xgb train RMSE:", np.sqrt( mean_squared_error(y_train, trainPredict_xgb)))

xgb test RMSE: 3.7951851600758166
xgb train RMSE: 3.8219666616815324
