In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
# Switch on seaborn's default styling for the figures drawn in this notebook.
sns.set()

In [None]:
# Load the competition's training and test CSV files from the Kaggle input directory.
train_data = pd.read_csv('/kaggle/input/hackerearth-how-not-to-lose-a-customer-in-10-days/train.csv')
test_data = pd.read_csv('/kaggle/input/hackerearth-how-not-to-lose-a-customer-in-10-days/test.csv')

In [None]:
# Build a pandas-profiling report for the training frame and save it as HTML.
report_train = pandas_profiling.ProfileReport(train_data)
report_train.to_file("report_train.html")

# Bare last expression so the notebook renders the report inline.
report_train

In [None]:
# Build the same profiling report for the test frame and save it as HTML.
report_test = pandas_profiling.ProfileReport(test_data)
report_test.to_file("report_test.html")

# Bare last expression so the notebook renders the report inline.
report_test

In [None]:
# Column dtypes and non-null counts of the raw training frame.
train_data.info()


In [None]:
# Stack train on top of test so cleaning is applied to both at once.
# NOTE(review): no ignore_index is passed, so each frame keeps its own row
# labels; if both files were read with the default index, labels repeat in
# `merged`. Label-based row assignment on it would then touch several rows.
merged = pd.concat([train_data,test_data],axis = 0)
# Count how many columns there are of each dtype.
merged.dtypes.value_counts()

In [None]:
# Treat each known placeholder ('Unknown', '?', -999, 'Error') as a missing value.
merged = (
    merged
    .replace('Unknown', np.nan)
    .replace('?', np.nan)
    .replace(-999, np.nan)
    .replace('Error', np.nan)
)

In [None]:
# Re-check non-null counts now that the placeholders have become NaN.
merged.info()

In [None]:
# For every column that contains at least one null, add a boolean
# `<column>_was_missing` indicator column.
for col in list(merged.columns):
    null_mask = merged[col].isnull()
    if null_mask.any():
        merged[col + '_was_missing'] = null_mask

In [None]:
# List the column labels, including the `_was_missing` indicators added earlier.
merged.columns

In [None]:
# Indicator columns to convert from boolean to integer 0/1.
columns = [
    'gender_was_missing',
    'region_category_was_missing',
    'joined_through_referral_was_missing',
    'preferred_offer_types_was_missing',
    'medium_of_operation_was_missing',
    'days_since_last_login_was_missing',
    'avg_frequency_login_days_was_missing',
    'points_in_wallet_was_missing',
]

# Cast all of them in one call instead of column by column.
merged = merged.astype({flag_col: int for flag_col in columns})

In [None]:
# Remove the missing-indicator column that was generated for the target.
merged = merged.drop(columns=['churn_risk_score_was_missing'])


In [None]:
# Split the combined frame back into its train and test parts by position.
# The boundary comes from the raw training frame rather than a hard-coded row
# count, so the split stays correct if the input files change.
n_train_rows = len(train_data)
train_data_mod = merged.iloc[:n_train_rows, :]
# The test rows carry no target, so that column is dropped from the test part.
test_data_mod = merged.iloc[n_train_rows:, :].drop(columns=['churn_risk_score'])

In [None]:
# Work on an independent copy so the positional slice of `merged` is not modified.
exp_tf_filtered = train_data_mod.copy()


In [None]:
# List the columns of the training working copy.
exp_tf_filtered.columns

In [None]:
# Columns whose missing values are filled with the training median.
cols_by_median = ['days_since_last_login','avg_frequency_login_days', 'points_in_wallet']

In [None]:
# Fill missing values with the median of the TRAINING column, for both frames.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection, which is deprecated and has no effect under pandas
# copy-on-write.
for col in cols_by_median:
    train_median = exp_tf_filtered[col].median()
    exp_tf_filtered[col] = exp_tf_filtered[col].fillna(train_median)
    test_data_mod[col] = test_data_mod[col].fillna(train_median)


In [None]:
# Columns whose missing values are filled with the most frequent training value.
cols_by_mode = ['gender','region_category','joined_through_referral',
                'preferred_offer_types','medium_of_operation']

In [None]:
# Fill missing values with the most frequent value of the TRAINING column,
# for both frames. The result is assigned back instead of calling
# fillna(inplace=True) on a column selection, which is deprecated and has no
# effect under pandas copy-on-write.
for col in cols_by_mode:
    train_mode = exp_tf_filtered[col].mode()[0]
    exp_tf_filtered[col] = exp_tf_filtered[col].fillna(train_mode)
    test_data_mod[col] = test_data_mod[col].fillna(train_mode)

In [None]:
# Check the non-null counts of the training frame after imputation.
exp_tf_filtered.info()


In [None]:
# Check the non-null counts of the test frame after imputation.
test_data_mod.info()


In [None]:
# Independent copy of the imputed training frame for the feature-engineering steps.
df = exp_tf_filtered.copy()


In [None]:
# Take the label column out of the training frame in one step
# (pop returns the column and removes it from `df`).
target = df.pop('churn_risk_score')
# Stack train and test features again so they are engineered identically.
cooking_data = pd.concat([df, test_data_mod], axis=0)

In [None]:
# Inspect the recombined feature frame.
cooking_data.info()


In [None]:
# Drop the identifier-like columns from the feature frame.
cooking_data = cooking_data.drop(columns=['customer_id', 'Name', 'security_no', 'referral_id'])

In [None]:
# Independent copy that the date/time and encoding steps modify.
unencoded_data = cooking_data.copy()


In [None]:
# Fixed reference date; membership length is measured against it in a later cell.
unencoded_data['current_date'] = pd.Timestamp('2020-12-31')


In [None]:
# The two date columns whose difference becomes a feature.
date_data1 = unencoded_data['current_date']
date_data2 = unencoded_data['joining_date']

In [None]:
# Parse both columns to datetime so they can be subtracted.
date_data1 = pd.to_datetime(date_data1)
date_data2 = pd.to_datetime(date_data2)


In [None]:
# Days between the reference date and the joining date.
# The timedelta is converted to integer nanoseconds, scaled to days, then
# cast to int.
NANOSECONDS_PER_DAY = 24*60*60*1000000000
elapsed_ns = pd.to_numeric(date_data1 - date_data2)
days_diff = (elapsed_ns/NANOSECONDS_PER_DAY).astype(int)

In [None]:
# Convert the login-frequency column to a numeric dtype.
unencoded_data["avg_frequency_login_days"] = pd.to_numeric(unencoded_data["avg_frequency_login_days"])


In [None]:
# Create the column that will hold the last-visit time as seconds since midnight.
unencoded_data['last_visit_time_sec'] = None

In [None]:
# Kept from the original cell: silences SettingWithCopyWarning for later cells.
pd.set_option('mode.chained_assignment', None)

# Convert 'HH:MM:SS' strings to seconds since midnight, vectorised and by
# position. The previous row loop wrote `series[index] = value` by LABEL; the
# train and test parts of this frame share row labels, so each write hit every
# row carrying that label and train rows were overwritten with test values.
visit_time = unencoded_data['last_visit_time'].str
hours = visit_time[:2].astype(int).to_numpy()
minutes = visit_time[3:5].astype(int).to_numpy()
seconds = visit_time[6:8].astype(int).to_numpy()
# Assigning a NumPy array is positional, so repeated row labels are harmless.
unencoded_data['last_visit_time_sec'] = hours*60*60 + minutes*60 + seconds
    
    


In [None]:
# Make sure the seconds column is int64 so the numeric dtype selection picks it up.
unencoded_data['last_visit_time_sec'] = unencoded_data['last_visit_time_sec'].astype('int64')

In [None]:
# The raw time string is no longer needed once the seconds column exists.
unencoded_data = unencoded_data.drop(columns=['last_visit_time'])

In [None]:
# Inspect dtypes after the date/time conversions.
unencoded_data.info()

In [None]:
# Replace avg_time_spent with its absolute value, removing any negative sign.
unencoded_data['avg_time_spent'] = unencoded_data['avg_time_spent'].abs()

In [None]:
# Categorical columns to one-hot encode; each column name is also its dummy prefix.
categorical_cols = ['gender', 'region_category', 'preferred_offer_types', 'medium_of_operation',
                    'internet_option', 'complaint_status', 'feedback', 'membership_category']
# drop_first=True drops the first level of each category.
one_hot_data = pd.get_dummies(
    unencoded_data[categorical_cols],
    drop_first=True,
    prefix=categorical_cols,
).astype('int64')

In [None]:
# Preview the one-hot encoded columns.
one_hot_data

In [None]:
# Map the Yes/No columns to 1/0 on a fresh selection of the frame.
yes_no_cols = ['joined_through_referral', 'used_special_discount', 'offer_application_preference', 'past_complaint']
bool_data = unencoded_data[yes_no_cols].replace(to_replace=['Yes', 'No'], value=[1, 0])

In [None]:
# Keep only the int64/float64 columns of the frame.
num_data = unencoded_data.select_dtypes(include = ['int64', 'float64'])


In [None]:
# Preview the numeric feature columns.
num_data

In [None]:
# Assemble the final feature matrix column-wise from the four feature groups.
cooked_data = pd.concat([num_data, one_hot_data, bool_data, days_diff],axis = 1)
# Give the column labelled 0 a readable name.
# NOTE(review): presumably that column is the unnamed `days_diff` series; confirm.
cooked_data = cooked_data.rename(columns = {0:"Days_diff"}) 

In [None]:
# Inspect the assembled feature matrix.
cooked_data.info()

In [None]:
# Split the engineered matrix back into train and test rows by position.
# The boundary comes from the raw training frame rather than a hard-coded row
# count, so the split stays correct if the input files change.
n_train_rows = len(train_data)
train_prepared = cooked_data.iloc[:n_train_rows, :]
test_prepared = cooked_data.iloc[n_train_rows:, :]

In [None]:
# Class distribution of the target.
target.value_counts()

In [None]:
# Replace each label with its absolute value.
# NOTE(review): this merges any negative label into the matching positive class; confirm that is intended.
target = target.abs()

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only; the test rows are transformed with it later.
transformer = StandardScaler().fit(train_prepared)

In [None]:
# Apply the train-fitted scaling to both parts.
scaled_train_data = transformer.transform(train_prepared)
scaled_test_data = transformer.transform(test_prepared)

In [None]:
# Wrap the scaled arrays back into DataFrames with the original column and row labels.
scaled_train_data = pd.DataFrame(data = scaled_train_data, columns = train_prepared.columns, index = train_prepared.index)
scaled_test_data = pd.DataFrame(data = scaled_test_data, columns = test_prepared.columns, index = test_prepared.index)

In [None]:
# Aliases used by the analysis and modelling cells.
X = scaled_train_data
y = target  
# Features and target side by side, for the correlation analysis.
data = pd.concat([X, y],axis = 1)
# Second name for the target; the scoring helpers read this one.
train_labels_mod = target

In [None]:
# Pairwise correlations between every scaled feature and the target.
corrmat = data.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# `top_corr_features` is every row label of `corrmat`, so the matrix computed
# above is plotted directly instead of being recomputed.
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
import matplotlib.pyplot as plt
# Rank the features with an extra-trees regressor fitted on the scaled training data.
# A fixed random_state keeps the ranking, and the top-20 subset selected from
# it later in the notebook, identical from run to run.
model = ExtraTreesRegressor(random_state=43)
model.fit(X,y)
# Impurity-based importances exposed by the fitted ensemble.
print(model.feature_importances_)
# Label the importances with the feature names and plot the 20 largest.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.show()

In [None]:
# Show up to the 50 largest importances as numbers.
feat_importances.nlargest(50)

In [None]:
# Shared random seed, passed as random_state to the estimators and splits in later cells.
seed = 43
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC 
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.multiclass import OneVsRestClassifier


In [None]:
# One instance of every candidate classifier, mostly with default hyper-parameters.
# `seed` is passed as random_state on the calls below that set it; lr and
# linsvc are created without a random_state.
lr = LogisticRegression(multi_class='ovr')
linsvc = LinearSVC(multi_class='ovr')
mlp = MLPClassifier(random_state=seed, early_stopping=True)
bnb = BernoulliNB()
gnb = GaussianNB()
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()
ridge = RidgeClassifier(random_state=seed)
dt = DecisionTreeClassifier(random_state=seed)
et = ExtraTreeClassifier(random_state=seed)
rf = RandomForestClassifier(random_state=seed, n_jobs=-1)
ets = ExtraTreesClassifier(random_state=seed, n_jobs=-1)
gboost = GradientBoostingClassifier(random_state=seed)
kn = KNeighborsClassifier(n_jobs=-1)
nc = NearestCentroid()
xgboost = XGBClassifier(random_state=seed, n_jobs=-1)
lgbm = LGBMClassifier(random_state=seed, n_jobs=-1)


In [None]:
def train_test_split_score(model):
    """Fit `model` on 80% of the scaled training data and score it on the rest.

    Reads the notebook globals `scaled_train_data`, `train_labels_mod` and
    `seed`. The estimator is fitted in place, so it stays trained on the
    80% split after the call.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`.

    Returns
    -------
    float
        Macro-averaged F1 on the held-out 20%.
    """
    from sklearn.metrics import f1_score
    from sklearn.model_selection import train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(
        scaled_train_data, train_labels_mod, test_size=0.2, random_state=seed)
    model.fit(X_train, Y_train)
    prediction = model.predict(X_test)
    # f1_score takes (y_true, y_pred) in that order. The result is returned
    # directly so the imported function is not shadowed by its own value.
    return f1_score(Y_test, prediction, average='macro')

In [None]:
# Score every candidate on the hold-out split.
# The order of this list must match the labels used when the scores are plotted.
models = [lr, linsvc, mlp, bnb, gnb, lda, qda, ridge, dt, et, rf, ets, gboost, kn, nc, xgboost, lgbm]
train_test_split_f1 = []

for model in models:
    # Print the estimator so progress is visible while fitting.
    print(model)
    train_test_split_f1.append(train_test_split_score(model))

In [None]:
# Hold-out macro F1 per model; the labels follow the order of the `models` list.
train_test_score = pd.DataFrame(data = train_test_split_f1, columns = ['Train_Test_F1'])
train_test_score.index = ['Logistic Reg','LinearSVC', 'MLPClassifier', 'BernoulliNB', 'GaussianNB', 'LinearDiscriminantAnalysis',
                          'QuadraticDiscriminantAnalysis', 'RidgeClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                          'RandomForestClassifier', 'ExtraTreesClassifier', 'GradientBoostingClassifier', 'KNeighborsClassifier',
                          'NearestCentroid', 'XGBClassifier', 'LGBMClassifier']
# x and y are passed by keyword: recent seaborn releases reject positional data vectors.
sns.scatterplot(x=train_test_score.index, y=train_test_score['Train_Test_F1'])

In [None]:
# Scores sorted from best to worst (at most 20 rows).
train_test_score.Train_Test_F1.nlargest(20)

In [None]:
# Soft-voting ensemble over four of the candidate estimators.
eclf1 = VotingClassifier(estimators=[
         ('gb', gboost), ('lgbm', lgbm), ('xgb', xgboost), ('rf', rf)], voting='soft', verbose=2)

In [None]:
# Hold-out macro F1 of the voting ensemble.
train_test_split_score(eclf1)

In [None]:
# Refit the ensemble on all training rows before predicting the test set.
eclf1.fit(X,y)

In [None]:
# Predict the test set with the ensemble and write the submission file.
preds = eclf1.predict(scaled_test_data)
# Customer ids come from the raw test frame, in the same row order as the scaled test features.
Id = test_data['customer_id'].values
d = {'customer_id': Id , 'churn_risk_score': preds}
submission = pd.DataFrame(data=d)
submission.to_csv('votingclf.csv', index = False)


In [None]:
def pred_sub(model, i, full_train=False):
    """Predict the test set with `model` and write a submission CSV.

    Reads the notebook globals `scaled_train_data`, `train_labels_mod`,
    `scaled_test_data` and `test_data`.

    Parameters
    ----------
    model : estimator exposing `predict` (and `fit` when `full_train` is set).
    i : value appended to the file name to tell submissions apart.
    full_train : bool, default False
        When true, refit `model` on the complete training data first;
        otherwise `model` must already be fitted.

    The file is named after the first 10 characters of `str(model)` followed
    by `i` and '.csv'. Nothing is returned.
    """
    # Plain truthiness test instead of comparing to True with `==`.
    if full_train:
        model.fit(scaled_train_data, train_labels_mod)
    preds = model.predict(scaled_test_data)
    Id = test_data['customer_id'].values
    submission = pd.DataFrame(data={'customer_id': Id, 'churn_risk_score': preds})
    submission.to_csv(str(model)[:10]+str(i)+'.csv', index = False)
    

In [None]:
# Write a submission from lgbm as it is currently fitted (full_train is left at its default of False).
pred_sub(lgbm, 2)

In [None]:
def cross_validate(model):
    """Return the mean macro F1 of `model` over 10-fold cross-validation.

    Reads the notebook globals `scaled_train_data` and `train_labels_mod`.
    Each fold score is rounded to 5 decimals before the mean is taken.
    """
    from sklearn.model_selection import cross_val_score
    fold_scores = cross_val_score(
        model,
        scaled_train_data,
        train_labels_mod,
        scoring = 'f1_macro',
        cv = 10,
        n_jobs = -1,
    )
    return np.round(fold_scores, 5).mean()

In [None]:
# Cross-validate every candidate; linsvc is not in this list.
# The order must match the labels used when the scores are plotted.
models = [lr, mlp, bnb, gnb, lda, qda, ridge, dt, et, rf, ets, gboost, kn, nc, xgboost, lgbm]
cross_val_scores = []
for model in models:
    # Print the estimator so progress is visible while fitting.
    print(model)
    cross_val_scores.append(cross_validate(model))

In [None]:
# Mean cross-validated macro F1 per model; the labels follow the order of the `models` list.
x_val_score = pd.DataFrame(data = cross_val_scores, columns = ['Cross Validation Scores (F1)'])
x_val_score.index = ['Logistic Reg', 'MLPClassifier', 'BernoulliNB', 'GaussianNB', 'LinearDiscriminantAnalysis',
                          'QuadraticDiscriminantAnalysis', 'RidgeClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                          'RandomForestClassifier', 'ExtraTreesClassifier', 'GradientBoostingClassifier', 'KNeighborsClassifier',
                          'NearestCentroid', 'XGBClassifier', 'LGBMClassifier']
x_val_score = x_val_score.round(5)
# The data is passed by keyword (recent seaborn releases reject positional
# vectors) and without rebinding the notebook-level `y`, which holds the
# training target that other cells fit on.
sns.scatterplot(x=x_val_score.index, y=x_val_score['Cross Validation Scores (F1)'])

In [None]:
# Cross-validation scores sorted from best to worst (at most 20 rows).
x_val_score['Cross Validation Scores (F1)'].nlargest(20)

In [None]:
from imblearn.over_sampling import SMOTE
# Seed the oversampler so the synthetic samples, and every score computed on
# them, can be reproduced from run to run.
sm = SMOTE(random_state=seed)
# Oversample the full training set so every class is equally represented.
scaled_train_data_res, train_labels_mod_res = sm.fit_resample(scaled_train_data, train_labels_mod )

In [None]:
def train_test_split_score(model):
    """Score `model` on a stratified hold-out split after oversampling the training part.

    Reads the notebook globals `scaled_train_data`, `train_labels_mod`,
    `seed` and the SMOTE instance `sm`. Only the 80% training part is
    resampled; the held-out 20% is left untouched. The estimator is fitted
    in place on the resampled data.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`.

    Returns
    -------
    float
        Macro-averaged F1 on the held-out 20%.
    """
    from sklearn.metrics import f1_score
    from sklearn.model_selection import train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(
        scaled_train_data, train_labels_mod, test_size=0.2,
        stratify=train_labels_mod, random_state=seed)
    X_train_res, Y_train_res = sm.fit_resample(X_train, Y_train)
    model.fit(X_train_res, Y_train_res)
    prediction = model.predict(X_test)
    # f1_score takes (y_true, y_pred) in that order. The result is returned
    # directly so the imported function is not shadowed by its own value.
    return f1_score(Y_test, prediction, average='macro')

In [None]:
# Re-score every candidate with the train_test_split_score defined most recently.
# The order of this list must match the labels used when the scores are plotted.
models = [lr, linsvc, mlp, bnb, gnb, lda, qda, ridge, dt, et, rf, ets, gboost, kn, nc, xgboost, lgbm]
train_test_split_f1 = []

for model in models:
    # Print the estimator so progress is visible while fitting.
    print(model)
    train_test_split_f1.append(train_test_split_score(model))

In [None]:
# Hold-out macro F1 per model; the labels follow the order of the `models` list.
train_test_score = pd.DataFrame(data = train_test_split_f1, columns = ['Train_Test_F1'])
train_test_score.index = ['Logistic Reg','LinearSVC', 'MLPClassifier', 'BernoulliNB', 'GaussianNB', 'LinearDiscriminantAnalysis',
                          'QuadraticDiscriminantAnalysis', 'RidgeClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                          'RandomForestClassifier', 'ExtraTreesClassifier', 'GradientBoostingClassifier', 'KNeighborsClassifier',
                          'NearestCentroid', 'XGBClassifier', 'LGBMClassifier']
# x and y are passed by keyword: recent seaborn releases reject positional data vectors.
sns.scatterplot(x=train_test_score.index, y=train_test_score['Train_Test_F1'])

In [None]:
# Scores sorted from best to worst (at most 20 rows).
train_test_score.Train_Test_F1.nlargest(20)

In [None]:
# Predict the test set with `rf` as it is currently fitted and write the submission file.
# NOTE(review): in notebook order rf was last fitted inside train_test_split_score,
# i.e. on the training part of a hold-out split, not on all rows; confirm that is intended.
preds = rf.predict(scaled_test_data)
Id = test_data['customer_id'].values
d = {'customer_id': Id , 'churn_risk_score': preds}
submission = pd.DataFrame(data=d)
submission.to_csv('rf1.csv', index = False)

In [None]:
# Keep only the 20 features with the largest importances.
# NOTE(review): the `_10` suffix of the variable does not match the 20 columns selected.
cols = feat_importances.nlargest(20).index
scaled_train_data_10 = scaled_train_data[cols]

In [None]:
def train_test_split_score(model):
    """Fit `model` on 80% of the reduced feature set and score it on the rest.

    Reads the notebook globals `scaled_train_data_10` (the selected feature
    subset), `train_labels_mod` and `seed`. The estimator is fitted in
    place, so afterwards it only accepts inputs with those columns.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`.

    Returns
    -------
    float
        Macro-averaged F1 on the held-out 20%.
    """
    from sklearn.metrics import f1_score
    from sklearn.model_selection import train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(
        scaled_train_data_10, train_labels_mod, test_size=0.2, random_state=seed)
    model.fit(X_train, Y_train)
    prediction = model.predict(X_test)
    # f1_score takes (y_true, y_pred) in that order. The result is returned
    # directly so the imported function is not shadowed by its own value.
    return f1_score(Y_test, prediction, average='macro')

In [None]:
# Re-score every candidate with the train_test_split_score defined most recently.
# The order of this list must match the labels used when the scores are plotted.
models = [lr, linsvc, mlp, bnb, gnb, lda, qda, ridge, dt, et, rf, ets, gboost, kn, nc, xgboost, lgbm]
train_test_split_f1 = []

for model in models:
    # Print the estimator so progress is visible while fitting.
    print(model)
    train_test_split_f1.append(train_test_split_score(model))

In [None]:
# Hold-out macro F1 per model; the labels follow the order of the `models` list.
train_test_score = pd.DataFrame(data = train_test_split_f1, columns = ['Train_Test_F1'])
train_test_score.index = ['Logistic Reg','LinearSVC', 'MLPClassifier', 'BernoulliNB', 'GaussianNB', 'LinearDiscriminantAnalysis',
                          'QuadraticDiscriminantAnalysis', 'RidgeClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                          'RandomForestClassifier', 'ExtraTreesClassifier', 'GradientBoostingClassifier', 'KNeighborsClassifier',
                          'NearestCentroid', 'XGBClassifier', 'LGBMClassifier']
# x and y are passed by keyword: recent seaborn releases reject positional data vectors.
sns.scatterplot(x=train_test_score.index, y=train_test_score['Train_Test_F1'])

In [None]:
# Scores sorted from best to worst (at most 20 rows).
train_test_score.Train_Test_F1.nlargest(20)

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
# Fixed 80/20 hold-out split, reused by the tuning cells that follow.
X_train, X_test, y_train, y_test = train_test_split(scaled_train_data, train_labels_mod, test_size = 0.2, random_state = seed)
# Baseline gradient-boosting model to compare the tuned variants against.
baseline = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
baseline.fit(X_train,y_train)
# Iterating a DataFrame yields its column labels, so this is the feature-name list.
predictors=list(X_train)
feat_imp = pd.Series(baseline.feature_importances_, predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features')
plt.ylabel('Feature Importance Score')
# "test set" in the message refers to the held-out 20% (X_test), not the competition test data.
print('Accuracy of the GBM on test set: {:.3f}'.format(baseline.score(X_test, y_test)))
pred=baseline.predict(X_test)
print(classification_report(y_test, pred))

In [None]:
%%time
# Tuning step: grid over learning rate and number of trees, with the other settings held fixed.
p_test3 = {'learning_rate':[0.1,0.05,0.01,0.5], 'n_estimators':[100,200,250,500,1000]}

# 5-fold grid search scored by macro F1 on the training part of the hold-out split.
tuning = GridSearchCV(estimator =GradientBoostingClassifier(max_depth=4, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10), 
            param_grid = p_test3, scoring='f1_macro', cv=5, verbose=2)
tuning.fit(X_train,y_train)


In [None]:
# Show both values as one tuple: a notebook cell only renders its last
# expression, so a bare `tuning.best_score_` on its own line was never shown.
tuning.best_params_, tuning.best_score_

In [None]:
%%time
# Tuning step: grid over tree depth, with learning rate and tree count held fixed.
p_test2 = {'max_depth':[2,3,4,5,6,7] }
tuning = GridSearchCV(estimator =GradientBoostingClassifier(learning_rate=0.05,n_estimators=200, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10), 
            param_grid = p_test2, scoring='f1_macro', cv=5, verbose=2)
tuning.fit(X_train,y_train)
# Best cross-validated macro F1 and the depth that produced it.
tuning.best_score_, tuning.best_params_

In [None]:
# Gradient-boosting model with hand-set learning rate, tree count and depth, checked on the hold-out split.
model1 = GradientBoostingClassifier(learning_rate=0.05, n_estimators=200,max_depth=4, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
model1.fit(X_train,y_train)
# Iterating a DataFrame yields its column labels, so this is the feature-name list.
predictors=list(X_train)
feat_imp = pd.Series(model1.feature_importances_, predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features')
plt.ylabel('Feature Importance Score')
# "test set" in the message refers to the held-out 20% (X_test), not the competition test data.
print('Accuracy of the GBM on test set: {:.3f}'.format(model1.score(X_test, y_test)))
pred=model1.predict(X_test)
print(classification_report(y_test, pred))

In [None]:
# Tuning step: grid over the split and leaf size limits, with the other settings held fixed.
p_test4 = {'min_samples_split':[2,5,10,20,50,75,100], 'min_samples_leaf':[1,3,5,7,9]}

tuning = GridSearchCV(estimator =GradientBoostingClassifier(learning_rate=0.05, n_estimators=200,max_depth=4, subsample=1,max_features='sqrt', random_state=10), 
            param_grid = p_test4, scoring='f1_macro', cv=5,verbose=2)
tuning.fit(X_train,y_train)
# Best parameter combination and its cross-validated macro F1.
tuning.best_params_, tuning.best_score_

In [None]:
# Tuning step: grid over the number of features considered at each split.
# 'sqrt' replaces 'auto', which selected sqrt(n_features) for gradient-boosting
# classifiers and is rejected by current scikit-learn releases, so that
# candidate would fail to fit.
p_test5 = {'max_features':[2,5,10,15,20,'sqrt',None]}
tuning = GridSearchCV(estimator =GradientBoostingClassifier(learning_rate=0.05, n_estimators=200,max_depth=4, min_samples_split=2, min_samples_leaf=3, subsample=1, random_state=10),
                      param_grid = p_test5, scoring='f1_macro',verbose=2, cv=5)
tuning.fit(X_train,y_train)
# Best value and its cross-validated macro F1.
tuning.best_params_, tuning.best_score_

In [None]:
# Tuning step: grid over the row subsampling fraction, with the other settings held fixed.
p_test6= {'subsample':[0.7,0.75,0.8,0.85,0.9,0.95,1]}

tuning = GridSearchCV(estimator =GradientBoostingClassifier(learning_rate=0.05, n_estimators=200,max_depth=4, min_samples_split=2, min_samples_leaf=3,max_features=15, random_state=10), 
param_grid = p_test6, scoring='f1_macro',verbose=2, cv=5)
tuning.fit(X_train,y_train)
# Best value and its cross-validated macro F1.
tuning.best_params_, tuning.best_score_

In [None]:
# Final gradient-boosting model with all hand-set hyper-parameters, checked on the hold-out split.
new=GradientBoostingClassifier(learning_rate=0.05, n_estimators=200,max_depth=4, min_samples_split=2, min_samples_leaf=3,max_features=15, subsample=1, random_state=10)
new.fit(X_train,y_train)
# Iterating a DataFrame yields its column labels, so this is the feature-name list.
predictors=list(X_train)
feat_imp = pd.Series(new.feature_importances_, predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features')
plt.ylabel('Feature Importance Score')
# "test set" in the message refers to the held-out 20% (X_test), not the competition test data.
print('Accuracy of the GBM on test set: {:.3f}'.format(new.score(X_test, y_test)))
pred=new.predict(X_test)
print(classification_report(y_test, pred))

In [None]:
# Fit the tuned model and write its predictions for the competition test set.
# NOTE(review): the fit uses X_train (the 80% part of the hold-out split), not all training rows; confirm that is intended.
new.fit(X_train, y_train)
preds = new.predict(scaled_test_data)
Id = test_data['customer_id'].values
d = {'customer_id': Id , 'churn_risk_score': preds}
submission = pd.DataFrame(data=d)
submission.to_csv('gb_tuned2.csv', index = False)

In [None]:
from xgboost import XGBRFClassifier
# Fit an XGBoost random-forest classifier on the training part of the hold-out split.
xgbrf = XGBRFClassifier()
xgbrf.fit(X_train, y_train)
# Predict with the model fitted just above. The cell previously called
# `new.predict`, so 'xgbrf.csv' contained the tuned GBM's predictions.
preds = xgbrf.predict(scaled_test_data)
Id = test_data['customer_id'].values
d = {'customer_id': Id , 'churn_risk_score': preds}
submission = pd.DataFrame(data=d)
submission.to_csv('xgbrf.csv', index = False)