## **Importing Required Libraries and Reading the Dataset**

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import xgboost

In [None]:
# Load the training data. The listed placeholder strings are parsed as missing
# values (NaN) on read.
df = pd.read_csv(
    '../input/churn-risk-rate-hackerearth-ml/train.csv',
    na_values=[
        '?',
        '-999',
        'Error',
        'xxxxxxxx',
        'Unknown',
    ],
)
df.head()

# **Exploratory Data Analysis**

In [None]:
# Number of missing values in each column of the freshly loaded frame.
df.isnull().sum()

In [None]:
# (rows, columns) of the training frame before any columns are dropped.
df.shape

In [None]:
# Remove columns that are not used as model inputs further down.
df = df.drop(
    columns=[
        'customer_id',
        'Name',
        'security_no',
        'referral_id',
        'last_visit_time',
    ]
)

In [None]:
# Frequency of each distinct value of `churn_risk_score`, the column later used
# as the prediction target.
df['churn_risk_score'].value_counts()

In [None]:
# Keep non-negative durations; anything else (negative values, and values that
# are already NaN) becomes NaN so it is handled by the imputation step.
df['avg_time_spent'] = np.where(
    df['avg_time_spent'] >= 0,
    df['avg_time_spent'],
    np.nan,
)

In [None]:
# (rows, columns) after the identifier-like columns were dropped.
df.shape

In [None]:
# Column dtypes and non-null counts; the dtypes drive the imputation rule below
# (float64 columns vs. everything else).
df.info()

In [None]:
# Impute missing values column by column:
#   * float64 columns  -> filled with the column mean
#   * all other dtypes -> forward-filled from the previous row
# `Series.ffill()` is used instead of `fillna(method='ffill')` because the
# `method` argument of `fillna` is deprecated in recent pandas releases; the
# result is the same.
# Forward-fill cannot fill a missing value in the very first row (there is no
# previous row), so such a value is still NaN after this loop.
for col in df.columns:
    if df[col].dtype == 'float64':
        df[col] = df[col].fillna(df[col].mean())
    else:
        df[col] = df[col].ffill()

In [None]:
# Re-check missing values per column after imputation.
df.isnull().sum()

In [None]:
# Any value that is still missing, in any column, is replaced by the string
# 'Both'.
# NOTE(review): this applies to the whole frame, not to one specific column —
# confirm that is intended.
df = df.fillna('Both')

In [None]:
# Derive an integer `joining_year` from the text in front of the first '-' of
# each `joining_date` string.
df['joining_year'] = [
    int(date_text.split('-')[0]) for date_text in df['joining_date']
]

In [None]:
# Correlation heatmap of the numeric features.
# The frame still contains string columns at this point. Recent pandas no
# longer drops them silently inside `DataFrame.corr()` and raises instead, so
# the numeric/bool columns are selected explicitly (this matches what older
# pandas did implicitly).
plt.rcParams['figure.figsize'] = [10, 10]
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot=True)

In [None]:
# The raw date string is no longer needed once `joining_year` exists.
df = df.drop(columns='joining_date')

In [None]:
# One-hot encode the frame with `pd.get_dummies`; `drop_first=True` omits the
# first level of each encoded column.
df1=pd.get_dummies(df,drop_first=True)

# **Splitting the Dataset into Dependent and Independent Variables**

In [None]:
# Target is `churn_risk_score`; features are every other encoded column.
# 70/30 split, stratified on the target, fixed seed.
y = df1['churn_risk_score']
x = df1.drop(columns='churn_risk_score')
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [None]:
# Baseline random forest with default hyperparameters.
# A fixed `random_state` makes the fitted model — and every score reported from
# it below (hold-out F1, cross-validation, randomised search) — reproducible
# across Restart & Run All. The seed matches the one used for the split.
rd = RandomForestClassifier(random_state=0)

In [None]:
# Fit the random forest on the training split.
rd.fit(x_train,y_train)

In [None]:
# Random-forest predictions for the hold-out split.
rdpred=rd.predict(x_test)

In [None]:
# Per-class precision / recall / F1 of the random forest on the hold-out split.
rd_report = classification_report(y_true=y_test, y_pred=rdpred)
print(rd_report)

In [None]:
# Macro-averaged F1 of the random forest on the hold-out split.
print(f1_score(y_true=y_test, y_pred=rdpred, average='macro'))

# **Hyperparameter Optimization**

In [None]:
# Candidate values for the random-forest search, wrapped in a one-element list
# of grids.
params = [
    dict(
        n_estimators=[100, 300, 500, 800, 1200],
        max_depth=[5, 8, 15, 25, 30],
        min_samples_split=[2, 5, 10, 15, 100],
        min_samples_leaf=[1, 2, 5, 10],
    )
]

In [None]:
# Randomised search around the random forest: 100 sampled settings, 3-fold CV,
# macro-F1 scoring, all cores. This only builds the search object; it is not
# fitted here.
rsearch = RandomizedSearchCV(
    estimator=rd,
    param_distributions=params,
    n_iter=100,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [None]:
# rsearch.fit(x_train,y_train)  # Takes more time to run. please uncomment and run

# **Cross-Validation**

In [None]:
# Stratified K-fold splitter constructed with its default arguments.
cv=StratifiedKFold()

In [None]:
# Mean macro-F1 of the random forest across the stratified folds, computed on
# the full feature matrix `x` / target `y`.
cross_val_score(
    estimator=rd,
    X=x,
    y=y,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
).mean()

# **Feature Selection using Recursive Feature Elimination.**

In [None]:
# XGBoost classifier with default arguments; used below as the estimator inside
# RFE and as a baseline model.
xgc=xgboost.XGBClassifier()

In [None]:
from sklearn.feature_selection import RFE

In [None]:
# Recursive feature elimination around the XGBoost classifier: remove one
# feature per iteration until 30 remain.
# Author's note on this setting: 77.138 (presumably a score — confirm).
rfe = RFE(estimator=xgc, n_features_to_select=30, step=1)

In [None]:
# Run the elimination on the training split.
rfe.fit(x_train,y_train)

In [None]:
# Names of the columns RFE kept, i.e. those whose ranking equals 1.
x.columns[rfe.ranking_ == 1]

In [None]:
# Feature matrix restricted to a hard-coded list of 30 columns.
# Author's note on this feature set: 77.1981 (presumably a score — confirm).
x_select = x.loc[
    :,
    [
        'age',
        'days_since_last_login',
        'avg_time_spent',
        'avg_transaction_value',
        'avg_frequency_login_days',
        'points_in_wallet',
        'joining_year',
        'region_category_Village',
        'membership_category_Gold Membership',
        'membership_category_No Membership',
        'membership_category_Platinum Membership',
        'membership_category_Premium Membership',
        'membership_category_Silver Membership',
        'joined_through_referral_Yes',
        'preferred_offer_types_Gift Vouchers/Coupons',
        'medium_of_operation_Desktop',
        'medium_of_operation_Smartphone',
        'internet_option_Mobile_Data',
        'internet_option_Wi-Fi',
        'offer_application_preference_Yes',
        'past_complaint_Yes',
        'complaint_status_Solved in Follow-up',
        'feedback_Poor Customer Service',
        'feedback_Poor Product Quality',
        'feedback_Poor Website',
        'feedback_Products always in Stock',
        'feedback_Quality Customer Care',
        'feedback_Reasonable Price',
        'feedback_Too many ads',
        'feedback_User Friendly Website',
    ],
]

In [None]:
# Re-split using only the selected features (70/30, stratified). This
# overwrites the earlier x_train / x_test / y_train / y_test.
# NOTE(review): the seed here is 42 while the first split used 0, so the rows
# in each partition differ from the split RFE was fitted on — confirm intended.
x_train, x_test, y_train, y_test = train_test_split(
    x_select,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# **Hyperparameter Tuning with Hyperopt (unfortunately, this did not give good results)**

In [None]:
# Hyperopt search space for the XGBoost classifier.
# `hp.quniform(label, low, high, q)` samples uniformly in [low, high] and
# rounds to a multiple of `q`, so a lower bound of 0 can produce exactly 0.
# That is never a useful trial for two of these parameters:
#   * learning_rate = 0    -> no boosting update is applied, the model learns nothing
#   * colsample_bytree = 0 -> outside XGBoost's valid range (0, 1]
# Their lower bounds therefore start at the first non-zero step.
space = {
    "learning_rate": hp.quniform('learning_rate', 0.05, 0.3, 0.05),
    "max_depth": hp.choice("max_depth", [3, 4, 5, 6, 8, 10]),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
    "gamma": hp.quniform("gamma", 0, 1, 0.1),
    "colsample_bytree": hp.quniform("colsample_bytree", 0.1, 1, 0.1),
}

In [None]:
def hyperparameter_tuning(params):
    """Hyperopt objective for the XGBoost classifier.

    Builds an ``XGBClassifier`` from ``params`` (plus ``n_jobs=-1``),
    cross-validates it on the notebook-level ``x_select`` / ``y`` with
    macro-F1 scoring, and reports the negated mean score as the loss so that
    minimising the loss maximises macro-F1.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``xgboost.XGBClassifier``.

    Returns
    -------
    dict
        ``{"loss": -mean_macro_f1, "status": STATUS_OK}``.
    """
    model = xgboost.XGBClassifier(n_jobs=-1, **params)
    fold_scores = cross_val_score(model, x_select, y, scoring="f1_macro")
    mean_f1 = fold_scores.mean()
    return {"loss": -mean_f1, "status": STATUS_OK}

In [None]:
# The Hyperopt search is disabled. To run it, uncomment the lines below
# (everything except this note).
#trials = Trials()
#best = fmin(
#    fn=hyperparameter_tuning,
#    space = space,
#    algo=tpe.suggest,
#    max_evals=100,
#    trials=trials
#)

# print("Best: {}".format(best))

In [None]:
# Fit `xgc` on the training split of the selected features.
xgc.fit(x_train,y_train)

In [None]:
# XGBoost predictions for the hold-out split.
xgcpred=xgc.predict(x_test)

In [None]:
# Macro-averaged F1 of the XGBoost predictions on the hold-out split.
print(f1_score(y_true=y_test, y_pred=xgcpred, average='macro'))

In [None]:
# Per-fold macro-F1 of `xgc` under 3-fold cross-validation on the selected
# features.
cross_val_score(
    estimator=xgc,
    X=x_select,
    y=y,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
)

# **Hyperparameter tuning for XGBoost, as it gave a better F1 score than the other boosting and bagging techniques we tried.**

In [None]:
# Candidate values for the XGBoost randomised search. This rebinds `params`,
# which previously held the random-forest grid.
params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20],
    max_depth=[3, 4, 5, 6, 8, 10],
    min_child_weight=[1, 3, 5, 7, 9],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    colsample_bytree=[0.3, 0.4, 0.5, 0.6, 0.7],
)

In [None]:
# Randomised search around the XGBoost classifier: 100 sampled settings,
# 3-fold CV, macro-F1 scoring, all cores. This only builds the search object;
# it is not fitted here.
rsearch = RandomizedSearchCV(
    estimator=xgc,
    param_distributions=params,
    n_iter=100,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [None]:
# rsearch.fit(x_select,y) # Takes more time ,uncomment and run

In [None]:
# rsearch.best_estimator_

# **These were the best hyperparameters we got from above tuning.**

In [None]:
# Tuned XGBoost classifier.
# Author's note on this configuration: 77.1987 (presumably a score — confirm).
# The original call was pasted from an estimator repr and also passed
# `missing=None`, `nthread=None`, `seed=None` and `silent=None`. Those
# arguments carried no setting (all None), and `silent`, `nthread` and `seed`
# are deprecated/removed names in current XGBoost (`verbosity`, `n_jobs` and
# `random_state`, already set here, replace them), so they are omitted.
xgc = xgboost.XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.7,
    gamma=0.01,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=1,
    n_estimators=122,
    n_jobs=1,
    objective='multi:softprob',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1,
    verbosity=1,
)

In [None]:
# Fit the re-configured classifier on the training split.
xgc.fit(x_train,y_train)

In [None]:
# Hold-out predictions from the re-configured classifier (overwrites the
# earlier `xgcpred`).
xgcpred=xgc.predict(x_test)

In [None]:
# Macro-averaged F1 of the re-configured classifier on the hold-out split.
print(f1_score(y_true=y_test, y_pred=xgcpred, average='macro'))

# **We fit the model on the whole training dataset**

In [None]:
# Refit on every row of the selected-feature training data (no hold-out) before
# predicting on the competition test file.
xgc.fit(x_select,y)

# **Predicting on the Test Dataset**

In [None]:
# Load the test file with the same placeholder-to-NaN mapping used for the
# training file.
test = pd.read_csv(
    '../input/churn-risk-rate-hackerearth-ml/test.csv',
    na_values=[
        '?',
        '-999',
        'Error',
        'xxxxxxxx',
        'Unknown',
    ],
)
test.head()

In [None]:
# Drop the same columns that were removed from the training frame.
test = test.drop(
    columns=[
        'customer_id',
        'Name',
        'security_no',
        'referral_id',
        'last_visit_time',
    ]
)

In [None]:
# Keep non-negative durations; anything else (negative values, and values that
# are already NaN) becomes NaN, mirroring the training-set cleaning.
test['avg_time_spent'] = np.where(
    test['avg_time_spent'] >= 0,
    test['avg_time_spent'],
    np.nan,
)

In [None]:
# Number of missing values in each test column before imputation.
test.isnull().sum()

In [None]:
# Impute the test frame with the same rule as the training frame:
#   * float64 columns  -> filled with the column mean
#   * all other dtypes -> forward-filled from the previous row
# `Series.ffill()` is used instead of `fillna(method='ffill')` because the
# `method` argument of `fillna` is deprecated in recent pandas releases; the
# result is the same.
# NOTE(review): the means are computed from the test frame itself, whereas the
# training frame was imputed with its own means — confirm this is intended.
for col in test.columns:
    if test[col].dtype == 'float64':
        test[col] = test[col].fillna(test[col].mean())
    else:
        test[col] = test[col].ffill()

In [None]:
# Derive an integer `joining_year` from the text in front of the first '-' of
# each `joining_date` string, as done for the training frame.
test['joining_year'] = [
    int(date_text.split('-')[0]) for date_text in test['joining_date']
]

In [None]:
# The raw date string is no longer needed once `joining_year` exists.
test = test.drop(columns='joining_date')

In [None]:
# One-hot encode the test frame with the same call used for training.
# NOTE(review): the encoding is derived from the test data alone, so the
# resulting columns only line up with training if both files contain the same
# category levels — confirm.
test1=pd.get_dummies(test,drop_first=True)

In [None]:
# Select exactly the columns the model was trained on, in the same order.
# Taking the names from `x_select` (the training feature matrix) rather than
# repeating the 30-name list keeps training and prediction inputs in sync if
# the feature set is ever edited.
# As before, a column missing from the encoded test frame raises KeyError,
# which signals a train/test encoding mismatch.
test1_select = test1[x_select.columns.tolist()]

In [None]:
# Predict the test set with the XGBoost classifier. The result is stored under
# the name `rdpred`, which earlier held the random-forest hold-out predictions.
rdpred=xgc.predict(test1_select)

In [None]:
# Re-read only the `customer_id` column of the test file to build the
# submission frame.
df2 = pd.read_csv(
    '../input/churn-risk-rate-hackerearth-ml/test.csv',
    usecols=['customer_id'],
)

In [None]:
# Attach the predictions by position. This relies on `df2` and the frame used
# for prediction being read from the same file with no rows removed in between.
df2['churn_risk_score']=rdpred

In [None]:
# Write the submission file. `index=False` keeps the output to the two intended
# columns (customer_id, churn_risk_score); without it pandas also writes the
# row index as an extra, unnamed first column.
df2.to_csv('Result.csv', index=False)