In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subdirs, files in os.walk('/kaggle/input'):
    # Print the full path of every file found beneath the input directory.
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Task 1- Importing libraries and data

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score
import sklearn.base as skb
import sklearn.metrics as skm
import sklearn.model_selection as skms
import sklearn.preprocessing as skp
import sklearn.linear_model as sklm
import warnings
# Silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')
import random
seed = 5
# Seed NumPy's global generator and the stdlib `random` module (imported
# here but previously left unseeded) so both sources of randomness are fixed.
np.random.seed(seed)
random.seed(seed)

# Global plotting configuration for the whole notebook.
sns.set_style('whitegrid')
# Newer matplotlib releases ship the seaborn sheets under 'seaborn-v0_8-*'
# names; use whichever spelling this installation provides instead of
# failing on the one it lacks.
_deep_style = next(
    (name for name in ('seaborn-v0_8-deep', 'seaborn-deep') if name in plt.style.available),
    None,
)
if _deep_style is not None:
    plt.style.use(_deep_style)
plt.style.use('fivethirtyeight')
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'  # was misspelled 'Ubunto Mono'
plt.rcParams['font.size'] = 10
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 8
plt.rcParams['ytick.labelsize'] = 8
plt.rcParams['legend.fontsize'] = 12
plt.rcParams['figure.titlesize'] = 14
plt.rcParams['figure.figsize'] = (12,8)

In [None]:
# Load the training split (it carries the 'churn_risk_score' target used later).
# The bare `df` on the last line renders a preview of the frame.
df = pd.read_csv('../input/hackerearth-how-not-to-lose-a-customer-in-10-days/train.csv')
df

In [None]:
# Load the test split, which is scored at the end to build the submission.
df_test = pd.read_csv('../input/hackerearth-how-not-to-lose-a-customer-in-10-days/test.csv')
df_test

In [None]:
# Setting the target feature
targetFeature = 'churn_risk_score'

# Task 2- Exploratory Data Analysis (EDA)

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Splitting the dataset into numerical and categorical features

def datasetShape(df):
    """Print a one-line summary of how many rows and columns `df` has."""
    n_rows, n_cols = df.shape
    print(f"The dataframe has {n_rows} No. of rows and {n_cols} of columns")
def divideFeatures(df):
    """Split `df` into its numeric and its object-dtype (categorical) columns.

    Returns:
        (numerical_features, categorical_features): two DataFrames holding
        the number-dtype columns and the object-dtype columns respectively.
    """
    numerical_features = df.select_dtypes(include=[np.number])
    # The `np.object` alias no longer exists in current NumPy; the string
    # 'object' selects the same columns on every version.
    categorical_features = df.select_dtypes(include=['object'])
    return numerical_features, categorical_features

In [None]:
# Shape before and after removing exact duplicate rows.
# drop_duplicates keeps the surviving rows' original index labels, so the
# index may contain gaps from here on.
print(df.shape)
df.drop_duplicates(inplace=True)
print(df.shape)
num_feature,cat_feature = divideFeatures(df)
num_feature

In [None]:
cat_feature

In [None]:
# Drop the listed identifier-like columns from both frames; they are not
# used as model inputs.
df.drop(['customer_id','Name','security_no','referral_id'],axis=1,inplace=True)
df_test.drop(['customer_id','Name','security_no','referral_id'],axis=1,inplace=True)


In [None]:
# Re-split now that the identifier columns are gone.
num_feature,cat_feature = divideFeatures(df)
num_feature

In [None]:
# Cardinality of each object-dtype column.
cat_feature.nunique()

In [None]:
cat_feature['joining_date']

In [None]:
cat_feature['last_visit_time']

In [None]:
cat_feature.info()

In [None]:
cat_feature['avg_frequency_login_days'].replace(['Error'],np.nan,inplace=True)

In [None]:
arr = cat_feature['avg_frequency_login_days'].unique()

In [None]:
cat_feature['avg_frequency_login_days'].astype('float64').plot(kind='box')

In [None]:
# One box plot per numeric column. The grid keeps the original 3x4 layout but
# gains rows if there are more than 12 columns, instead of raising from
# add_subplot.
n_grid_cols = 4
n_grid_rows = max(3, -(-len(num_feature.columns) // n_grid_cols))  # ceil division
fig = plt.figure(figsize=(16,16))
for i in range(len(num_feature.columns)):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, i+1)
    sns.boxplot(y=num_feature.iloc[:,i], ax=ax)
plt.tight_layout()
plt.show()

In [None]:
# Replace the literal 'Error' marker with NaN in both frames. Assigning the
# result back (rather than inplace=True on a column selection) guarantees the
# frames themselves are updated, including under pandas Copy-on-Write.
for frame in (df, df_test):
    frame['avg_frequency_login_days'] = frame['avg_frequency_login_days'].replace(['Error'],np.nan)

In [None]:
# Cast the login-frequency column to float64 in both frames (this raises if
# any non-numeric string is still present).
df['avg_frequency_login_days'] = df['avg_frequency_login_days'].astype('float64')
df_test['avg_frequency_login_days'] = df_test['avg_frequency_login_days'].astype('float64')

In [None]:
# Re-split so the newly numeric column moves into num_feature.
num_feature,cat_feature = divideFeatures(df)
num_feature

In [None]:
# One box plot per numeric column. The grid keeps the original 3x3 layout but
# gains rows if there are more than 9 columns, instead of raising from
# add_subplot.
n_grid_cols = 3
n_grid_rows = max(3, -(-len(num_feature.columns) // n_grid_cols))  # ceil division
fig = plt.figure(figsize=(16,16))
for i in range(len(num_feature.columns)):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, i+1)
    sns.boxplot(y=num_feature.iloc[:,i], ax=ax)
plt.tight_layout()
plt.show()

In [None]:
sns.pairplot(df)
plt.show()

In [None]:
corr = df.corr()
mask = np.zeros_like(corr,dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr,mask=mask,annot=True)
plt.show()

In [None]:
# Skewness of each numeric column, most positively skewed first.
skewed_features = num_feature.apply(lambda x:x.skew()).sort_values(ascending=False)
skewed_features

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Frequency of each region_category value.
df.region_category.value_counts()

In [None]:
# Mean target value per category for three categorical columns, one panel each.
sns.set_theme(style="white", context="talk")
f, axes = plt.subplots(3, 1, figsize=(15, 15))
ax1, ax2, ax3 = axes
panels = (
    ('gender', "rocket", ax1),
    ('region_category', "vlag", ax2),
    ('membership_category', "deep", ax3),
)
for column, colours, axis in panels:
    sns.barplot(x=column, y=targetFeature, palette=colours, ax=axis, data=df)

In [None]:
# Strip plot of the target against region_category, with points split
# (dodged) and coloured by gender.
fig,ax = plt.subplots(1,1,figsize=(20,7))
sns.stripplot(y ='region_category', x =targetFeature, data = df, 
              jitter = True, hue ='gender', dodge = True,ax=ax)

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Distribution of avg_time_spent.
df['avg_time_spent'].plot(kind='box')

# Task 3- Data Preparation (with Feature Engineering) 

In [None]:
# Dropping all rows whose 'avg_time_spent' is negative.
# DataFrame.drop works on index LABELS, so the mask is translated through
# df.index. Passing np.where() positions directly removes the wrong rows (or
# raises) once earlier steps such as drop_duplicates have left gaps in the
# index.
low = df['avg_time_spent'] < 0
df.drop(df.index[low],inplace=True)

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Resetting the index to 0..n-1. The cells below pass np.where() POSITIONS to
# df.drop(), which expects LABELS; the two only coincide because the index is
# rebuilt immediately before each drop.
df.index = range(0,df.shape[0])

In [None]:
# Dropping all rows whose 'points_in_wallet' is negative
l1 = np.where(df['points_in_wallet'] < 0)
df.drop(l1[0],inplace=True)
df.index = range(0,df.shape[0])

In [None]:
# Dropping all rows whose 'avg_frequency_login_days' is negative
l2 = np.where(df['avg_frequency_login_days'] < 0)
df.drop(l2[0],inplace=True)
df.index = range(0,df.shape[0])

In [None]:
# Dropping all rows whose 'avg_transaction_value' is negative
l3 = np.where(df['avg_transaction_value'] < 0)
df.drop(l3[0],inplace=True)
df.index = range(0,df.shape[0])

In [None]:
# Dropping all rows whose 'days_since_last_login' is negative
l4 = np.where(df['days_since_last_login'] < 0)
df.drop(l4[0],inplace=True)
df.index = range(0,df.shape[0])

In [None]:
# Remove every training row that still has a missing value in any column.
# The index is not rebuilt here, so it has gaps until the next reset.
df.dropna(how='any',inplace=True)

In [None]:
df_test.info()

In [None]:
# Parse the two date/time columns in both frames.
for frame in (df, df_test):
    for column in ('joining_date', 'last_visit_time'):
        frame[column] = pd.to_datetime(frame[column])

In [None]:
df['jday'] = df['joining_date'].dt.day
df_test['jday'] = df_test['joining_date'].dt.day

In [None]:
df['jday'] = df['joining_date'].dt.day
df['jmonth'] = df['joining_date'].dt.month
df['jweek'] = df['joining_date'].dt.dayofweek

df_test['jday'] = df_test['joining_date'].dt.day
df_test['jmonth'] = df_test['joining_date'].dt.month
df_test['jweek'] = df_test['joining_date'].dt.dayofweek

In [None]:
# Whole days between joining and last visit, as float. The `.dt.days`
# accessor yields the same day count as parsing the Timedelta's string form,
# without the per-row string round-trip, and gives NaN for missing timestamps
# instead of raising.
for frame in (df, df_test):
    frame['diff'] = (frame['last_visit_time'] - frame['joining_date']).dt.days.astype('float64')

In [None]:
# Rebuild a gap-free 0..n-1 index so row positions and labels coincide.
df.reset_index(drop=True, inplace=True)

# Removing all the rows having 'churn_risk_score' == -1
l5 = np.nonzero((df['churn_risk_score'] == -1).to_numpy())
df.drop(index=l5[0], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df

In [None]:
# Class frequencies of the target. The author judges them imbalanced, which
# is why an over-sampling technique is applied later.
df['churn_risk_score'].value_counts()


In [None]:
df_test.info()

In [None]:
df.info()

In [None]:
# Selecting categorical features into a new dataframe
sm = ['gender','region_category','membership_category','joined_through_referral',
      'preferred_offer_types','medium_of_operation','internet_option','used_special_discount',
       'offer_application_preference','past_complaint','complaint_status','feedback']
df_sm = df[sm]
df_test_sm = df_test[sm]

In [None]:
# Creating dummies for categorical feature columns
df_sm = pd.get_dummies(df_sm)
df_test_sm = pd.get_dummies(df_test_sm)

In [None]:
# Selecting Numerical features into a new dataframe

ss = ['days_since_last_login','avg_time_spent','avg_transaction_value','avg_frequency_login_days',
'points_in_wallet','used_special_discount','churn_risk_score','jday','jmonth','jweek','diff']
# The test frame uses the same columns, in the same order, minus the target.
ss_t = [name for name in ss if name != 'churn_risk_score']
df_ss = df[ss]
df_test_ss = df_test[ss_t]

In [None]:
# Concatenating the dummy columns and numerical feature columns into a new dataframe
df_new = pd.concat([df_ss,df_sm],axis=1)
df_test_new = pd.concat([df_test_ss,df_test_sm],axis=1)

In [None]:
df_test_new.info()

In [None]:
# 'used_special_discount' was listed among both the numeric and the
# categorical columns; drop the raw copy. Its dummy-encoded columns presumably
# carry a value suffix in their names and so are unaffected; confirm against
# the df_test_new.info() output.
df_new.drop(['used_special_discount'],axis=1,inplace=True)
df_test_new.drop(['used_special_discount'],axis=1,inplace=True)

In [None]:
# Missing values remaining in the test features.
df_test_new.isnull().sum()

In [None]:
# Fill the test set's missing values with the column mean. The result is
# assigned back: fillna(..., inplace=True) on a column selection acts on a
# temporary under pandas Copy-on-Write and would leave the frame unchanged.
for column in ('avg_frequency_login_days', 'points_in_wallet'):
    df_test_new[column] = df_test_new[column].fillna(value=df_test_new[column].mean())

In [None]:
df_test_new.info()

In [None]:
# Drop any training row that still contains a missing value.
df_new.dropna(axis=0,how='any',inplace=True)

In [None]:
df_new.info()

In [None]:
# Feature matrix (every column except the target) and target vector as
# NumPy arrays.
X = df_new.drop(['churn_risk_score'],axis=1).values
Y = df_new['churn_risk_score'].values

In [None]:
# Applying SMOTE Over Sampling Strategy
from imblearn.over_sampling import SMOTE
# An explicit random_state makes the synthetic samples identical on every
# execution of this cell, not only on a fresh Restart & Run All.
oversampler = SMOTE(random_state=seed)
X_ov,Y_ov = oversampler.fit_resample(X,Y)
print('The dataset before oversampling: ',X.shape,Y.shape)
print('The dataset after oversampling: ',X_ov.shape,Y_ov.shape)

In [None]:
# Per-class sample counts before (left) and after (right) over-sampling;
# after it, every class has the same count.
for label in (1, 2, 3, 4, 5):
    print(sum(Y == label),sum(Y_ov == label))


In [None]:
from sklearn.preprocessing import RobustScaler
# Fit the scaler on the over-sampled training matrix, then apply the same
# transformation to that matrix and to the test features.
scaler = RobustScaler().fit(X_ov)
X_ov = scaler.transform(X_ov)
X_test = scaler.transform(df_test_new.values)

# Task 4- Data Modelling

In [None]:
# Hold out 30% of the ORIGINAL rows, then over-sample the training part only.
# Splitting the already over-sampled X_ov/Y_ov puts synthetic points in the
# hold-out set and lets SMOTE interpolate training samples from rows that end
# up in it, which inflates every score reported below.
# `scaler` is the RobustScaler fitted earlier, so the features stay on the
# scale used for X_test. The split is stratified and seeded so the hold-out
# keeps the real class proportions and is reproducible.
x_train,x_test,y_train,y_test = train_test_split(
    scaler.transform(X),Y,train_size=0.7,stratify=Y,random_state=seed)
x_train,y_train = SMOTE(random_state=seed).fit_resample(x_train,y_train)
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,f1_score
rf = RandomForestClassifier(n_estimators=1000,max_depth=25)
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
print(classification_report(y_true=y_test,y_pred=y_pred))
print(f1_score(y_true=y_test,y_pred=y_pred,average='macro'))

In [None]:
fig = plt.figure(figsize=(6, 6))
ax= plt.subplot()
cm = confusion_matrix(y_true=y_test,y_pred=y_pred)
sns.heatmap(cm, annot=True, ax = ax, fmt = 'g')
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')
plt.show()

## MLPClassifier with three hidden layers of 100 nodes each

In [None]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100,100,100))
mlp.fit(x_train,y_train)
y_pred = mlp.predict(x_test)
print(classification_report(y_true=y_test,y_pred=y_pred))
print(f1_score(y_true=y_test,y_pred=y_pred,average='macro'))

In [None]:
fig = plt.figure(figsize=(6, 6))
ax= plt.subplot()
cm = confusion_matrix(y_true=y_test,y_pred=y_pred)
sns.heatmap(cm, annot=True, ax = ax, fmt = 'g')
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')
plt.show()

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy',max_depth=15, min_samples_split=10, min_samples_leaf=12)
tree.fit(x_train,y_train)
y_pred = tree.predict(x_test)
print(classification_report(y_true=y_test,y_pred=y_pred))
print(f1_score(y_true=y_test,y_pred=y_pred,average='macro'))

In [None]:
fig = plt.figure(figsize=(6, 6))
ax= plt.subplot()
cm = confusion_matrix(y_true=y_test,y_pred=y_pred)
sns.heatmap(cm, annot=True, ax = ax, fmt = 'g')
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')
plt.show()

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbrt = GradientBoostingClassifier(n_estimators=100,max_depth=10,min_samples_leaf = 4, min_samples_split= 5)
gbrt.fit(x_train,y_train)
y_pred = gbrt.predict(x_test)
print(classification_report(y_true=y_test,y_pred=y_pred))
print(f1_score(y_true=y_test,y_pred=y_pred,average='macro'))

In [None]:
fig = plt.figure(figsize=(6, 6))
ax= plt.subplot()
cm = confusion_matrix(y_true=y_test,y_pred=y_pred)
sns.heatmap(cm, annot=True, ax = ax, fmt = 'g')
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')
plt.show()

## XGBClassifier

In [None]:
from xgboost import XGBClassifier
xgb = XGBClassifier(n_jobs=-1,n_estimators=1000,max_depth=10)
xgb.fit(x_train,y_train)
y_pred = xgb.predict(x_test)
print(classification_report(y_true=y_test,y_pred=y_pred))
print(f1_score(y_true=y_test,y_pred=y_pred,average='macro'))

In [None]:
fig = plt.figure(figsize=(6, 6))
ax= plt.subplot()
cm = confusion_matrix(y_true=y_test,y_pred=y_pred)
sns.heatmap(cm, annot=True, ax = ax, fmt = 'g')
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')
plt.show()

# Mixed model of XGB, Random Forest & Gradient Boosting Classifiers

In [None]:
class MixModel(skb.BaseEstimator, skb.RegressorMixin, skb.TransformerMixin):
    """Ensemble that averages the predictions of several estimators.

    Each estimator in `algs` is cloned and fitted on the same data. `predict`
    averages the members' predicted labels and rounds the mean to the nearest
    integer label. The base classes are kept as they were so existing callers
    see the same interface.

    Parameters:
        algs: iterable of scikit-learn compatible estimators; they are cloned,
            so the objects passed in are not refitted.
    """

    def __init__(self, algs):
        self.algs = algs

    def fit(self, X, y):
        """Fit a fresh clone of every estimator on (X, y) and return self."""
        self.algs_ = [skb.clone(x) for x in self.algs]

        for alg in self.algs_:
            alg.fit(X, y)

        return self

    def predict(self, X):
        """Return the rounded mean of the members' predictions as int64."""
        predictions = np.column_stack([
            stacked_model.predict(X) for stacked_model in self.algs_
        ])
        # Round to the nearest label. A plain astype('int64') truncates, so
        # e.g. votes of (5, 5, 4) averaged to 4.67 became 4 and the result was
        # biased downwards. np.rint rounds exact .5 ties to the even integer.
        return np.rint(np.mean(predictions, axis=1)).astype('int64')

In [None]:
mixed_model = MixModel(algs = [xgb,rf,gbrt])
mixed_model.fit(x_train, y_train)


In [None]:
y_pred = mixed_model.predict(x_test)
print(classification_report(y_true=y_test,y_pred=y_pred))
print(f1_score(y_true=y_test,y_pred=y_pred,average='macro'))

In [None]:
fig = plt.figure(figsize=(6, 6))
ax= plt.subplot()
cm = confusion_matrix(y_true=y_test,y_pred=y_pred)
sns.heatmap(cm, annot=True, ax = ax, fmt = 'g')
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')
plt.show()

# Task 5- Test Evaluation and Submission

In [None]:
# Sanity check: the test and training matrices must have the same number of
# feature columns.
print(X_test.shape,X.shape)

In [None]:
# Reference file showing the expected submission layout.
df_sub = pd.read_csv('../input/hackerearth-how-not-to-lose-a-customer-in-10-days/sample_submission.csv')
df_sub

In [None]:
# Score the test set with the standalone XGB model fitted earlier (MixModel
# works on clones, so this `xgb` object is that earlier fit).
client_score = pd.DataFrame(xgb.predict(X_test),columns=['churn_risk_score'])
client_score

In [None]:
# Re-read the raw test file to recover 'customer_id', which was dropped from
# df_test during cleaning.
df_test = pd.read_csv('../input/hackerearth-how-not-to-lose-a-customer-in-10-days/test.csv')
df_pr = pd.DataFrame(df_test['customer_id'],columns=['customer_id'])
df_pr

In [None]:
# Join ids and predictions column-wise; both frames are aligned on their index.
df_pr = pd.concat([df_pr,client_score],axis=1)

In [None]:
df_pr

In [None]:
# index=False keeps the file to the two data columns; otherwise pandas writes
# the row index as an extra unnamed first column.
df_pr.to_csv('./Churn_risk_final_submission.csv', index=False)

# Last Notes
Try more types of models, and hyperparameter tuning of the current models, to find better results. The XGB Classifier gives the best results for me.
Better feature engineering may also bring excellent results.

If you like my work, show your appreciation with an upvote and share this notebook.
