In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from scipy.stats import normaltest

from warnings import filterwarnings
# Silences every warning category for the rest of the session,
# which also hides deprecation notices from the libraries used below.
filterwarnings('ignore')

In [None]:
# Cap how many rows and columns pandas renders when a frame is displayed.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 25)

In [None]:
# Load the training data; the tokens '?', 'Error' and 'Unknown' are parsed as NaN.
train_csv_path = '../input/hackerearth-how-not-to-lose-a-customer-in-10-days/train.csv'
placeholder_tokens = ['?','Error','Unknown']
df = pd.read_csv(train_csv_path, na_values=placeholder_tokens)
df.head()

In [None]:
def shape(x):
    """Print how many rows and columns ``x`` has.

    ``x`` must expose a two-element ``.shape`` (rows, columns). Nothing is returned.
    """
    n_rows, n_cols = x.shape
    print(f'The dataset has {n_rows} rows and {n_cols} columns.')

In [None]:
# Size of the training frame as loaded.
shape(df)

In [None]:
# Frequency of each target label in the loaded data.
df['churn_risk_score'].value_counts()

`As stated in the problem description, churn_risk_score ranges from 1 to 5. Rows labelled -1 fall outside that range, so we drop them to keep them from distorting the predictions.`

In [None]:
# Keep only rows whose label is not -1.
# The explicit .copy() makes df an independent frame, so the column assignments and
# in-place drops in later cells do not act on a filtered slice of the loaded data
# (the situation pandas reports as SettingWithCopyWarning).
df = df[df['churn_risk_score'] != -1].copy()

In [None]:
# Size of the frame after the -1 labels were removed.
shape(df)

In [None]:
# Label frequencies after the -1 rows were removed.
df['churn_risk_score'].value_counts()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns, transposed to one row per column.
df.describe().T

In [None]:
# Summary of the object-dtype columns, transposed to one row per column.
df.describe(include='object').T

In [None]:
# Drop the identifier-like columns (ids, names, reference numbers, visit timestamps).
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
df = df.drop(columns=['customer_id','Name','security_no','referral_id','last_visit_time'])

In [None]:
# Preview the remaining columns.
df.head()

In [None]:
# Size of the frame after dropping the identifier-like columns.
shape(df)

In [None]:
# Bar chart of the NaN count per column, with each bar labelled by its count.
plt.figure(figsize=(18,6))
na_counts = df.isna().sum()
ax = sns.barplot(x=df.columns, y=na_counts, palette='Pastel2')
plt.xticks(rotation=90)
plt.title('Missing Values', size=16, color = '#025955')
for bar in ax.patches:
    bar_height = bar.get_height()
    label_xy = (bar.get_x()+0.4, bar_height+20)
    ax.annotate('{:.0f}'.format(bar_height), label_xy, ha='center', va='bottom', color= 'black')
plt.show()

In [None]:
# Columns that contain at least one NaN, ordered from most to fewest missing values.
na_per_column = df.isna().sum().sort_values(ascending=False)
missing_cols = na_per_column[na_per_column != 0]
missing_cols

In [None]:
# Print the value frequencies of every object-dtype column that has missing entries.
object_cols_with_na = [name for name in missing_cols.index if df[name].dtype == 'object']
for col in object_cols_with_na:
    print(df[col].value_counts())
    print('-----------------------')

In [None]:
# Normality test on the login-frequency column; NaNs are left out (nan_policy='omit').
normaltest(df['avg_frequency_login_days'], nan_policy='omit')

In [None]:
# Normality test on the wallet-points column; NaNs are left out (nan_policy='omit').
normaltest(df['points_in_wallet'], nan_policy='omit')

`Since neither distribution is normal, we impute the missing numeric values with the median (categorical columns are filled with their mode).`

In [None]:
# Fill gaps column by column: mode for object-dtype columns, median for everything else.
for col in missing_cols.index:
    is_categorical = df[col].dtype == 'object'
    fill_value = df[col].mode()[0] if is_categorical else df[col].median()
    df[col] = df[col].fillna(fill_value)

In [None]:
df.isna().sum().sum() # total NaN count over the whole frame; 0 means every gap was filled

In [None]:
# Cross-tabulate every object-dtype column (except joining_date) against the target.
categorical_cols = [name for name in df.columns
                    if df[name].dtype == 'object' and name != 'joining_date']
for col in categorical_cols:
    print(pd.crosstab(df[col], df['churn_risk_score']))
    print('=====================================================')

### EDA

In [None]:
# Target distribution shown twice: absolute counts (bar) and shares (pie).
plt.figure(figsize=(18,8))
class_counts = df['churn_risk_score'].value_counts()

# Left panel: annotated bar chart of the class counts.
plt.subplot(1,2,1)
ax = sns.barplot(x=class_counts.index, y=class_counts, palette='Pastel2')
for bar in ax.patches:
    bar_height = bar.get_height()
    label_xy = (bar.get_x()+0.4, bar_height+20)
    ax.annotate('{:.0f}'.format(bar_height), label_xy, ha='center', va='bottom', color= 'black')
plt.title('Count of different classes')

# Right panel: pie chart; the explode list has one entry per class (five).
plt.subplot(1,2,2)
pie_colors = ['#CFD6E4', '#EFCFE3', '#E4F0CF', '#F3CFB6', '#B9DCCC']
class_counts.plot(kind='pie', explode=[0.1,0,0.1,0,0.1], autopct='%.2f%%', colors=pie_colors)
plt.title('Distribution of different classes')
plt.show()

In [None]:
# One panel per feature, split by target class:
# count plot for object-dtype columns, KDE plot for float64/int64 columns.
plt.figure(figsize=(18,40))
panel = 1
for col in df.columns:
    if col in ['churn_risk_score','joining_date']:
        continue
    if df[col].dtype == 'object':
        draw = sns.countplot
    elif df[col].dtype in ['float64','int64']:
        draw = sns.kdeplot
    else:
        # Any other dtype gets no panel.
        continue
    plt.subplot(8,3,panel)
    draw(x=col, hue='churn_risk_score', data=df, palette='Pastel2')
    panel += 1
    plt.xticks(rotation=90)
plt.tight_layout()
plt.suptitle('Distribution of independent variables w.r.t. dependent variable', size=16, color='#025955',y=1.01)
plt.show()

### Feature Engineering

In [None]:
#Creating a new column which tells the number of days since the customer joined
df['joining_date'] = pd.to_datetime(df['joining_date'])

# Take "now" once so every row is measured against the same instant, and read the
# whole-day count directly from the timedelta instead of parsing its string form
# (the string route raised on a missing date and re-evaluated "now" for each row).
reference_time = pd.to_datetime('now')
df['joined_days'] = (reference_time - df['joining_date']).dt.days

In [None]:
#dropping the joining_date column
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
df = df.drop(columns='joining_date')

In [None]:
#Creating a new column for high transactions
def transaction_range(x):
    """Flag a transaction value: 0 when ``x`` is below 50000, otherwise 1.

    Values that do not compare as less than 50000 (including NaN) yield 1.
    """
    return 0 if x < 50000 else 1

In [None]:
# Binary flag derived from avg_transaction_value via transaction_range.
df['transaction_range'] = df['avg_transaction_value'].apply(transaction_range)

In [None]:
def gender(x):
    """Encode gender: 'F' becomes 0, every other value becomes 1."""
    if x == 'F':
        return 0
    return 1

def region_category(x):
    """Encode region: 'Village' -> 0, 'Town' -> 2, and 'City' or any other value -> 1."""
    if x == 'Village':
        return 0
    if x == 'Town':
        return 2
    # 'City' and every unrecognised value share code 1.
    return 1

# Integer code for each known membership label.
_MEMBERSHIP_CODES = {
    'Platinum Membership': 0,
    'Premium Membership': 1,
    'No Membership': 2,
    'Gold Membership': 3,
    'Silver Membership': 4,
    'Basic Membership': 5,
}


def membership_category(x):
    """Encode a membership label as 0-5; any value not in the table becomes 6."""
    return _MEMBERSHIP_CODES.get(x, 6)
    
def joined_through_referral(x):
    """Encode the referral flag: 'No' -> 0, 'Yes' -> 1, any other value -> 2."""
    return {'No': 0, 'Yes': 1}.get(x, 2)
    
def preferred_offer_types(x):
    """Encode the preferred offer type.

    'Credit/Debit Card Offers' -> 1, 'Without Offers' -> 2;
    'Gift Vouchers/Coupons' and every other value -> 0.
    """
    if x == 'Credit/Debit Card Offers':
        return 1
    if x == 'Without Offers':
        return 2
    return 0
    

def medium_of_operation(x):
    """Encode the device used: 'Desktop' -> 0, 'Smartphone' -> 1, 'Both' -> 2, anything else -> 3."""
    known_codes = {'Desktop': 0, 'Smartphone': 1, 'Both': 2}
    return known_codes.get(x, 3)
    
def internet_option(x):
    """Encode the connection type: 'Wi-Fi' -> 0, 'Mobile_Data' -> 1, 'Fiber_Optic' -> 2, anything else -> 3."""
    known_codes = {'Wi-Fi': 0, 'Mobile_Data': 1, 'Fiber_Optic': 2}
    return known_codes.get(x, 3)

def used_special_discount(x):
    """Encode the special-discount flag: 'No' -> 0, every other value -> 1."""
    return 0 if x == 'No' else 1
    
def offer_application_preference(x):
    """Encode the offer-application preference: 'No' -> 0, every other value -> 1."""
    return 0 if x == 'No' else 1
    
def past_complaint(x):
    """Encode the past-complaint flag: 'No' -> 0, every other value -> 1."""
    return 0 if x == 'No' else 1
    
# Integer code for each known complaint status.
_COMPLAINT_STATUS_CODES = {
    'Solved': 0,
    'Solved in Follow-up': 1,
    'Unsolved': 2,
    'No Information Available': 3,
}


def complaint_status(x):
    """Encode a complaint status as 0-3; any value not in the table becomes 4."""
    return _COMPLAINT_STATUS_CODES.get(x, 4)
    
# Integer code for each known feedback label.
_FEEDBACK_CODES = {
    'Products always in Stock': 0,
    'Quality Customer Care': 1,
    'Poor Website': 2,
    'No reason specified': 3,
    'Poor Product Quality': 4,
    'Poor Customer Service': 5,
    'Too many ads': 6,
    'User Friendly Website': 7,
    'Reasonable Price': 8,
}


def feedback(x):
    """Encode a feedback label as 0-8; any value not in the table becomes 9."""
    return _FEEDBACK_CODES.get(x, 9)

In [None]:
# Snapshot of the imputed training frame before the categorical columns are integer-encoded;
# it later supplies the fill values used to impute the test set.
df1 = df.copy(deep=True)

In [None]:
#label encoding the categorical features
# Column name -> encoder function, applied in this order.
# Not idempotent: running this on already-encoded (integer) columns sends every value
# to each encoder's fallback code.
train_encoders = {
    'gender': gender,
    'region_category': region_category,
    'membership_category': membership_category,
    'joined_through_referral': joined_through_referral,
    'preferred_offer_types': preferred_offer_types,
    'medium_of_operation': medium_of_operation,
    'internet_option': internet_option,
    'used_special_discount': used_special_discount,
    'offer_application_preference': offer_application_preference,
    'past_complaint': past_complaint,
    'complaint_status': complaint_status,
    'feedback': feedback,
}
for column_name, encoder in train_encoders.items():
    df[column_name] = df[column_name].apply(encoder)

In [None]:
# Preview the frame after encoding.
df.head()

In [None]:
#splitting the data into train and test
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix is every column except the target; y is the target.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = df.drop(columns='churn_risk_score')
y = df['churn_risk_score']

In [None]:
# Shift the labels 1..5 down to 0..4; the predictions are mapped back before submission.
y = y.map({score: score - 1 for score in range(1, 6)})

In [None]:
# Hold out 33% of the rows for evaluation; random_state fixes the shuffle.
# No `stratify` argument is passed, so class proportions may differ between the two parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
#Standardizing the values
# The scaler is fitted on the training part only and then applied to both parts.
# The scaled frames keep the column names but get a fresh 0..n-1 index.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_sc = pd.DataFrame(sc.transform(X_train), columns=X_train.columns)
X_test_sc = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

In [None]:
# Second scaler fitted on the complete training features; it is reused to transform the
# competition test set, and X_sc is what the final model is trained on.
sc1 = StandardScaler().fit(X)
X_sc = pd.DataFrame(sc1.transform(X), columns=X.columns)

In [None]:
# Size of the scaled full feature matrix.
shape(X_sc)

In [None]:
from sklearn.metrics import f1_score,classification_report

In [None]:
def model_eval(algo, xtrain, xtest, ytrain, ytest):
    """Fit ``algo`` on the training data and report macro-averaged F1 on both parts.

    The estimator is fitted in place, so the object passed in is trained afterwards.
    The earlier ``predict_proba(...)[:, 1]`` calls were dropped: their results were
    never used, a single probability column is not meaningful for a multi-class
    target, and they forced every estimator to implement ``predict_proba``.

    Args:
        algo: estimator exposing ``fit`` and ``predict``.
        xtrain, xtest: feature matrices for the training and evaluation parts.
        ytrain, ytest: matching target labels.

    Returns:
        ``(train_f1, test_f1)`` - the two macro F1 scores, which are also printed.
    """
    algo.fit(xtrain, ytrain)
    train_f1 = f1_score(ytrain, algo.predict(xtrain), average='macro')
    test_f1 = f1_score(ytest, algo.predict(xtest), average='macro')
    print('F1 score of Train: ', train_f1)
    print('----------------------------------')
    print('F1 score of Test: ', test_f1)
    print('\n==================================\n')
    return train_f1, test_f1

In [None]:
# Baseline candidates, all with library-default settings; insertion order is the evaluation order.
algos = dict(
    ExtraTrees=ExtraTreesClassifier(),
    XGBoost=XGBClassifier(),
    RandomForest=RandomForestClassifier(),
    GradientBoost=GradientBoostingClassifier(),
    LGBM=LGBMClassifier(),
)

In [None]:
# Fit and score every baseline on the scaled train/test parts.
for model_name, estimator in algos.items():
    print(f'### {model_name} ###')
    model_eval(estimator, X_train_sc, X_test_sc, y_train, y_test)

In [None]:
# Hyperparameter sets for the second evaluation round.
# NOTE(review): presumably the result of a hyperparameter search that is not part of
# this notebook - confirm where these values came from.

# Keyword arguments for XGBClassifier.
xgbp = {'learning_rate': 0.8,
 'max_depth': 6,
 'min_child_weight': 6,
 'n_estimators': 150,
 'reg_alpha': 53,
 'reg_lambda': 42}
# Keyword arguments for RandomForestClassifier.
rfp = {'max_depth': 9,
 'max_features': 10,
 'min_samples_leaf': 12,
 'min_samples_split': 18,
 'n_estimators': 56}
# Keyword arguments for GradientBoostingClassifier.
gbp = {'max_depth': 4,
 'max_features': 12,
 'min_samples_leaf': 2,
 'min_samples_split': 11,
 'n_estimators': 63}

In [None]:
# The three model families rebuilt with the explicit parameter sets; insertion order is the evaluation order.
algos1 = dict(
    GradientBoost=GradientBoostingClassifier(**gbp),
    XGBoost=XGBClassifier(**xgbp),
    RandomForest=RandomForestClassifier(**rfp),
)

In [None]:
# Fit and score every parameterised model on the scaled train/test parts.
for model_name, estimator in algos1.items():
    print(f'### {model_name} ###')
    model_eval(estimator, X_train_sc, X_test_sc, y_train, y_test)

### Test Dataset

In [None]:
# Load the competition test data with the same NaN placeholder tokens as the training file.
test_csv_path = '../input/hackerearth-how-not-to-lose-a-customer-in-10-days/test.csv'
test = pd.read_csv(test_csv_path, na_values=['?','Error','Unknown'])
test.head()

In [None]:
# Size of the test frame as loaded.
shape(test)

In [None]:
# Keep the customer ids for the submission frame before the column is dropped.
# NOTE(review): this rebinds the builtin `id`; the name is read again when `sub` is built.
id = test['customer_id']

In [None]:
#changing joining_date to datetime format
test['joining_date'] = pd.to_datetime(test['joining_date'])

# Take "now" once so every row is measured against the same instant, and read the
# whole-day count directly from the timedelta instead of parsing its string form
# (the string route raised on a missing date and re-evaluated "now" for each row).
reference_time = pd.to_datetime('now')
test['joined_days'] = (reference_time - test['joining_date']).dt.days

In [None]:
# Same binary transaction flag as on the training frame.
test['transaction_range'] = test['avg_transaction_value'].apply(transaction_range)

In [None]:
# Drop the identifier-like columns and the raw joining date (already converted to joined_days).
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
test = test.drop(columns=['customer_id','Name','security_no','referral_id','last_visit_time','joining_date'])

In [None]:
# Test-set columns that contain at least one NaN, ordered from most to fewest missing values.
na_per_test_column = test.isna().sum().sort_values(ascending=False)
missing_cols_test = na_per_test_column[na_per_test_column != 0]
missing_cols_test

In [None]:
#Imputing the missing values:
def missingValImputation(df1, df2, col):
    """Fill the NaNs of ``df1[col]`` in place with a statistic taken from ``df2[col]``.

    Object-dtype columns are filled with the most frequent value of ``df2[col]``.
    All other columns are filled with its median, which matches the median
    imputation applied to the training frame earlier in this notebook (the mean
    was used here before, so train and test were imputed inconsistently).

    Args:
        df1: frame whose column is filled; it is modified in place.
        df2: reference frame that supplies the fill value.
        col: name of the column present in both frames.

    Returns:
        None.
    """
    if df1[col].dtype == 'object':
        fill_value = df2[col].mode()[0]
    else:
        fill_value = df2[col].median()
    df1[col] = df1[col].fillna(fill_value)

In [None]:
# Fill each incomplete test column using values computed from df1 (the training snapshot),
# so no statistic is derived from the test set itself.
for col in missing_cols_test.index:
    missingValImputation(test, df1, col)

In [None]:
#label encoding the categorical features
# Column name -> encoder function, the same mapping that was applied to the training frame.
# Not idempotent: running this on already-encoded (integer) columns sends every value
# to each encoder's fallback code.
test_encoders = {
    'gender': gender,
    'region_category': region_category,
    'membership_category': membership_category,
    'joined_through_referral': joined_through_referral,
    'preferred_offer_types': preferred_offer_types,
    'medium_of_operation': medium_of_operation,
    'internet_option': internet_option,
    'used_special_discount': used_special_discount,
    'offer_application_preference': offer_application_preference,
    'past_complaint': past_complaint,
    'complaint_status': complaint_status,
    'feedback': feedback,
}
for column_name, encoder in test_encoders.items():
    test[column_name] = test[column_name].apply(encoder)

In [None]:
# Display the prepared test frame (rendering is truncated by the display options set earlier).
test

In [None]:
# Scale the test features with sc1, the scaler fitted on the full training features.
# NOTE(review): relies on `test` having the same columns in the same order as X.
test_sc = pd.DataFrame(sc1.transform(test), columns=test.columns)

In [None]:
# Preview the scaled test features.
test_sc.head()

In [None]:
# Final model: XGBoost with the xgbp settings plus three extra keyword arguments.
# No random_state is set here.
xg = XGBClassifier(**xgbp, max_delta_step=1, n_jobs=-1, subsample=0.99)

In [None]:
# Train on all scaled training rows (no hold-out); y holds the 0..4 encoded labels.
xg.fit(X_sc,y)

In [None]:
# Predicted classes for the test rows, still in the 0..4 encoding used for training.
result = pd.Series(xg.predict(test_sc))
result

In [None]:
# Shift the predictions 0..4 back up to the churn_risk_score scale 1..5.
result = result.map({code: code + 1 for code in range(5)})

In [None]:
# Count how many test rows were assigned to each predicted score.
from collections import Counter
Counter(result)

In [None]:
# Submission frame: the saved customer ids next to their predicted churn_risk_score.
# NOTE(review): `sub` is not written to a file in the cells shown here.
sub = pd.DataFrame({'customer_id':id, 'churn_risk_score':result})

***