In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import folium
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,RepeatedStratifiedKFold,GridSearchCV,cross_val_score
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

### About the dataset
* state: string. 2-letter code of the US state of customer residence
* account_length: numerical. Number of months the customer has been with the current telco provider
* area_code: string="area_code_AAA" where AAA = 3 digit area code.
* international_plan: (yes/no). The customer has international plan.
* voice_mail_plan: (yes/no). The customer has voice mail plan.
* number_vmail_messages: numerical. Number of voice-mail messages.
* total_day_minutes: numerical. Total minutes of day calls.
* total_day_calls: numerical. Total number of day calls.
* total_day_charge: numerical. Total charge of day calls.
* total_eve_minutes: numerical. Total minutes of evening calls.
* total_eve_calls: numerical. Total number of evening calls.
* total_eve_charge: numerical. Total charge of evening calls.
* total_night_minutes: numerical. Total minutes of night calls.
* total_night_calls: numerical. Total number of night calls.
* total_night_charge: numerical. Total charge of night calls.
* total_intl_minutes: numerical. Total minutes of international calls.
* total_intl_calls: numerical. Total number of international calls.
* total_intl_charge: numerical. Total charge of international calls
* number_customer_service_calls: numerical. Number of calls to customer service
* churn: (yes/no) Customer churn - target variable.

In [None]:
# Read the labelled training data; `df` is the frame used for the exploration and model fitting below.
df = pd.read_csv("/kaggle/input/customer-churn-prediction-2020/train.csv")

In [None]:
# Read the competition test set: predictions are made on these rows and written to the submission file.
test = pd.read_csv('/kaggle/input/customer-churn-prediction-2020/test.csv')

In [None]:
# Preview the first 5 records of the training data.
df.head()

In [None]:
# Column dtypes and non-null counts of the training data.
df.info()

In [None]:
# Summary statistics of the training data.
df.describe()

In [None]:
#Function to get summary statistics for categorical variable.

def dataQuality(data):
    """Summarise the categorical (object-dtype) columns of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to profile. Only the columns returned by
        ``select_dtypes(include=[object])`` are reported.

    Returns
    -------
    dict
        ``{'categorical': DataFrame}`` with one row per object column and the
        columns ``Count`` (non-null values), ``Unique`` (distinct values; a
        missing value counts as one), ``Miss_percent`` (fraction of missing
        values, between 0 and 1) and ``Freq_Level`` (most frequent value, NaN
        when the column holds no non-null value). The frame has no rows when
        ``data`` has no object columns.
    """
    stat_names = ["Count", "Unique", "Miss_percent", "Freq_Level"]

    def cat_quality(data):
        #select only categorical data types
        categorical = data.select_dtypes(include=[object])
        rows = []
        for name in categorical.columns:
            column = categorical[name]
            # value_counts() is already ordered from most to least frequent.
            level_counts = column.value_counts()
            rows.append([
                column.count(),
                len(column.unique()),
                column.isnull().mean(),
                # An all-null column has no levels; report NaN instead of failing.
                level_counts.index[0] if len(level_counts) else np.nan,
            ])
        # Build the frame once, after the loop, so a frame without object
        # columns yields an empty summary rather than an unbound variable.
        return pd.DataFrame(rows, index=list(categorical.columns),
                            columns=stat_names, dtype=object)

    return {'categorical': cat_quality(data)}

In [None]:
# Profile the object-dtype columns of the training data (count, distinct values, missing share, most frequent level).
(dataQuality(df)['categorical'])

In [None]:
# Class balance of the target: one bar per churn value, labelled with its share of all customers.
# The style is set before the figure is created; set_style only affects axes created afterwards.
sns.set_style('ticks')
plt.figure(figsize=(5,5))
splot=sns.countplot(data=df,x='churn',palette='GnBu')
total = float(len(df))
for p in splot.patches:
    height = p.get_height()
    # Skip patches without a positive height (empty or placeholder bars) so no stray label is drawn.
    if not height > 0:
        continue
    percentage = '{:.1f}%'.format(100 * height/total)
    # Anchor the label at the horizontal centre of the bar, sitting on its top edge.
    x = p.get_x() + p.get_width() / 2
    splot.annotate(percentage,(x,height),ha = 'center', va = 'bottom')
plt.title("Churns")
plt.xlabel("Churn")
plt.ylabel("Number of customers")
plt.show()

#### The plot shows that about 14% of the customers in the training data have churned.

In [None]:
# Box plot of account_length split by churn outcome.
# catplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() would only leave an empty extra figure; the size is set via height/aspect.
sns.catplot(data=df[['account_length','churn']],x='churn',y='account_length',kind="box",palette='GnBu',height=5,aspect=1)
plt.title("Loyalty and customer churn")
plt.ylabel("Number of months with operator")
plt.show()

#### From the above box plot we do not see any notable difference in account_length between customers who churned and those who did not.

In [None]:
# One box plot per call-charge column, each split by churn outcome.
charge_plots = [
    ('total_day_charge', "Day call charges Vs Churn"),
    ('total_eve_charge', "Evening call charges Vs Churn"),
    ('total_night_charge', "Night call charges Vs Churn"),
    ('total_intl_charge', "International call charges Vs Churn"),
]
for charge_column, plot_title in charge_plots:
    # Each catplot call creates its own figure; title and label apply to that figure's axes.
    sns.catplot(data=df[[charge_column,'churn']],x='churn',y=charge_column,kind="box",palette='GnBu')
    plt.title(plot_title)
    plt.ylabel("Call charges in USD")

#### Conclusion:
From the above box plots we can see that the median 'day call charge' is higher for the customers who churned.\
Higher day call charges may therefore be one of the significant reasons why customers have left.\
For the rest of the call charges the distributions are more or less the same for customers who churned and those who did not.

### Which locations have the maximum number of customer churns?

In [None]:
# Create a dataframe with, per state, the number of churned customers, the
# total number of customers and the resulting churn rate.
# Both counts come from one groupby, so they are matched by state label rather
# than by row position; a state without any churned customer stays in the
# table with churn = 0 instead of shifting the totals of the states after it.
churn_flags = df[['state']].assign(churned=df['churn'].eq('yes'))
state_churn = (
    churn_flags
    .groupby('state', as_index=False)
    .agg(churn=('churned', 'sum'), total_cust=('churned', 'count'))
)
# Stored as a fraction between 0 and 1.
state_churn['%churn'] = state_churn['churn'] / state_churn['total_cust']

In [None]:
# Choropleth of the churn rate per US state.
churn_map = go.Choropleth(
    locations=state_churn['state'],           # state codes matched against the 'USA-states' location mode
    z=state_churn['%churn'].astype(float),    # value that drives the colour of each state
    locationmode='USA-states',
    colorscale='GnBu',
    colorbar_title="churn percentage",
)
fig = go.Figure(data=churn_map)

# Restrict the map to the USA and add the title.
fig.update_layout(title_text='Customer churns by State', geo_scope='usa')

fig.show()

#### Conclusion:
New Jersey has the highest percentage of customer churn (27%), followed by California (25%) and Washington (22%).


### Which area code has the maximum customer churn?

In [None]:
# Customers per area code, split by churn outcome; each bar is labelled with its share of ALL customers.
# The style is set before the figure is created; set_style only affects axes created afterwards.
sns.set_style('ticks')
plt.figure(figsize=(5,5))
splot=sns.countplot(data=df,x='area_code',palette='GnBu',hue = 'churn')
total = float(len(df))
for p in splot.patches:
    height = p.get_height()
    # Skip patches without a positive height (empty or placeholder bars) so no stray label is drawn.
    if not height > 0:
        continue
    percentage = '{:.1f}%'.format(100 * height/total)
    # Anchor the label at the horizontal centre of the bar, sitting on its top edge.
    x = p.get_x() + p.get_width() / 2
    splot.annotate(percentage,(x,height),ha = 'center', va = 'bottom')
plt.title("Customer churns in different area codes")
plt.xlabel('Area code')
plt.ylabel('Number of customers')
plt.xticks(rotation=45)
plt.show()

#### Conclusion:
Area code 415 has the most churned customers: they make up 6.8% of all customers in the data.

In [None]:
# Customers with/without an international plan, split by churn outcome; each bar is labelled with its share of ALL customers.
# The style is set before the figure is created; set_style only affects axes created afterwards.
sns.set_style('ticks')
plt.figure(figsize=(5,5))
splot=sns.countplot(data=df,x='international_plan',palette='GnBu',hue = 'churn')
total = float(len(df))
for p in splot.patches:
    height = p.get_height()
    # Skip patches without a positive height (empty or placeholder bars) so no stray label is drawn.
    if not height > 0:
        continue
    percentage = '{:.1f}%'.format(100 * height/total)
    # Anchor the label at the horizontal centre of the bar, sitting on its top edge.
    x = p.get_x() + p.get_width() / 2
    splot.annotate(percentage,(x,height),ha = 'center', va = 'bottom')
plt.xlabel('International Plan?')
plt.ylabel('Number of customers')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Customers with/without a voice mail plan, split by churn outcome; each bar is labelled with its share of ALL customers.
# The style is set before the figure is created; set_style only affects axes created afterwards.
sns.set_style('ticks')
plt.figure(figsize=(5,5))
splot=sns.countplot(data=df,x='voice_mail_plan',palette='GnBu',hue = 'churn')
total = float(len(df))
for p in splot.patches:
    height = p.get_height()
    # Skip patches without a positive height (empty or placeholder bars) so no stray label is drawn.
    if not height > 0:
        continue
    percentage = '{:.1f}%'.format(100 * height/total)
    # Anchor the label at the horizontal centre of the bar, sitting on its top edge.
    x = p.get_x() + p.get_width() / 2
    splot.annotate(percentage,(x,height),ha = 'center', va = 'bottom')
plt.xlabel('Voice mail plan?')
plt.ylabel('Number of customers')
plt.xticks(rotation=45)
plt.show()

### Do customer service calls have an impact on churn?

In [None]:
# Box plot of the number of customer service calls split by churn outcome.
# catplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() would only leave an empty extra figure; the size is set via height/aspect.
sns.catplot(data=df[['number_customer_service_calls','churn']],x='churn',y='number_customer_service_calls',kind="box",palette='GnBu',height=5,aspect=1)
plt.title("Customer support and churn")
plt.ylabel("Number of customer support calls")
plt.show()

#### Conclusion:
It seems that customers whose issues were not resolved properly by the customer service agents eventually churned. More than 50% of the churned customers had called customer service at least twice, with the maximum number of calls being 9.

In [None]:
# Look at the frame again before the yes/no columns are encoded.
df.head()

### Convert yes and no values to 1 and 0

In [None]:
# Encode the yes/no plan flags as 1/0 in both the training and the competition frame.
plan_columns = ['international_plan', 'voice_mail_plan']
for frame in (df, test):
    for plan_column in plan_columns:
        is_yes = frame[plan_column].str.contains('yes')
        frame[plan_column] = np.where(is_yes, 1, 0)

# The target is encoded the same way; only the training frame is touched here.
churned = df['churn'].str.contains('yes')
df['churn'] = np.where(churned, 1, 0)

In [None]:
# Check the encoded columns of the training data.
df.head()

In [None]:
# Check the encoded columns of the competition test data.
test.head()

In [None]:
# Separate the target from the predictors.
# y stays a one-column DataFrame; later cells select y_train['churn'] from it.
y = df[['churn']]
X = df.drop(columns='churn')
X.head()

### One hot encoding to create dummy variables

In [None]:
# One-hot encode the two nominal columns in both frames.
categorical_columns = ['state','area_code']
X=pd.get_dummies(X,columns=categorical_columns)
test=pd.get_dummies(test,columns=categorical_columns)

# The frames are encoded separately, so each only gets dummy columns for the
# categories it actually contains. Add every training column that the
# competition frame lacks as an all-zero column; otherwise selecting the
# training columns from `test` later fails with a KeyError. Columns that exist
# only in `test` are kept untouched.
missing_in_test = X.columns.difference(test.columns)
test = test.reindex(columns=list(test.columns) + list(missing_in_test), fill_value=0)

### Splitting the data set to train and test

In [None]:
# Hold out part of the labelled data for evaluation. The classes are imbalanced,
# so the split is stratified on the target to keep the same churn rate in both
# parts; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y['churn'], random_state=41)

### SMOTE with random undersampling to handle the imbalanced data set
As we have an imbalanced dataset with 86 percent data where customer has not churned, the model prediction might get biased. If a model only predicts that the customers have not churned, then the accuracy of the model would be 86%. To get the dataset balanced we have implemented SMOTE.
We first oversampled the minority class and then undersampled the majority class

In [None]:
# Rebalance the training split only; X_test / y_test are left untouched.
# Both samplers are stochastic, so they are seeded to make the resampled
# training set (and every score derived from it) reproducible.
over=SMOTE(sampling_strategy = 0.2, random_state=41)
under=RandomUnderSampler(sampling_strategy=0.6, random_state=41)
# Oversample the minority class first, then undersample the majority class.
steps=[('o',over),('u',under)]
pipeline=Pipeline(steps=steps)
# NOTE(review): this overwrites X_train / y_train, so re-running the cell
# resamples already resampled data - re-run the split cell first.
X_train,y_train=pipeline.fit_resample(X_train,y_train)

In [None]:
# Class counts of the training target after the resampling step.
sns.countplot(x='churn',data=y_train,palette='YlGnBu')

### Logistic Regression

In [None]:
# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define search space
# Only solver/penalty pairs that the solvers support are listed. A full cross
# product also contains unsupported pairs (e.g. 'newton-cg' with 'l1', or
# 'elasticnet' with any of these solvers), whose fits fail and only add
# warnings and wasted work to the search.
c_values = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
space = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    # C is the regularisation strength, so it is not varied when no penalty is applied.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none']},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
]
# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)
# execute search
result = search.fit(X_train, y_train['churn'])

In [None]:
# Summarize the logistic-regression search: best mean cross-validated accuracy
# and the parameter combination that achieved it.
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Accuracy of the best logistic-regression candidate on the hold-out split
# (the search was configured with scoring='accuracy').
print("score of the LogisticRegression: ",result.score(X_test,y_test))

### Random forest classifier

In [None]:
# Tune a random forest with a 5-fold cross-validated grid search.
# The estimator is seeded so the search result is reproducible.
forest=RandomForestClassifier(random_state=1)
grid = dict()
grid['n_estimators'] = [4,5,6,7]
# 'auto' was an alias of 'sqrt' for classifiers (and no longer exists in newer
# scikit-learn releases), so listing both evaluated every candidate twice.
grid['max_features'] = ['sqrt', 'log2']
grid['min_samples_leaf'] = [3,4,5]
grid['criterion']=['gini','entropy']
# No separate forest.fit() beforehand: GridSearchCV fits its own copies of the
# estimator, so a prior fit is discarded work.
cv = KFold(n_splits=5,random_state=1,shuffle=True)
search = GridSearchCV(forest, grid, scoring='accuracy', n_jobs=-1, cv=cv,verbose=1)
best_model = search.fit(X_train, y_train['churn'])

In [None]:
# Summarize the random-forest search: best mean cross-validated accuracy
# and the parameter combination that achieved it.
print('Best Score: %s' % best_model.best_score_)
print('Best Hyperparameters: %s' % best_model.best_params_)

In [None]:
# Accuracy of the best random-forest candidate on the hold-out split.
print("score of the Randomforest: ",best_model.score(X_test,y_test))

#### Let's have a look at the confusion matrix and the classification report. Even though the accuracy is 88%, the precision and recall for the positive class are not good.

In [None]:
# Confusion matrix of the tuned random forest on the hold-out split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 (ConfusionMatrixDisplay.from_estimator replaces it) - confirm
# the scikit-learn version of the target environment.
plot_confusion_matrix(best_model, 
                      X_test, 
                      y_test,
                      values_format='d',
                      cmap='inferno',
                      display_labels=["Did not leave", "Left"])

In [None]:
# Per-class precision, recall and F1 of the tuned random forest on the hold-out split.
print(classification_report(y_test,best_model.predict(X_test),target_names=['Did not leave','left']))

### XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Hold back part of the (resampled) training data for early stopping.
# Monitoring the metric on the very rows the model is fitted on cannot detect
# overfitting, so a separate validation part is used. The hold-out split
# X_test / y_test is deliberately not used here, so it stays untouched for the
# final evaluation.
X_fit, X_valid, y_fit, y_valid = train_test_split(
        X_train, y_train['churn'],
        test_size=0.2,
        stratify=y_train['churn'],
        random_state=41)
clf=xgb.XGBClassifier( 
        n_estimators=2000,
        max_depth=12, 
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        missing=-1, 
        eval_metric='auc')
# Training stops once the validation AUC has not improved for 100 rounds.
clf_best = clf.fit(X_fit, y_fit, 
        eval_set=[(X_valid, y_valid)],
        verbose=50, early_stopping_rounds=100)

#### Let's now check the confusion matrix and the classification report. There is certainly a great improvement: the accuracy is now 99%, and the precision and recall have improved drastically.

In [None]:
# Confusion matrix of the XGBoost classifier on the hold-out split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 (ConfusionMatrixDisplay.from_estimator replaces it) - confirm
# the scikit-learn version of the target environment.
plot_confusion_matrix(clf_best, 
                      X_test, 
                      y_test,
                      values_format='d',
                      cmap='inferno',
                      display_labels=["Did not leave", "left"])

In [None]:
# Per-class precision, recall and F1 of the XGBoost classifier on the hold-out split.
print(classification_report(y_test,clf_best.predict(X_test),target_names=['Did not leave','left']))

### Save model

In [None]:
import pickle
# Persist the fitted classifier. The context manager closes the file even if
# pickling fails, so the handle is not leaked and the data is flushed to disk.
with open('churn_prediction_model.pkl','wb') as model_file:
    pickle.dump(clf_best,model_file)

In [None]:
# Features used in training the model (same columns, same order as the training frame).
cols = X_test.columns
# Use the model to make predictions; selecting `cols` leaves out any column of
# `test` that is not a training feature.
predicted = clf_best.predict(test[cols])

### Result Submission

In [None]:
# One row per competition id together with its predicted label.
# NOTE(review): `predicted` holds the 1/0 encoding used for training - confirm
# the competition does not expect the original yes/no labels.
submission = test[['id']].assign(churn=predicted)

# The CSV is written to the notebook's working directory, ready for upload.
filename = 'churn.csv'
submission.to_csv(filename, index=False)

print('Saved file: ' + filename)