In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the raw Telco churn table from the input directory.
DATA_PATH = '../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows (head() shows five rows by default).
df.head()

In [None]:
# The customer identifier is removed before any analysis or modelling.
df = df.drop(columns=['customerID'])

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too).
df.describe(include='all')

In [None]:
# Count missing values per column.
df.isna().sum()

# Exploratory Data Analysis

In [None]:
# Gender representation: pie chart of how many customers fall in each gender.
gender_counts = df['gender'].value_counts()
gender_counts.plot.pie()
plt.title("Gender Representation ")

In [None]:
# Tenure
# sns.distplot is deprecated and scheduled for removal from seaborn. The
# documented replacement for a histogram with a KDE overlay is histplot on a
# density scale; cut=3 keeps the KDE tails that distplot used to draw.
sns.histplot(df['tenure'], kde=True, stat='density', kde_kws={'cut': 3}, color='blue')
plt.xlim(-5,80)
plt.title("Tenure Plot")

# A large share of the customers are recent customers (low tenure).

In [None]:
# Compare the tenure distribution of churned and retained customers.
sns.boxplot(data=df, x="Churn", y="tenure")

In [None]:
# Share of each PhoneService value within every churn group
# (normalize='columns' makes each Churn column sum to 1).
phone_by_churn = pd.crosstab(df["PhoneService"], df['Churn'], normalize='columns')
phone_by_churn.plot.bar()
plt.title('There is not a huge impact that Phone service has on churn')

In [None]:
# Share of each MultipleLines value within every churn group.
lines_by_churn = pd.crosstab(df["MultipleLines"], df["Churn"], normalize="columns")
mulin = lines_by_churn.plot.bar()
plt.title('Surprisingly people with multiple lines are more likely to unsubscribe ')

In [None]:
# Share of each InternetService type within every churn group.
service_by_churn = pd.crosstab(df["InternetService"], df["Churn"], normalize="columns")
internet = service_by_churn.plot.bar()
plt.title("People with a fibre optic connection are likely to churn at a higher rate, while people at a DSL connection are more likely to be retained")

In [None]:
# Share of each OnlineSecurity value within every churn group.
# No title is set for this chart; the cell displays the returned Axes object.
security_by_churn = pd.crosstab(df["OnlineSecurity"], df["Churn"], normalize="columns")
internet = security_by_churn.plot.bar()
internet


In [None]:
# Share of each OnlineBackup value within every churn group.
backup_by_churn = pd.crosstab(df["OnlineBackup"], df["Churn"], normalize="columns")
internet = backup_by_churn.plot.bar()
plt.title("People with internet backup are less likely to churn")

In [None]:

# Share of each DeviceProtection value within every churn group.
protection_by_churn = pd.crosstab(df["DeviceProtection"], df["Churn"], normalize="columns")
internet = protection_by_churn.plot.bar()
plt.title("People with device protection are less likely to churn")
plt.show()


In [None]:

# Share of each TechSupport value within every churn group.
support_by_churn = pd.crosstab(df["TechSupport"], df["Churn"], normalize="columns")
internet = support_by_churn.plot.bar()
plt.title("People with techsupport protection are less likely to churn")
plt.show()

In [None]:
# Share of each Contract type within every churn group.
contract_by_churn = pd.crosstab(df["Contract"], df["Churn"], normalize="columns")
internet = contract_by_churn.plot.bar()
plt.title("Month to month charges have a much higher churn ratio")

In [None]:
# Compare the monthly charges of churned and retained customers.
sns.boxplot(data=df, x="Churn", y="MonthlyCharges")

# Feature Engineering

In [None]:
# Inspect column dtypes before any type conversion is applied.
df.dtypes

In [None]:
# Parse TotalCharges as numbers; values that cannot be parsed become NaN
# (errors='coerce') and the result is stored in the smallest float dtype that fits.
total_charges = pd.to_numeric(df["TotalCharges"], errors='coerce', downcast='float')
df['TotalCharges'] = total_charges

In [None]:
# Re-check dtypes after the TotalCharges conversion.
df.dtypes

In [None]:
# One-hot encode every categorical column, including the Churn target.
# pandas >= 2.0 returns bool dummy columns by default; ask for integers so the
# feature matrix and the 0/1 target stay numeric for the metrics used later.
categorical_columns = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod', 'Churn',
]
df = pd.get_dummies(df, columns=categorical_columns, dtype=int)

In [None]:
# For the two-level variables keep a single indicator column each by dropping
# the complementary dummy (e.g. keep Churn_Yes, drop Churn_No).
redundant_dummies = [
    'Churn_No',
    'gender_Female',
    'Partner_No',
    'Dependents_No',
    'PhoneService_No',
    'PaperlessBilling_No',
]
df.drop(columns=redundant_dummies, inplace=True)

# Classification Algorithms

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

In [None]:
# dropna() returns a new frame and leaves df untouched, so the result has to be
# assigned back; otherwise rows with missing values are still fed to the models.
df=df.dropna()

In [None]:
# TotalCharges is left out of the feature set.
df = df.drop(columns='TotalCharges')

In [None]:
# Select the target by name rather than by position: slicing off "the last
# column" silently breaks as soon as the column order changes.
TARGET = 'Churn_Yes'
X = df.drop(columns=[TARGET]).values
y = df[TARGET].values

In [None]:
# Hold out 20% of the rows for testing. The split is seeded so that every
# metric reported below is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# The features are not scaled (tenure and MonthlyCharges sit next to 0/1
# dummies), so give the solver more room than the default 100 iterations.
lr=LogisticRegression(max_iter=1000)
lr.fit(X_train,y_train)

In [None]:
# Predict churn labels for the held-out test split with the fitted logistic regression.
y_pred=lr.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. MSE and accuracy
# are symmetric, but the confusion matrix is transposed when the two are swapped
# (rows must be the true labels, columns the predictions).
print('MSE of Test Error with Logistic Regression:',mse(y_test,y_pred),'\n'
        'Accuracy Score with Logistic Regression',accuracy_score(y_test,y_pred),'\n'
        'Confusion Matrix with Logistic Regression', confusion_matrix(y_test,y_pred))

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are true labels, columns are
# predicted labels. Passing them the other way round transposes the matrix.
cm=confusion_matrix(y_test,y_pred)
cm

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped the
# precision and recall columns trade places.
print(classification_report(y_test,y_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Seed the tree: candidate features are permuted at each split, so an unseeded
# tree (and its feature importances) can differ from run to run.
dc=DecisionTreeClassifier(random_state=42)
dc.fit(X_train,y_train)
y_pred=dc.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order; swapping them
# transposes the confusion matrix.
print('MSE of Test Error with Decision Trees:',mse(y_test,y_pred),'\n'
        'Accuracy Score with Decision Tree',accuracy_score(y_test,y_pred),'\n'
        'Confusion Matrix with Decision Tree', confusion_matrix(y_test,y_pred))

In [None]:
# Per-feature importances of the fitted decision tree (one value per column of X).
features=dc.feature_importances_
# Display the array shape as a quick check of how many importances there are.
features.shape

In [None]:
# List the column names of the encoded frame.
df.columns

In [None]:
# Label each importance with its feature name. The names are taken from the
# frame itself instead of a hand-copied list, so they cannot drift out of sync
# with the encoded columns; pandas raises if the lengths do not match.
feature_names = df.columns.drop('Churn_Yes')
feat = pd.DataFrame(features, index=feature_names)

In [None]:
# Horizontal bar chart of the ten largest decision-tree importances.
top_tree_features = feat.sort_values(by=0, ascending=False).head(10)
top_tree_features.plot(kind='barh')

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest (bootstrap samples and feature sub-sampling are random) so
# the report and the importances below are reproducible.
rf=RandomForestClassifier(random_state=42)
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
print(classification_report(y_test,y_pred))


In [None]:
# Importances of the fitted random forest, labelled with feature names taken
# from the frame itself rather than a second hand-copied list of 39 strings.
feat2 = rf.feature_importances_
features = pd.DataFrame(feat2, index=df.columns.drop('Churn_Yes'))
# Horizontal bar chart of the ten most important features.
features.sort_values(by=0,ascending=False).head(10).plot(kind='barh')

# Very interesting! In the random forest, tenure has the highest feature importance. Furthermore, the accuracy of logistic regression is higher than that of the random forest. Let's run a randomized hyperparameter search.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate values for the randomized search over RandomForestClassifier.
estimators = range(50, 100)
# 'auto' was an alias of 'sqrt' for classifiers and was removed in
# scikit-learn 1.3, where it fails parameter validation; list valid options only.
max_features = ['sqrt', 'log2']
max_depth = range(4, 12)
min_samples_split = range(2, 8)
min_samples_leaf = range(1, 8)
bootstrap = [True, False]
criterion = ['gini', 'entropy']


In [None]:
# Map each RandomForestClassifier parameter name to its candidate values.
random_grid = dict(
    n_estimators=estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
    criterion=criterion,
)

In [None]:
# Randomized search with 5-fold cross-validation over random_grid. Both the
# forest and the parameter sampler are seeded so the selected configuration is
# reproducible. Note that rf is rebound from the forest to the search object.
rf=RandomForestClassifier(random_state=42)
rf=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,cv=5,verbose=1,random_state=42)

In [None]:
# Run the hyper-parameter search on the training split.
rf.fit(X_train,y_train)

In [None]:
# Parameter combination that scored best in the search.
best_estimate=rf.best_params_
# Display the chosen parameters.
best_estimate

In [None]:
# Fresh forest configured with the best parameters found by the search; seeded
# so the final report is reproducible.
rf2=RandomForestClassifier(**best_estimate, random_state=42)

In [None]:
# Fit the tuned forest on the training split.
rf2.fit(X_train,y_train)

In [None]:
# Predict churn labels for the test split with the tuned forest.
y_pred=rf2.predict(X_test)

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped the
# precision and recall columns trade places.
print(classification_report(y_test,y_pred))

# Thank you for your attention. Comparing the algorithms, we found that logistic regression was far better in terms of accuracy. Moreover, the most important features were contract type and monthly charges. A more detailed analysis would examine what determines monthly charges and how businesses can address these issues to retain customers.