In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py#visualization
py.init_notebook_mode(connected=True)#visualization
import plotly.graph_objs as go#visualization
import plotly.tools as tls#visualization
import plotly.figure_factory as ff#visualization


from sklearn import metrics
from sklearn.model_selection import GridSearchCV,KFold,train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from category_encoders import CatBoostEncoder, TargetEncoder

from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,roc_curve,classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# Any results you write to the current directory are saved as output.

# Read me please

### Is it a Telco Churn prediction model?

From my experience with telecom data so far, the dataset used here is not typical telco data, so a model derived from it is not really a telco model. I have therefore treated it as a generic churn model.

If you would like an idea of what the data behind a real telco churn model looks like, here is a reference list :)

1. Customer Relationship Management (CRM) data -- all customer GSMs, type of subscription, birthday, gender, the location of living and more
2. Location data -- the location of the transactions, giving the longitude and latitude, sub-area, area, city, and state, serving cell name etc.
3. complaints data -- all complaints submitted and statistics inquiries related to coverage, problems in offers and packages, and any problem related to the telecom business
4. Network logs data -- the internal sessions related to internet, calls, and SMS for each transaction in Telecom operator
5. Call details records (CDR) data -- all charging information about calls, SMS, MMS, and internet transaction made by customers
6. Handset information -- contains the brand, model, type of the mobile phone and if it’s dual or mono SIM device

Hope it helps!

https://journalofbigdata.springeropen.com/articles/10.1186/s40537-019-0191-6

This kernel builds on the kernel below, which is a good read on customer attrition:

https://www.kaggle.com/pavanraj159/telecom-customer-churn-prediction

# Data Overview

In [None]:
# Load the raw Telco customer churn CSV from the Kaggle input directory.
churn_data = pd.read_csv(r"/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")
print('''
        Details of the dataset
----------------------------------------

''')
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly -- wrapping it in print() would append a stray "None" line.
# `show_counts` is the current name of the former `null_counts` argument.
churn_data.info(verbose=True, show_counts=True, memory_usage='deep')

## Dataset Description

1. customerID Customer ID
2. gender Whether the customer is a male or a female
3. SeniorCitizen Whether the customer is a senior citizen or not (1, 0)
4. Partner Whether the customer has a partner or not (Yes, No)
5. Dependents Whether the customer has dependents or not (Yes, No)
6. tenure Number of months the customer has stayed with the company
7. PhoneService Whether the customer has a phone service or not (Yes, No)
8. MultipleLines Whether the customer has multiple lines or not (Yes, No, No phone service)
9. InternetService Customer’s internet service provider (DSL, Fiber optic, No)
10. OnlineSecurity Whether the customer has online security or not (Yes, No, No internet service)
11. OnlineBackup Whether the customer has online backup or not (Yes, No, No internet service)
12. DeviceProtection Whether the customer has device protection or not (Yes, No, No internet service)
13. TechSupport Whether the customer has tech support or not (Yes, No, No internet service)
14. StreamingTV Whether the customer has streaming TV or not (Yes, No, No internet service)
15. StreamingMovies Whether the customer has streaming movies or not (Yes, No, No internet service)
16. Contract The contract term of the customer (Month-to-month, One year, Two year)
17. PaperlessBilling Whether the customer has paperless billing or not (Yes, No)
18. PaymentMethod The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
19. MonthlyCharges The amount charged to the customer monthly
20. TotalCharges The total amount charged to the customer
21. Churn Whether the customer churned or not (Yes or No)

In [None]:
# Raise pandas' display limits (column width and column count), then preview
# the first ten rows of the dataset.
for option_name, option_value in (('display.max_colwidth', 500),
                                  ('display.max_columns', 100)):
    pd.set_option(option_name, option_value)
churn_data.head(10)

In [None]:
# Summary statistics for every column; include='all' covers non-numeric columns as well.
churn_data.describe(include='all')

# Exploratory Data Analysis (EDA)

Imputation (handling missing values) isn't a headache in this dataset, so instead we worry about...

a. whether balanced/imbalanced problem?

b. which features have high +/- ve correlations? 

c. what manipulations will we do to the dataset?

d. how is the distribution of each feature?

e. how to build initial model?

In [None]:
# Count customers per Churn value once; the index supplies the slice labels
# and the counts supply the slice sizes.
churn_counts = churn_data['Churn'].value_counts()
label = churn_counts.keys().tolist()
value = churn_counts.tolist()

# Donut chart of the class balance.
slice_outline = dict(color="white", width=2.5)
slice_marker = dict(colors=['lime', 'red'], line=slice_outline)
data = go.Pie(
    labels=label,
    values=value,
    marker=slice_marker,
    rotation=90,
    hoverinfo="label+value+text",
    hole=.5,
)
layout = go.Layout(
    title="Customer Churn Proportion",
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

## Manipulation

In [None]:
# Encode the target as 0/1 only while it still holds the raw 'Yes'/'No' labels.
# Without this guard, re-running the cell would compare the already-encoded
# integers with 'Yes' and silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(churn_data['Churn']):
    churn_data['Churn'] = np.where(churn_data['Churn'] == 'Yes', 1, 0)

# TotalCharges: single-space entries are treated as missing, rows with any
# missing value are dropped, and the column is then cast to float.
# The frame is rebuilt in one chain rather than mutated in place.
churn_data = (
    churn_data
    .assign(TotalCharges=churn_data['TotalCharges'].replace(' ', np.nan))
    .dropna()
    .astype({'TotalCharges': float})
)

In [None]:
# Collapse 'No internet service' into plain 'No' for the columns listed below.
replace_cols = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]
# One vectorised replace over all listed columns at once.
churn_data[replace_cols] = churn_data[replace_cols].replace('No internet service', 'No')

## Correlation with Numerical Features

In [None]:
%matplotlib notebook
%matplotlib inline
def cor_heat(df):
    """Draw an annotated heat map of the pairwise correlations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric and boolean columns are correlated; columns of
        any other dtype are left out of the plot.

    Returns
    -------
    None
        The heat map is drawn on a newly created matplotlib figure.
    """
    # Select numeric/bool columns explicitly: DataFrame.corr() used to skip
    # other dtypes silently, whereas newer pandas raises on them.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    cor = numeric_df.corr()

    # Explicit figure/axes instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=(20, 7), dpi=100)
    sns.heatmap(data=cor, annot=True, square=True, linewidths=0.1,
                cmap='YlGnBu', ax=ax)
    ax.set_title("Pearson Co-relation for numerical features: Heat Map")
# Plot correlations for only the columns whose names match the regex
# (i.e. contain Senior, tenure, Charges or Churn).
cor_heat(churn_data.filter(regex='Senior|tenure|Charges|Churn'))

## Correlation with Categorical Features

In [None]:
def cor_categorical(col, data=None):
    """Return the churn rate for each category of ``col``, highest first.

    Parameters
    ----------
    col : str
        Name of the column to group by.
    data : pandas.DataFrame, optional
        Frame holding ``col`` and a 0/1-encoded ``Churn`` column. Defaults to
        the notebook-level ``churn_data``.

    Returns
    -------
    pandas.Series
        Indexed by the categories of ``col``; each value is the fraction of
        rows in that category with ``Churn`` equal to 1, sorted descending.
    """
    frame = churn_data if data is None else data
    # With a 0/1 target the group mean is the share of churned rows. Unlike
    # value_counts().unstack()[1], it reports 0.0 (not NaN) for a category with
    # no churners and does not fail when no row churned at all.
    churn_rate = frame.groupby(col)['Churn'].mean()
    return churn_rate.sort_values(ascending=False)

In [None]:
print('''
Categorical features correlation with predictor
-----------------------------------------------''')
# Columns to report on, in display order.
report_cols = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod',
]
divider = '-'*47
for position, report_col in enumerate(report_cols):
    # The divider goes between reports only, so none precedes the first one.
    if position > 0:
        print(divider)
    print(cor_categorical(report_col))

### Label encoding categorical features

In [None]:
def label_encoder(col):
    """Replace column ``col`` of the global ``churn_data`` with integer labels.

    A fresh ``LabelEncoder`` is fitted on the column and its encoded values are
    written back into the same column.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(churn_data[col])
    churn_data[col] = encoded_values

# Label-encode every column except the ID, the numeric measures and the target.
passthrough_cols = ['customerID','TotalCharges','tenure','MonthlyCharges','Churn']
cols_to_encode = churn_data.columns.drop(passthrough_cols).tolist()
for cols in cols_to_encode:
    label_encoder(cols)

# Model Building

In [None]:
# Features: every column except the target, indexed by customerID so the ID
# itself is not passed to the models as a feature.
X = churn_data.set_index('customerID').drop(columns=['Churn'])
# Target kept as a single-column frame.
y = churn_data[['Churn']]

# 80/20 split with a fixed seed, stratified on the target.
split_options = dict(test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
def model_building(model):
    """Fit ``model`` on the training split and print its test accuracy and ROC AUC.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``. If it also
        exposes ``predict_proba`` or ``decision_function``, that output is
        used as the score for the AUC.

    Returns
    -------
    None
        The two metrics are printed; the model is fitted in place.
    """
    # y_train / y_test are single-column frames; flatten them to the 1-D shape
    # that estimators and metrics expect.
    train_target = np.ravel(y_train)
    test_target = np.ravel(y_test)

    model.fit(X_train, train_target)
    pred = model.predict(X_test)

    # ROC AUC must be computed from a continuous score, not from hard class
    # labels. Prefer the positive-class probability, fall back to the decision
    # function, and only as a last resort to the predicted labels.
    if hasattr(model, "predict_proba"):
        scores = np.asarray(model.predict_proba(X_test))[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)
    else:
        scores = pred

    print("Accuracy of {0} : {1}".format(str(model), accuracy_score(test_target, pred)))
    print("AUC :", roc_auc_score(test_target, scores))

In [None]:
# Baseline: logistic regression fitted with the 'newton-cg' solver.
model_building(LogisticRegression(solver='newton-cg'))

In [None]:
# Same model with the 'liblinear' solver, for comparison with newton-cg.
model_building(LogisticRegression(solver='liblinear'))

In [None]:
# Gaussian naive Bayes with default settings.
model_building(GaussianNB())

In [None]:
# Bernoulli naive Bayes with default settings.
model_building(BernoulliNB())

In [None]:
# Support vector classifier with an RBF kernel.
model_building(SVC(kernel='rbf'))

In [None]:
# Support vector classifier with a linear kernel.
# NOTE(review): the features are not scaled before fitting, which may slow this down -- confirm.
model_building(SVC(kernel='linear'))

In [None]:
# verbose=0 silences CatBoost's per-iteration training log, which would
# otherwise flood the cell output.
model_building(CatBoostClassifier(eval_metric='AUC', verbose=0))

In [None]:
# XGBoost classifier with default hyper-parameters.
model_building(XGBClassifier())

In [None]:
# Fixed seed: SGD shuffles the training data between epochs, so without a
# random_state the reported scores change from run to run.
model_building(SGDClassifier(random_state=0))

In [None]:
# LightGBM classifier with default hyper-parameters.
model_building(LGBMClassifier())