In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os


"You can analyze all relevant customer data and develop focused customer retention programs." [IBM Sample Data Sets]
Each row represents a customer, each column contains customer’s attributes described on the column Metadata.
The raw data contains 7043 rows (customers) and 21 columns (features).
The “Churn” column is our target.

Kaggle dataset: https://www.kaggle.com/blastchar/telco-customer-churn

## Dataset Description:

**customerID:** Customer ID

**gender:** Customer gender (female, male)

**SeniorCitizen:** Whether the customer is a senior citizen or not (1, 0)

**Partner:** Whether the customer has a partner or not (Yes, No)

**Dependents:** Whether the customer has dependents or not (Yes, No)

**tenure:** Number of months the customer has stayed with the company

**PhoneService:** Whether the customer has a phone service or not (Yes, No)

**MultipleLines:** Whether the customer has multiple lines or not (Yes, No, No phone service)

**InternetService:** Customer’s internet service provider (DSL, Fiber optic, No)

**OnlineSecurity:** Whether the customer has online security or not (Yes, No, No internet service)

**OnlineBackup:** Whether the customer has online backup or not (Yes, No, No internet service)

**DeviceProtection:** Whether the customer has device protection or not (Yes, No, No internet service)

**TechSupport:** Whether the customer has tech support or not (Yes, No, No internet service)

**StreamingTV:** Whether the customer has streaming TV or not (Yes, No, No internet service)

**StreamingMovies:** Whether the customer has streaming movies or not (Yes, No, No internet service)

**Contract:** The contract term of the customer (Month-to-month, One year, Two year)

**PaperlessBilling:** Whether the customer has paperless billing or not (Yes, No)

**PaymentMethod:** The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))

**MonthlyCharges:** The amount charged to the customer monthly

**TotalCharges:** The total amount charged to the customer

**Churn:** Whether the customer churned or not (Yes or No)


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
    
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

In [None]:
# Read the Telco customer-churn CSV (relative path under ../input) and
# preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../input/WA_Fn-UseC_-Telco-Customer-Churn.csv')
data.head(5)

In [None]:
# Class balance of the target column.
data.loc[:, 'Churn'].value_counts()

In [None]:
# (n_rows, n_columns) of the loaded frame.
data.shape

In [None]:
# Per-column dtypes as inferred by pandas when the CSV was read.
data.dtypes

As we can see from the data, the variable `TotalCharges` is numerical, but pandas reads it as type `object`, so we convert it back to a numerical type.

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising.
data['TotalCharges'] = pd.to_numeric(arg=data['TotalCharges'], errors='coerce')

In [None]:
# Number of missing values per column (isna is the same check as isnull).
data.isna().sum()

## Univariate Analysis

Univariate analysis is the simplest form of analyzing data. “Uni” means “one”; in other words, the data has only one variable. It doesn’t deal with causes or relationships (unlike regression), and its major purpose is to describe: it takes data, summarizes it, and finds patterns in it. Source: [here](https://www.statisticshowto.datasciencecentral.com/univariate/)


### Categorical Variables:
   1. customerID
   2. gender
   3. Partner
   4. Dependents
   5. PhoneService
   6. MultipleLines
   7. InternetService
   8. OnlineSecurity
   9. OnlineBackup
   10. DeviceProtection
   11. TechSupport
   12. StreamingTV
   13. StreamingMovies
   14. Contract
   15. PaperlessBilling
   16. PaymentMethod
   17. Churn


In [None]:
# Every column whose dtype is still 'object' is treated as categorical.
categorical_var = [name for name, dtype in data.dtypes.items() if dtype == 'object']
print(len(categorical_var))
print(categorical_var)

In [None]:
# customerID is an identifier, not a feature, so exclude it from the
# categorical list. Filtering (rather than list.remove) keeps this cell safe
# to re-run: remove() raises ValueError once the name is already gone.
categorical_var = [col for col in categorical_var if col != 'customerID']

In [None]:
# One count plot per categorical column, filled row by row on a 6x3 grid.
count_columns = [
    'gender', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
    'Churn',
]

fig, ax = plt.subplots(6, 3, figsize=(12, 20))
flat_axes = ax.ravel()

for axis, column in zip(flat_axes, count_columns):
    # Pass the column with x=: recent seaborn releases take `data` as the
    # first positional argument and make the remaining ones keyword-only,
    # so a positional Series is no longer interpreted as the x variable.
    sns.countplot(x=data[column], ax=axis)

# The grid has more axes than plots; hide the leftovers instead of showing
# empty frames.
for axis in flat_axes[len(count_columns):]:
    axis.set_visible(False)

fig.tight_layout()
# plt.show() instead of fig.show(): the latter warns on non-GUI (inline)
# notebook backends.
plt.show()

### Continuous Variables

1. SeniorCitizen
2. tenure
3. MonthlyCharges
4. TotalCharges

In [None]:
# Numeric columns analysed as continuous in the sections below.
continuous_var = [
    'SeniorCitizen',
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]
# Summary statistics of the numeric columns.
data.describe()

Finally, to get a quick glimpse of all continuous variables in a data set, let's plot histograms for all numeric variables to determine if all variables are skewed. 

In [None]:
# Reshape to long form (one row per variable/value pair) so FacetGrid can draw
# one distribution panel per continuous variable, each with its own axes.
nd = data.melt(value_vars=continuous_var)
n1 = sns.FacetGrid(
    nd,
    col='variable',
    col_wrap=2,
    sharex=False,
    sharey=False,
).map(sns.distplot, 'value')
n1

## Bivariate Analysis

Bivariate analysis deals with the analysis of two variables, to find the relationship between them. There are three cases which we will discuss here.

### Continuous -Continuous Variables :
    1. Scatter plots
    2. Correlation

### Categorical-Categorical Variables:
    1. cross tables
    2. stacked column charts
    3. chi-square test
    
### Categorical-Continuous Variables:
    1. z-test/t-test
    2. ANOVA test

### 1. Continuous-Continuous Variables

In [None]:
# Pairwise Pearson correlation (the DataFrame.corr default) between the
# continuous columns, shown as a heat map.
corr = data.loc[:, continuous_var].corr(method='pearson')
sns.heatmap(data=corr)

In [None]:
# Correlation of each continuous column with TotalCharges, strongest first.
print(corr.loc[:, 'TotalCharges'].sort_values(ascending=False), '\n')

We can say that the variables `TotalCharges` and `tenure` are highly positively correlated.

In [None]:
# Scatter plot with marginal distributions for TotalCharges against tenure.
sns.jointplot(x='TotalCharges', y='tenure', data=data)

Here we can also see the direct correlation between these two variables. We can also see that there are no outliers present in them.

### 2. Categorical-Categorical Variables

In [None]:
# Print the joint frequency of every categorical feature with the target.
for var in categorical_var:
    if var == 'Churn':
        # The target is the second grouping key already; skip it as a feature.
        continue
    test = data.groupby([var, 'Churn'])
    print(test.size(), '\n\n')

### Chi Square test


In [None]:
import scipy.stats as stats
from scipy.stats import chi2_contingency

class ChiSquare:
    """Chi-square test of independence between two columns of a DataFrame.

    After TestIndependence has run, the instance exposes the p-value (p), the
    test statistic (chi2), the degrees of freedom (dof) and the observed and
    expected contingency tables (dfObserved, dfExpected).
    """

    def __init__(self, dataframe):
        """Keep a reference to the frame; all results start out as None."""
        self.df = dataframe
        self.p = self.chi2 = self.dof = None
        self.dfObserved = self.dfExpected = None

    def _print_chisquare_result(self, colX, alpha):
        """Print whether colX is significant at level alpha, using self.p."""
        if self.p < alpha:
            template = "{0} is IMPORTANT for Prediction"
        else:
            template = "{0} is NOT an important predictor. (Discard {0} from model)"
        print(template.format(colX))

    def TestIndependence(self, colX, colY, alpha=0.05):
        """Run the chi-square test of colX against colY and print the verdict.

        Both columns are compared as strings. The contingency table has colY
        values as rows and colX values as columns.
        """
        feature = self.df[colX].astype(str)
        outcome = self.df[colY].astype(str)

        observed = pd.crosstab(outcome, feature)
        self.dfObserved = observed

        chi2, p, dof, expected = stats.chi2_contingency(observed.values)
        self.p, self.chi2, self.dof = p, chi2, dof

        # Expected counts carry the same labels as the observed table.
        self.dfExpected = pd.DataFrame(
            expected, columns=observed.columns, index=observed.index
        )

        self._print_chisquare_result(colX, alpha)

df = data
# Initialize ChiSquare class
cT = ChiSquare(df)

# Feature selection: test every categorical feature against the target.
# The target itself is skipped: testing Churn against Churn is trivially
# significant and says nothing about predictors.
for var in categorical_var:
    if var == "Churn":
        continue
    cT.TestIndependence(colX=var, colY="Churn")

### 3. Categorical-Continuous Variables

#### ANOVA test

In [None]:
# ANOVA test
import scipy.stats as stats
    
for var in continuous_var:
    # TotalCharges may still contain NaN here (values coerced when the column
    # was parsed; they are only filled in the later missing-value step).
    # f_oneway yields nan if any sample contains NaN, so drop missing values
    # per group before testing.
    churned = data.loc[data['Churn'] == 'Yes', var].dropna()
    retained = data.loc[data['Churn'] == 'No', var].dropna()
    result = stats.f_oneway(churned, retained)
    print(var)
    print(result)

#### t-test/z-test


In [None]:
from sklearn.feature_selection import SelectKBest
from scipy.stats import ttest_ind

# Welch's t-test (equal_var=False) of each continuous variable between the
# two churn groups; the per-variable results are also collected in t_stat.
t_stat = []
for var in continuous_var:
    # Drop NaN per group: with missing values present (TotalCharges is only
    # filled in the later missing-value step) ttest_ind returns nan.
    var_no_churn = data.loc[data["Churn"] == "No", var].dropna()
    var_yes_churn = data.loc[data["Churn"] == "Yes", var].dropna()
    t_value = ttest_ind(var_no_churn, var_yes_churn, equal_var=False)
    print(var)
    print(t_value)
    t_stat.append(t_value)

## Missing Value treatment


In [None]:
# Missing values per column before imputation (isna is the same check as isnull).
data.isna().sum()

In [None]:
# Fill missing TotalCharges with the column mean. Assign the result back
# instead of calling fillna(inplace=True) on the selected column: the in-place
# call on a column selection is chained assignment, which warns in recent
# pandas and leaves `data` unchanged under Copy-on-Write.
data['TotalCharges'] = data['TotalCharges'].fillna(data['TotalCharges'].mean())

## Feature selection / extraction
 There are three ways of performing feature selection:
 1. Filter based methods
 2. Wrapper based methods
 3. Embedded methods
 
### 1. Filter based methods
Statistical tests can be used to select the features that have the strongest relationship with the output variable; features are kept or discarded based on the outcome of the test. Statistical tests that can be used for feature selection include:
  - chi-square test
  - z/t test
  - ANOVA

### 2. Wrapper based methods
  - Forward selection
  - Backward elimination
  - Recursive Feature elimination
  
### 3. Embedded methods
  - LASSO
  - Elastic net
  - Ridge regression


We will now encode all the categorical variables as numeric values, since some ML algorithms do not work directly with categorical variables. Each column is converted to the pandas `category` dtype and then replaced by its integer category codes (the same kind of integer encoding that sklearn's `LabelEncoder` produces).

In [None]:
# Columns that will be integer-encoded in the next cells.
categorical_var

Convert all the categorical variables to numerical form using their pandas category codes.

In [None]:
# Cast every string column to pandas' categorical dtype so that its integer
# category codes can be read off afterwards.
for var in categorical_var:
    data[var] = pd.Categorical(data[var])

In [None]:
# Replace each categorical column by the integer code of its category.
for var in categorical_var:
    data[var] = data[var].cat.codes

In [None]:
target = data['Churn']
# errors='ignore' keeps this cell safe to re-run: a plain drop raises KeyError
# once customerID has already been removed from the frame.
data = data.drop('customerID', axis=1, errors='ignore')
# Every remaining column except the target is a candidate feature.
all_columns = [col for col in data.columns if col != 'Churn']

In [None]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X = data[all_columns] # Features
y = data['Churn'] # Target variable

# Recursive feature elimination: repeatedly fit the logistic regression and
# discard the weakest feature until 8 remain.
model = LogisticRegression()
# n_features_to_select must be passed by keyword; current scikit-learn
# releases reject it as a positional argument.
rfe = RFE(model, n_features_to_select=8)
fit = rfe.fit(X, y)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

In [None]:
# Boolean mask from RFE (one flag per column of X), copied into a plain list.
selected_features_rfe = [flag for flag in fit.support_]


## Model building

### using RFE + logistic_regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Names of the columns that RFE flagged as selected.
final_features_rfe = [
    var
    for status, var in zip(selected_features_rfe, all_columns)
    if status
]

final_features_rfe

In [None]:
# Logistic regression trained and evaluated on the RFE-selected columns only.
y = data['Churn']
X_rfe_lr = data[final_features_rfe]

(X_train_rfe_lr, X_test_rfe_lr,
 y_train_rfe_lr, y_test_rfe_lr) = train_test_split(
    X_rfe_lr, y, test_size=0.25, random_state=0
)

# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression().fit(X_train_rfe_lr, y_train_rfe_lr)
y_pred_rfe_lr = lr_model.predict(X_test_rfe_lr)

acc_rfe_lr = metrics.accuracy_score(y_true=y_test_rfe_lr, y_pred=y_pred_rfe_lr)
print("Accuracy: ",acc_rfe_lr)


### Logistic regression

In [None]:
# Baseline: logistic regression (default parameters) on all feature columns,
# with the same 75/25 split settings as the RFE model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# fit() returns the estimator itself, so construction and training are chained.
lr_model_single = LogisticRegression().fit(X_train, y_train)
y_pred = lr_model_single.predict(X_test)

lr_acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: ",lr_acc)

## Performance Measures


In [None]:
# import the metrics class
from sklearn import metrics
# Confusion matrix of the all-features logistic regression on its test split
# (rows: true class, columns: predicted class).
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
from sklearn.metrics import roc_curve, auc

# A ROC curve needs continuous scores: hard 0/1 predictions collapse it to a
# single operating point. Use the predicted probability of the positive class
# (column 1 of predict_proba) for each model.
scores_rfe_lr = lr_model.predict_proba(X_test_rfe_lr)[:, 1]
scores_lr = lr_model_single.predict_proba(X_test)[:, 1]

# Score each model against the labels of its own test split.
fpr_1, tpr_1, thresholds = roc_curve(y_test_rfe_lr, scores_rfe_lr)
fpr_2, tpr_2, thresholds = roc_curve(y_test, scores_lr)
roc_auc_1 = auc(fpr_1, tpr_1)
roc_auc_2 = auc(fpr_2, tpr_2)

In [None]:
import matplotlib.pyplot as plt

# ROC curves of both models on one figure. Each legend entry names its model
# so the two AUC values can be told apart.
plt.figure(figsize=(10,10))
plt.title('Receiver Operating Characteristic')
plt.plot(fpr_1, tpr_1, color='red',
         label='RFE + logistic regression (AUC = %0.2f)' % roc_auc_1)
plt.plot(fpr_2, tpr_2, color='green',
         label='Logistic regression, all features (AUC = %0.2f)' % roc_auc_2)
# Diagonal reference line: performance of a random classifier. Drawn before
# legend() so it appears in the legend too.
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.legend(loc = 'lower right')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()