In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import ensemble
from sklearn import linear_model
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore", category=DeprecationWarning) 


In [None]:
# Load the raw customer table.
data = pd.read_csv('../input/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Recode SeniorCitizen: 1 -> 'Yes', anything else -> 'No'.
data['SeniorCitizen'] = np.where(data['SeniorCitizen'] == 1, 'Yes', 'No')

# Cells that are exactly a single space become empty strings before parsing;
# presumably pd.to_numeric then turns them into NaN (they are imputed later).
total_charges_text = data['TotalCharges'].replace(" ", "")
data['TotalCharges'] = pd.to_numeric(total_charges_text)

In [None]:
# Feature matrix: everything except the row identifier and the target column.
X = data.drop(columns=['customerID', 'Churn'])
X.shape

In [None]:
# Preview the first three rows of the feature matrix.
X.head(3)

#### Split our data into the categorical and numerical features


In [None]:
# Partition the feature names by dtype in a single pass:
# 'object' columns are treated as categorical, everything else as numerical.
categorical_columns, numerical_columns = [], []
for column in X.columns:
    if X[column].dtype.name == 'object':
        categorical_columns.append(column)
    else:
        numerical_columns.append(column)

print('List of categorical columns: {:}.\n \nList of numerical: {:}'.format(categorical_columns, numerical_columns))


In [None]:
# Summary statistics for the categorical (object-dtype) features.
X[categorical_columns].describe()

In [None]:
# Summary statistics for the numerical features.
X[numerical_columns].describe()

#### As we can see, the categorical features have no missing values, but among the numerical features TotalCharges has 10 missing values. We can fill these missing values with the mean value of TotalCharges.

In [None]:
# Impute missing TotalCharges with the column mean (computed over the non-missing values).
total_charges_mean = X['TotalCharges'].mean()
X['TotalCharges'] = X['TotalCharges'].fillna(total_charges_mean)

#### Let's check linear dependence

In [None]:
# Pairwise correlation (DataFrame.corr defaults to Pearson) between the numeric columns.
# `data` still contains string columns (e.g. Churn, SeniorCitizen); recent pandas versions
# raise on them instead of silently skipping, so restrict to numeric columns explicitly.
data.select_dtypes(include='number').corr()

#### We can also look at scatter plots of these features

In [None]:
def plotfeatures(col1, col2):
    """Scatter-plot feature `col1` against `col2`, coloured by churn label.

    Reads the notebook-level frames `X` (features) and `data` (for the
    'Churn' column). Customers with Churn == 'Yes' are drawn in red and
    those with Churn == 'No' in blue, in that order, on a new 10x6 figure.

    Parameters
    ----------
    col1 : column name in `X` used for the x axis.
    col2 : column name in `X` used for the y axis.
    """
    plt.figure(figsize=(10, 6))

    # One scatter layer per churn label; order matters for which layer is drawn on top.
    for churn_label, point_color in (('Yes', 'red'), ('No', 'blue')):
        selected = data['Churn'] == churn_label
        plt.scatter(X[col1][selected],
                    X[col2][selected],
                    alpha=0.75,
                    color=point_color,
                    label=churn_label)

    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.legend(loc='best');

In [None]:
# Plot each candidate feature against TotalCharges.
for x_feature in ('tenure', 'MonthlyCharges'):
    plotfeatures(x_feature, 'TotalCharges')

#### The features tenure and TotalCharges are highly correlated (Pearson's coefficient, the default of `DataFrame.corr`, r = 0.82). The features MonthlyCharges and TotalCharges are also highly correlated (r = 0.65). This means we could add a new feature that combines the highly correlated features. We can also check the following: TotalCharges divided by tenure gives approximately MonthlyCharges, which means we can use MonthlyCharges alone.

In [None]:
# Average charge per month of tenure, to compare against the recorded MonthlyCharges.
# NOTE(review): rows with tenure == 0 will not produce a finite value here - confirm that is acceptable.
data['MonthlyCharges_new'] = data['TotalCharges'].div(data['tenure'])

In [None]:
# Re-check the correlations now that MonthlyCharges_new has been added.
# `data` still contains string columns, which recent pandas versions refuse to
# correlate, so restrict to numeric columns explicitly.
data.select_dtypes(include='number').corr()

In [None]:
# List the columns currently present in `data`.
data.columns

#### Split the categorical features into binary and non-binary features. Then recode the binary features to {0, 1}, and encode the non-binary features with the function get_dummies.

In [None]:
# Remove tenure and TotalCharges from the feature matrix; MonthlyCharges is kept.
X = X.drop(columns=['tenure', 'TotalCharges'])

In [None]:
# Number of distinct (non-missing) values per categorical column.
unique_counts = X[categorical_columns].nunique()

# Exactly two levels -> binary; more than two -> needs one-hot encoding.
binary_columns = [c for c in categorical_columns if unique_counts[c] == 2]
nonbinary_columns = [c for c in categorical_columns if unique_counts[c] > 2]

In [None]:
# Categorical columns with more than two distinct values.
nonbinary_columns

In [None]:
# Categorical columns with exactly two distinct values.
binary_columns

In [None]:
# Encode each binary column as integers: the most frequent value becomes 0, the other value 1.
for c in binary_columns:
    # value_counts() is sorted by frequency, so idxmax() is the most common value.
    # Unlike describe()['top'], this also works once the column is numeric,
    # so the cell can be re-run safely.
    top = X[c].value_counts().idxmax()
    top_items = X[c] == top
    # Assign the whole column at once so it gets a real integer dtype instead of
    # staying an object column that merely contains 0/1.
    X[c] = (~top_items).astype(int)
    

In [None]:
# One-hot encode the multi-level categorical columns into their own frame ...
nonbinary_frame = X[nonbinary_columns]
X_dummy = pd.get_dummies(nonbinary_frame)

# ... and remove the original (un-encoded) columns from X.
X = X.drop(columns=nonbinary_columns)

#### Scale numerical feature (MonthlyCharges)

In [None]:
# Standardise MonthlyCharges: subtract the mean and divide by the (pandas) standard deviation.
monthly = X['MonthlyCharges']
X['MonthlyCharges'] = (monthly - monthly.mean()) / monthly.std()

#### Join all features together: numerical and categorical (binary and non-binary)

In [None]:
# Final design matrix: binary + scaled numerical columns alongside the one-hot columns.
X_full = pd.concat((X, X_dummy), axis=1)

# Integer target: 'No' -> 0, 'Yes' -> 1.
# Built with .map() rather than `.at[...]`, which is a scalar accessor and must not be
# given a boolean mask. `data['Churn']` is deliberately left untouched so cells that
# compare it with 'Yes'/'No' (e.g. plotfeatures) keep working when re-run.
# astype(int) fails loudly if any label other than 'No'/'Yes' is present.
y = data['Churn'].map({'No': 0, 'Yes': 1}).astype(int)

In [None]:
# Preview the first four rows of the assembled design matrix.
X_full.head(4)

#### Cross-validation with 5-fold KFold

In [None]:
# 5-fold splitter with shuffling. A fixed seed makes the folds reproducible and
# identical every time the splitter is used, so scores from separate
# cross-validation calls are computed on the same partitions.
cv = KFold(n_splits=5, shuffle=True, random_state=11)

#### Build the logistic regression model and evaluate it with different metrics

In [None]:

scoring = ['f1', 'precision', 'recall', 'roc_auc']

# Evaluate all metrics in one cross-validation pass: every metric is computed on the
# same fitted models and the same folds, and the model is fitted 5 times instead of 20.
lr = linear_model.LogisticRegression()
lr_results = cross_validate(lr, X_full, y, scoring=scoring, cv=cv)

for score in scoring:
    # cross_validate stores per-fold test scores under 'test_<metric>'.
    scores = np.mean(lr_results['test_{}'.format(score)])

    print('{} score: {}'.format(score, scores))


#### RandomForest

In [None]:
# RandomForest can give important features. Then, this important features can use for a new model

scoring = ['f1', 'precision', 'recall', 'roc_auc']

# Evaluate all metrics in one cross-validation pass so they share the same folds and
# fitted forests (cross_validate works on clones, so `rf` itself stays unfitted).
rf = ensemble.RandomForestClassifier(n_estimators=100, random_state=11)
rf_results = cross_validate(rf, X_full, y, scoring=scoring, cv=cv)

for score in scoring:
    # cross_validate stores per-fold test scores under 'test_<metric>'.
    scores = np.mean(rf_results['test_{}'.format(score)])

    print('{} score: {}'.format(score, scores))

#### Here we can find out which features are important

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y,
    test_size=0.3,
    random_state=11,
)

In [None]:
# Build the forest here instead of relying on the `rf` variable left over from the
# cross-validation cell, so this cell does not depend on that loop having run.
rf = ensemble.RandomForestClassifier(n_estimators=100, random_state=11)
rf.fit(X_train, y_train)
importances = rf.feature_importances_
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Show at most 30 features; capping at the number available keeps the bar
# positions and bar heights the same length when there are fewer columns.
d_first = min(30, len(indices))
plt.figure(figsize=(8, 8))
plt.title("Feature importances")
plt.bar(range(d_first), importances[indices[:d_first]], align='center')
plt.xticks(range(d_first), np.array(X_full.columns)[indices[:d_first]], rotation=90)
plt.xlim([-1, d_first]);

In [None]:
# Positions of the 15 highest-ranked features, then their column names.
best_features = indices[:15]
best_features_names = X_full.columns.take(best_features)
print(best_features_names)

#### Create a new RandomForest model with the 15 most important features

In [None]:
# Restrict both splits to the selected features.
X_train_best = X_train[best_features_names]
X_test_best = X_test[best_features_names]

# NOTE: despite its name, `gbt` is a random forest (same settings as the full model).
gbt = ensemble.RandomForestClassifier(random_state=11, n_estimators=100)
gbt.fit(X_train_best, y_train)

# Accuracy on the hold-out set: share of test rows predicted correctly.
test_predictions = gbt.predict(X_test_best)
quality = (y_test == test_predictions).mean()
print(quality)