In [6]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

User uploaded file "WA_Fn-UseC_-Telco-Customer-Churn.csv" with length 977501 bytes


Importing the file

In [7]:
import pandas as pd
import io
import numpy as np

df= pd.read_csv(io.StringIO(uploaded['WA_Fn-UseC_-Telco-Customer-Churn.csv'].decode('utf-8')))

df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Identifying datatypes and saving into arrays(Categorical/Numerical)

In [0]:
df = df.drop(['customerID'],axis=1)
vars=df.dtypes
categorical=[]
numeric=[]
for i in range(0,len(vars)):
    if vars[i]=="object": 
        categorical.append(df.columns[i])
    else:
        numeric.append(df.columns[i]) 

Label Encoding of categorical variables

In [0]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

df1=df[categorical].apply(LabelEncoder().fit_transform)
df2=df[numeric]
df3=pd.concat([df1, df2], axis=1)
df3['TotalCharges']=df3['TotalCharges'].astype(float)



Splitting the data in train and test

In [0]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df3, test_size=0.2)
Churn_X =train['Churn'] 
train = train.drop(['Churn'],axis=1)
Churn_Y =test['Churn'] 
test = test.drop(['Churn'],axis=1)


Xgboost Run


In [0]:
# https://github.com/dmlc/xgboost
# This specific version is a work-around for a build issue in newer versions.
!pip install -q xgboost==0.4a30
import xgboost

In [0]:

import xgboost as xgb
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(train,Churn_X)
predictions = gbm.predict(test)
from sklearn.metrics import accuracy_score
accuracy_score(Churn_Y, predictions)

Catboost Run

In [0]:
!pip install -q catboost
import catboost
!pip install -q ipywidgets
import ipywidgets


In [0]:
categorical_features_indices = np.where(train.dtypes != np.float)[0]
categorical_features_indices

In [0]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=1200, learning_rate=0.02, depth=7, loss_function='Logloss', eval_metric='AUC', random_seed=99, od_type='Iter', od_wait=100) 
model.fit(train, Churn_X,cat_features=categorical_features_indices,eval_set=(test, Churn_Y),plot=True)

In [0]:
preds_class = model.predict(test)
preds_proba = model.predict_proba(test)
print("class = ", preds_class)
print("proba = ", preds_proba)
from sklearn.metrics import accuracy_score
accuracy_score(Churn_Y, preds_class)

LiteGBM Run

In [0]:
!pip install -q lightgbm
import lightgbm

In [0]:
from lightgbm import LGBMClassifier
model = LGBMClassifier(boosting_type='gbdt', objective='binary',
                       num_class=1,early_stopping = 50,num_iteration=10000,num_leaves=31,
                       is_enable_sparse='true',tree_learner='data',min_data_in_leaf=400,max_depth=8,
                       learning_rate=0.1, n_estimators=100, max_bin=255, subsample_for_bin=50000, 
                       min_split_gain=5, min_child_weight=5, min_child_samples=10, subsample=0.995, 
                       subsample_freq=1, colsample_bytree=1, reg_alpha=0, 
                       reg_lambda=0, seed=0, nthread=-1, silent=True)

In [35]:
model.fit(train, Churn_X, eval_set=[(test, Churn_Y)],eval_metric='l1',
        early_stopping_rounds=5)




[1]	valid_0's l1: 0.478227
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l1: 0.459088
[3]	valid_0's l1: 0.441806
[4]	valid_0's l1: 0.426109
[5]	valid_0's l1: 0.412026
[6]	valid_0's l1: 0.39882
[7]	valid_0's l1: 0.387549
[8]	valid_0's l1: 0.377056
[9]	valid_0's l1: 0.367672
[10]	valid_0's l1: 0.359733
[11]	valid_0's l1: 0.35159
[12]	valid_0's l1: 0.344207
[13]	valid_0's l1: 0.338304
[14]	valid_0's l1: 0.332377
[15]	valid_0's l1: 0.326728
[16]	valid_0's l1: 0.321894
[17]	valid_0's l1: 0.317462
[18]	valid_0's l1: 0.313501
[19]	valid_0's l1: 0.309561
[20]	valid_0's l1: 0.305935
[21]	valid_0's l1: 0.302916
[22]	valid_0's l1: 0.299706
[23]	valid_0's l1: 0.296539
[24]	valid_0's l1: 0.293983
[25]	valid_0's l1: 0.291652
[26]	valid_0's l1: 0.289648
[27]	valid_0's l1: 0.287891
[28]	valid_0's l1: 0.286073
[29]	valid_0's l1: 0.284406
[30]	valid_0's l1: 0.282737
[31]	valid_0's l1: 0.281231
[32]	valid_0's l1: 0.280215
[33]	valid_0's l1: 0.279073
[34]	valid_0's l1: 0.2778

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1,
        early_stopping=50, is_enable_sparse='true', learning_rate=0.1,
        max_bin=255, max_depth=8, min_child_samples=10, min_child_weight=5,
        min_data_in_leaf=400, min_split_gain=5, n_estimators=100,
        n_jobs=-1, nthread=-1, num_class=1, num_iteration=10000,
        num_leaves=31, objective='binary', random_state=None, reg_alpha=0,
        reg_lambda=0, seed=0, silent=True, subsample=0.995,
        subsample_for_bin=50000, subsample_freq=1, tree_learner='data')

In [36]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
y_pred = model.predict(test, num_iteration=model.best_iteration_)
print('The rmse of prediction is:', mean_squared_error(Churn_Y, y_pred) ** 0.5)

The rmse of prediction is: 0.42708180064580986


  if diff:


In [24]:
print('Feature importances:', list(model.best_score_))


Feature importances: ['valid_0']


In [33]:
import lightgbm as lgb

print('Feature importances:', list(model.feature_importances_))

# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40,100,600,700],
    'num_leaves': [31, 50,100,20,150],
    'min_data_in_leaf': [600,300,400],
    'max_depth':[4,6,8]
}

gbm = GridSearchCV(estimator, param_grid)

gbm.fit(train, Churn_X)

print('Best parameters found by grid search are:', gbm.best_params_)

Feature importances: [0, 0, 0, 0, 6, 7, 11, 5, 1, 9, 1, 3, 43, 11, 13, 8, 6, 55, 48]
Best parameters found by grid search are: {'learning_rate': 0.1, 'max_depth': 8, 'min_data_in_leaf': 400, 'n_estimators': 100, 'num_leaves': 31}
