# **ENSEMBLE LEARNING**
## FACEBOOK DEVELOPERS CIRCLE, NCR
### SHREY BATRA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## **Churn dataset**
Churn rate is known as the rate of customers who discontinue their subscription with our business. This dataset gives the information of customers of a telecom business along with their status of **Churned or not**?

In [2]:
churn_df = pd.read_csv('/home/shreybatra/Downloads/churndataset.csv')

In [3]:
churn_df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


## Analysing dataset

In [4]:
churn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
state                     3333 non-null object
account length            3333 non-null int64
area code                 3333 non-null int64
phone number              3333 non-null object
international plan        3333 non-null object
voice mail plan           3333 non-null object
number vmail messages     3333 non-null int64
total day minutes         3333 non-null float64
total day calls           3333 non-null int64
total day charge          3333 non-null float64
total eve minutes         3333 non-null float64
total eve calls           3333 non-null int64
total eve charge          3333 non-null float64
total night minutes       3333 non-null float64
total night calls         3333 non-null int64
total night charge        3333 non-null float64
total intl minutes        3333 non-null float64
total intl calls          3333 non-null int64
total intl charge         3333 non-null float64

In [5]:
churn_df.churn.value_counts()

False    2850
True      483
Name: churn, dtype: int64

## Pre Processing, cleaning of data

**User defined function to map yes and no to 1 and 0.**

In [6]:
def change_yes_no(cols):
    if cols=='yes':
        return 1
    return 0

In [7]:
churn_df['international plan'] = churn_df['international plan'].apply(change_yes_no)

In [8]:
churn_df['voice mail plan'] = churn_df['voice mail plan'].apply(change_yes_no)

**User defined function to convert phone number to a single number.**

In [9]:
def conv(cols):
    l = cols.split('-')
    return ''.join(l)

In [10]:
churn_df['phone number'] = churn_df['phone number'].apply(conv)

In [11]:
len(churn_df.state.unique())

51

In [12]:
np_keys = churn_df['state'].unique()

In [13]:
np_values = np.arange(1,52,1)

In [14]:
mp = dict(zip(np_keys,np_values))

In [15]:
print(mp)

{'KY': 45, 'WV': 9, 'AK': 29, 'MS': 47, 'CT': 50, 'MT': 13, 'DE': 35, 'NC': 40, 'MI': 34, 'SC': 22, 'TN': 48, 'TX': 18, 'NM': 42, 'RI': 11, 'OH': 2, 'IN': 10, 'VT': 16, 'OK': 4, 'MD': 30, 'IL': 26, 'IA': 12, 'AL': 5, 'ME': 46, 'WY': 24, 'DC': 44, 'WA': 41, 'SD': 39, 'PA': 49, 'VA': 17, 'KS': 1, 'CO': 20, 'NH': 27, 'CA': 37, 'LA': 8, 'MO': 7, 'HI': 25, 'AZ': 21, 'GA': 28, 'NJ': 3, 'NE': 23, 'ID': 15, 'AR': 31, 'MA': 6, 'MN': 38, 'ND': 51, 'OR': 33, 'FL': 19, 'WI': 32, 'NY': 14, 'NV': 43, 'UT': 36}


In [16]:
churn_df['state'] = churn_df['state'].apply(lambda x: mp.get(x))

In [17]:
churn_df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,1,128,415,3824657,0,1,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,2,107,415,3717191,0,1,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,3,137,415,3581921,0,0,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,2,84,408,3759999,1,0,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,4,75,415,3306626,1,0,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


## Splitting dataset into features and labels along with training and testing data.

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X = churn_df.drop(['churn','phone number'], axis=1)
y = churn_df['churn']

In [20]:
X.head()

Unnamed: 0,state,account length,area code,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls
0,1,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,2,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
2,3,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
3,2,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
4,4,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3


In [21]:
y.head()

0    False
1    False
2    False
3    False
4    False
Name: churn, dtype: bool

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Decision Tree

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
dt_model = DecisionTreeClassifier()

In [25]:
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [26]:
dt_predictions = dt_model.predict(X_test)

## Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
rf_model = RandomForestClassifier()

In [29]:
rf_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
rf_predictions = rf_model.predict(X_test)

## Gradient Boosting

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

In [32]:
gb_model = GradientBoostingClassifier(max_depth=5, verbose=True)

In [33]:
gb_model.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.7000            1.83s
         2           0.6256            1.96s
         3           0.5719            2.18s
         4           0.5290            2.31s
         5           0.4925            2.13s
         6           0.4581            1.98s
         7           0.4328            1.83s
         8           0.4091            1.67s
         9           0.3890            1.58s
        10           0.3719            1.50s
        20           0.2730            1.03s
        30           0.2195            0.83s
        40           0.1784            0.67s
        50           0.1380            0.54s
        60           0.1174            0.41s
        70           0.0990            0.29s
        80           0.0894            0.19s
        90           0.0773            0.09s
       100           0.0655            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0,
              verbose=True, warm_start=False)

In [34]:
gb_predictions = gb_model.predict(X_test)

## Evaluation of models

In [35]:
from sklearn.metrics import classification_report

**Decision Tree Report**

In [36]:
print(classification_report(dt_predictions,y_test))

             precision    recall  f1-score   support

      False       0.94      0.96      0.95       918
       True       0.76      0.67      0.71       182

avg / total       0.91      0.91      0.91      1100



**Random Forest Report**

In [37]:
print(classification_report(rf_predictions,y_test))

             precision    recall  f1-score   support

      False       1.00      0.94      0.97       999
       True       0.62      0.98      0.76       101

avg / total       0.96      0.94      0.95      1100



**Gradient Boosting Report**

In [38]:
print(classification_report(gb_predictions,y_test))

             precision    recall  f1-score   support

      False       0.99      0.96      0.97       973
       True       0.73      0.92      0.82       127

avg / total       0.96      0.95      0.95      1100

