In [30]:
#Data Analysis and Wrangling
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
#Azure Usage
from azureml import Workspace

In [31]:
#Acquite Data
ws = Workspace()
trainds = ws.datasets['TeleTrain.csv']
#testds  = ws.datasets['TeleTest.csv']
trainall = trainds.to_dataframe()
#The last 450 rows of data are split off into a validation dataset
traindf = trainall.head(1350)
trainvl = trainall.tail(49)

In [32]:
#Analyze Data Column Headers
print(traindf.columns.values)
traindf.info()

['Customer ID.2' 'network_age' 'Customer tenure in month'
 'Total Spend in Months 1 and 2 of 2017' 'Total SMS Spend'
 'Total Data Spend' 'Total Data Consumption' 'Total Unique Calls'
 'Total Onnet spend ' 'Total Offnet spend'
 'Total Call centre complaint calls' 'Network type subscription in Month 1'
 'Network type subscription in Month 2'
 'Most Loved Competitor network in in Month 1'
 'Most Loved Competitor network in in Month 2' 'Churn Status']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1350 entries, 0 to 1349
Data columns (total 16 columns):
Customer ID.2                                  1350 non-null int64
network_age                                    1350 non-null int64
Customer tenure in month                       1350 non-null float64
Total Spend in Months 1 and 2 of 2017          1350 non-null float64
Total SMS Spend                                1350 non-null float64
Total Data Spend                               1350 non-null float64
Total Data Consumption         

In [33]:
#Prepare training data
train_data = pd.get_dummies(traindf)
train_data

Unnamed: 0,Customer ID.2,network_age,Customer tenure in month,Total Spend in Months 1 and 2 of 2017,Total SMS Spend,Total Data Spend,Total Data Consumption,Total Unique Calls,Total Onnet spend,Total Offnet spend,Total Call centre complaint calls,Network type subscription in Month 1,Network type subscription in Month 2,Most Loved Competitor network in in Month 1,Most Loved Competitor network in in Month 2,Churn Status
0,39,123,4.10,76.7140,0.00,1.25,1.495100e+00,14,564,6408,2,2,2,4,1,0
1,41,1316,43.87,98.8904,4.14,1.25,1.024400e+00,27,1626,4373,1,2,2,4,1,0
2,51,2385,79.50,372.6300,3.51,1.25,6.089800e+00,119,10411,22039,1,2,2,5,1,0
3,59,1614,53.80,530.2000,6.85,1.25,3.344700e+00,69,6822,6274,1,2,2,7,1,0
4,60,2175,72.50,554.1300,6.99,153.75,6.732040e+05,33,3387,7241,1,2,3,5,1,0
5,66,158,5.27,792.1144,28.63,108.75,1.077136e+05,101,7296,16636,1,2,2,4,1,0
6,68,110,3.67,1090.5000,1.75,0.00,1.491020e+01,303,22176,71699,2,2,2,6,1,0
7,72,558,18.60,29.5464,7.17,0.00,1.915330e+01,4,12,444,1,2,2,4,1,1
8,79,115,3.83,62.2520,6.76,11.25,2.013527e+05,13,0,3563,1,2,3,1,1,1
9,85,1868,62.27,120.1204,23.49,31.25,4.401846e+02,23,0,4254,1,2,2,1,1,1


In [34]:
#Set the target variable
y = targets = labels = traindf["Churn Status"].values
#Identify the feature variables
columns = ["Customer ID.2", "network_age", "Customer tenure in month", "Total Spend in Months 1 and 2 of 2017", "Total SMS Spend",
           "Total Data Spend", "Total Data Consumption", "Total Unique Calls", "Total Onnet spend ", "Total Offnet spend",
           "Total Call centre complaint calls", "Network type subscription in Month 1", "Network type subscription in Month 2",
           "Most Loved Competitor network in in Month 1", "Most Loved Competitor network in in Month 2"]
features = traindf[list(columns)].values

In [35]:
#Impute the features
imp = Imputer(axis=0)
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.18, random_state=1)
#X = imp.fit_transform(features)
#X

In [36]:
#Define the Decision Tree Model
my_tree_one = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)

In [37]:
#Train the Decision Tree Classifier model
my_tree_one = my_tree_one.fit(X_train, y_train)

In [38]:
#The feature_importances_ attribute make it simple to interpret the significance of the predictors you include
print(my_tree_one.feature_importances_) 
print(my_tree_one.score(X_test, y_test))

[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
0.934156378601


In [39]:
#Prepare validation dataset
trainvl = pd.get_dummies(trainvl)

In [40]:
#Set the test file by choosing the features and imputing it.
features_test = trainvl[list(columns)].values
imp_test = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_testd = imp_test.fit_transform(features_test)
X_testd

array([[  1.40000000e+03,   1.93000000e+03,   6.43300000e+01,
          9.81190000e+02,   2.71200000e+01,   1.12500000e+01,
          2.40840800e+02,   1.90000000e+02,   2.14840000e+04,
          1.56900000e+04,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   4.00000000e+00,   4.00000000e+00],
       [  1.40100000e+03,   2.96600000e+03,   9.88700000e+01,
          9.85410000e+02,   1.57500000e+01,   4.12500000e+01,
          2.01345662e+06,   1.74000000e+02,   6.47800000e+03,
          2.91140000e+04,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   4.00000000e+00],
       [  1.42600000e+03,   1.33000000e+02,   4.43000000e+00,
          1.04486000e+03,   1.92500000e+01,   6.25000000e+00,
          4.64449696e+05,   1.61000000e+02,   2.74800000e+03,
          2.20800000e+03,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   3.00000000e+00,   4.00000000e+00],
       [  1.45100000e+03,   8.34000000e+02,   2.78000000e+01,
     

In [41]:
#Do the prediction on the test file with the decision tree
pred = my_tree_one.predict(X_testd)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1])

In [42]:
trainvl['Predicted'] = pred
trainvl

Unnamed: 0,Customer ID.2,network_age,Customer tenure in month,Total Spend in Months 1 and 2 of 2017,Total SMS Spend,Total Data Spend,Total Data Consumption,Total Unique Calls,Total Onnet spend,Total Offnet spend,Total Call centre complaint calls,Network type subscription in Month 1,Network type subscription in Month 2,Most Loved Competitor network in in Month 1,Most Loved Competitor network in in Month 2,Churn Status,Predicted
1351,1400,1930,64.33,981.19,27.12,11.25,240.8408,190,21484,15690,1,0,0,4,4,0,0
1352,1401,2966,98.87,985.41,15.75,41.25,2013457.0,174,6478,29114,1,0,0,1,4,0,0
1353,1426,133,4.43,1044.86,19.25,6.25,464449.7,161,2748,2208,1,0,0,3,4,0,0
1354,1451,834,27.8,1091.16,0.0,2.5,353192.4,680,306,14438,1,0,0,1,4,0,0
1355,1454,856,28.53,1099.2964,2.95,2.5,117901.7,288,9288,13159,1,0,0,4,4,0,0
1356,1467,4131,137.7,1130.03,11.96,1.25,1759968.0,1002,0,179,1,0,0,5,4,0,0
1357,1468,418,13.93,1132.1364,66.17,31.25,2477682.0,686,4176,26548,1,0,0,4,4,0,0
1358,1472,706,23.53,1141.408,65.09,27.5,457845.2,105,2262,5888,1,0,0,4,4,0,0
1359,1483,126,4.2,1165.5608,873.98,3.75,26.582,39,17904,1101,1,0,3,4,4,0,0
1360,1509,185,6.17,1227.49,12.25,46.25,837078.6,339,21367,9036,1,0,0,5,4,0,0


In [43]:
#matrices and score of the current decision tree
pred = my_tree_one.predict(X_train)
df_confusion = metrics.confusion_matrix(y_train, pred)
df_confusion

array([[531,  28],
       [ 58, 490]])

In [44]:
print(my_tree_one.score(X_test,y_test))

0.934156378601


In [45]:
#Create another decision tree and this time change the max depth to 10 and the min samples split to 5
#Setting "max_depth" to 10 and "min_samples_split" to 5 : my_tree_two
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1)
my_tree_two = my_tree_two.fit(X_train, y_train)

#Print the score of the new decison tree
print(my_tree_two.score(X_test, y_test))

0.991769547325


In [46]:
#The confusion matrix of the second decision tree
pred2 = my_tree_two.predict(X_train)
df_confusion = metrics.confusion_matrix(y_train, pred2)
df_confusion

array([[559,   0],
       [  0, 548]])

In [47]:
pred2 = my_tree_one.predict(X_testd)
trainvl['Predicted2'] = pred2
trainvl

Unnamed: 0,Customer ID.2,network_age,Customer tenure in month,Total Spend in Months 1 and 2 of 2017,Total SMS Spend,Total Data Spend,Total Data Consumption,Total Unique Calls,Total Onnet spend,Total Offnet spend,Total Call centre complaint calls,Network type subscription in Month 1,Network type subscription in Month 2,Most Loved Competitor network in in Month 1,Most Loved Competitor network in in Month 2,Churn Status,Predicted,Predicted2
1351,1400,1930,64.33,981.19,27.12,11.25,240.8408,190,21484,15690,1,0,0,4,4,0,0,0
1352,1401,2966,98.87,985.41,15.75,41.25,2013457.0,174,6478,29114,1,0,0,1,4,0,0,0
1353,1426,133,4.43,1044.86,19.25,6.25,464449.7,161,2748,2208,1,0,0,3,4,0,0,0
1354,1451,834,27.8,1091.16,0.0,2.5,353192.4,680,306,14438,1,0,0,1,4,0,0,0
1355,1454,856,28.53,1099.2964,2.95,2.5,117901.7,288,9288,13159,1,0,0,4,4,0,0,0
1356,1467,4131,137.7,1130.03,11.96,1.25,1759968.0,1002,0,179,1,0,0,5,4,0,0,0
1357,1468,418,13.93,1132.1364,66.17,31.25,2477682.0,686,4176,26548,1,0,0,4,4,0,0,0
1358,1472,706,23.53,1141.408,65.09,27.5,457845.2,105,2262,5888,1,0,0,4,4,0,0,0
1359,1483,126,4.2,1165.5608,873.98,3.75,26.582,39,17904,1101,1,0,3,4,4,0,0,0
1360,1509,185,6.17,1227.49,12.25,46.25,837078.6,339,21367,9036,1,0,0,5,4,0,0,0


In [48]:
# Building and fitting Random Forest Classifier
forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = 100, random_state = 1)
my_forest = forest.fit(X_train, y_train)

# Print the score of the fitted random forest
print(my_forest.score(X_test, y_test))

0.962962962963


In [49]:
pred3 = forest.predict(X_testd)
trainvl['Predicted3'] = pred3
trainvl

Unnamed: 0,Customer ID.2,network_age,Customer tenure in month,Total Spend in Months 1 and 2 of 2017,Total SMS Spend,Total Data Spend,Total Data Consumption,Total Unique Calls,Total Onnet spend,Total Offnet spend,Total Call centre complaint calls,Network type subscription in Month 1,Network type subscription in Month 2,Most Loved Competitor network in in Month 1,Most Loved Competitor network in in Month 2,Churn Status,Predicted,Predicted2,Predicted3
1351,1400,1930,64.33,981.19,27.12,11.25,240.8408,190,21484,15690,1,0,0,4,4,0,0,0,0
1352,1401,2966,98.87,985.41,15.75,41.25,2013457.0,174,6478,29114,1,0,0,1,4,0,0,0,0
1353,1426,133,4.43,1044.86,19.25,6.25,464449.7,161,2748,2208,1,0,0,3,4,0,0,0,0
1354,1451,834,27.8,1091.16,0.0,2.5,353192.4,680,306,14438,1,0,0,1,4,0,0,0,0
1355,1454,856,28.53,1099.2964,2.95,2.5,117901.7,288,9288,13159,1,0,0,4,4,0,0,0,0
1356,1467,4131,137.7,1130.03,11.96,1.25,1759968.0,1002,0,179,1,0,0,5,4,0,0,0,0
1357,1468,418,13.93,1132.1364,66.17,31.25,2477682.0,686,4176,26548,1,0,0,4,4,0,0,0,0
1358,1472,706,23.53,1141.408,65.09,27.5,457845.2,105,2262,5888,1,0,0,4,4,0,0,0,0
1359,1483,126,4.2,1165.5608,873.98,3.75,26.582,39,17904,1101,1,0,3,4,4,0,0,0,0
1360,1509,185,6.17,1227.49,12.25,46.25,837078.6,339,21367,9036,1,0,0,5,4,0,0,0,0


In [50]:
#Acquire and Prepare Test Dataset and add an empty medv column
testds  = ws.datasets['TeleTest.csv']
testdf = testds.to_dataframe()
testdf['Churn Status'] = 0
print(testdf.columns.values)

['Customer ID.2' 'network_age' 'Customer tenure in month'
 'Total Spend in Months 1 and 2 of 2017' 'Total SMS Spend'
 'Total Data Spend' 'Total Data Consumption' 'Total Unique Calls'
 'Total Onnet spend ' 'Total Offnet spend'
 'Total Call centre complaint calls' 'Network type subscription in Month 1'
 'Network type subscription in Month 2'
 'Most Loved Competitor network in in Month 1'
 'Most Loved Competitor network in in Month 2' 'Churn Status']


In [51]:
features_test = testdf[list(columns)].values
features_test

array([[  1.00000000e+00,   1.01200000e+03,   3.37333333e+01, ...,
          3.00000000e+00,   5.00000000e+00,   1.00000000e+00],
       [  2.00000000e+00,   1.37000000e+02,   4.56666667e+00, ...,
          3.00000000e+00,   3.00000000e+00,   1.00000000e+00],
       [  3.00000000e+00,   2.02000000e+02,   6.73333333e+00, ...,
          3.00000000e+00,   3.00000000e+00,   1.00000000e+00],
       ..., 
       [  1.95100000e+03,   4.11000000e+02,   1.37000000e+01, ...,
          3.00000000e+00,   3.00000000e+00,   6.00000000e+00],
       [  1.95200000e+03,   8.12000000e+02,   2.70666667e+01, ...,
          2.00000000e+00,   2.00000000e+00,   6.00000000e+00],
       [  1.95300000e+03,   5.49000000e+02,   1.83000000e+01, ...,
          2.00000000e+00,   6.00000000e+00,   6.00000000e+00]])

In [52]:
#Run the model on the test data on the best of the three models that were trained above
test_vals = my_tree_two.predict(features_test)
test_vals

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [53]:
#Add the predicted column and delete the redundant initialized 'Churn Status' column
testdf['PredictedChurn'] = test_vals
del testdf['Churn Status']

In [54]:
testdf

Unnamed: 0,Customer ID.2,network_age,Customer tenure in month,Total Spend in Months 1 and 2 of 2017,Total SMS Spend,Total Data Spend,Total Data Consumption,Total Unique Calls,Total Onnet spend,Total Offnet spend,Total Call centre complaint calls,Network type subscription in Month 1,Network type subscription in Month 2,Most Loved Competitor network in in Month 1,Most Loved Competitor network in in Month 2,PredictedChurn
0,1,1012,33.733333,1560.1156,14.04,226.30,3.126093e+06,64,14503,12402,1,3,3,5,1,0
1,2,137,4.566667,1705.8368,73.63,5.00,1.628809e+02,515,29832,88953,9,3,3,3,1,0
2,3,202,6.733333,1720.3008,4.36,245.00,3.334607e+06,163,228,1495,1,3,3,3,1,0
3,4,165,5.500000,1732.2800,0.00,12.50,6.682924e+05,484,0,71928,2,3,3,4,1,0
4,5,759,25.300000,1873.0600,69.57,412.50,6.666975e+06,64,7616,20753,2,3,3,1,1,0
5,6,129,4.300000,1877.2600,46.11,343.75,3.387698e+06,121,0,20944,1,3,3,2,1,0
6,7,148,4.933333,2388.8208,17.99,1.25,1.291677e+05,830,0,93461,1,0,3,7,1,0
7,8,1144,38.133333,2702.3500,47.84,48.75,3.390062e+03,431,60102,131525,1,3,3,1,1,0
8,9,137,4.566667,2782.5572,179.52,315.00,3.632011e+06,622,0,40724,2,3,3,5,1,0
9,10,1456,48.533333,2934.7028,200.37,108.75,4.063889e+04,892,16572,150693,1,4,4,2,1,0
