# Consolidated workflow (continued)

Import initial libraries and data file. N.B. The data being used read in here is already numercised for use in sklearn's decision trees and random forests

## Tanzania - data analysis

In [182]:
# import libraries and data
import numpy as np
import pandas as pd
import matplotlib as plot
import seaborn as sns

# check select_data
df = pd.read_csv('/Users/RAhmed/data store/Wesleyan_Capstone/all_numeric201808292240.csv')
df.shape

(59400, 24)

In [183]:
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics

Sklearn's decision trees and random forests need categorical data changed into integers. See, e.g,: https://stackoverflow.com/questions/38108832/passing-categorical-data-to-sklearn-decision-tree

The above data set has already had this done.

In [184]:
df.columns

Index(['id', 'date_recorded', 'season_recorded', 'gps_height', 'installer',
       'longitude', 'latitude', 'basin', 'region_code', 'population',
       'public_meeting', 'scheme_management', 'permit', 'wpt_age',
       'construction_year', 'extraction_type_group', 'management_group',
       'payment_type', 'water_quality', 'quantity_group', 'source_type',
       'source_class', 'waterpoint_type_group', 'status_group'],
      dtype='object')

In [185]:
chosen_predictors = ['season_recorded', 'gps_height', 'installer', 'basin', 'region_code', 'population',
                     'public_meeting', 'scheme_management', 'permit', 'wpt_age','construction_year', 
                     'extraction_type_group', 'management_group', 'payment_type', 'water_quality', 
                     'quantity_group', 'source_type', 'waterpoint_type_group'
                    ]

predictors = df[chosen_predictors]
targets = df['status_group']

# train/test split
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.2)

print("pred_train.shape:", pred_train.shape)
print("pred_test.shape:", pred_test.shape)
print("tar_train.shape:", tar_train.shape)
print("tar_test.shape:", tar_test.shape)

pred_train.shape: (47520, 18)
pred_test.shape: (11880, 18)
tar_train.shape: (47520,)
tar_test.shape: (11880,)


Build a classifier model with a decision tree, on train set

In [194]:
# Build model on training data; initiate classifier from sklearn, then fit it with the training data
classifier=DecisionTreeClassifier(random_state=2)
classifier=classifier.fit(pred_train,tar_train)

# predict for the test values and create confusion matrix
predictions=classifier.predict(pred_test)
print("Confusion matrix:")
print(sklearn.metrics.confusion_matrix(tar_test,predictions))
print("Accuracy score:")
print(sklearn.metrics.accuracy_score(tar_test, predictions))

Confusion matrix:
[[5236  339  890]
 [ 337  370  149]
 [ 902  175 3482]]
Accuracy score:
0.764983164983165


Trying again, to play with parameters, experiment, etc.

In [195]:
# Playing with hyper-parameters. Also have varied the wording of the code from that in the MOOC. 
# (Just to check all in order with exceptionally high result.)
# train/test split
from sklearn.utils import shuffle
df = shuffle(df)
X_train, X_test, y_train, y_test = train_test_split(predictors[:], targets[:], test_size=.2)

# Build model on training data; initiate classifier from sklearn, then fit it with the training data
model = DecisionTreeClassifier(random_state=2)
model.fit(X_train, y_train)

y_predict = model.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.7574074074074074

For confusion matrix, N.B. that:

functional = 0

functional_needs_repair = 1

non-functional = 2

In [196]:
from sklearn.metrics import confusion_matrix

pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns=['Pred functional', 'Predicted func_need_repair', 'Predicted non_func'],
    index=['True functional', 'True func_need_repair', 'True non_func']
)

Unnamed: 0,Pred functional,Predicted func_need_repair,Predicted non_func
True functional,5145,366,935
True func_need_repair,330,374,170
True non_func,916,165,3479


In [197]:
# reprint for convenience
accuracy_score(y_test, y_predict)

0.7574074074074074

Key ingredient: see which features are most important

In [198]:
importance = (dict(zip(X_train, model.feature_importances_)))
sorted_items = sorted(importance.items(), key = lambda x: x[1], reverse=True)
sorted_items

[('gps_height', 0.2085270273956416),
 ('wpt_age', 0.15943732041940392),
 ('quantity_group', 0.1588198942795139),
 ('waterpoint_type_group', 0.07967451106317347),
 ('installer', 0.059632524694422505),
 ('payment_type', 0.04388811116765172),
 ('region_code', 0.04010045007609793),
 ('population', 0.03686967147190676),
 ('scheme_management', 0.03171584234250596),
 ('construction_year', 0.030405240903152476),
 ('source_type', 0.030266771775412184),
 ('extraction_type_group', 0.030154357826475017),
 ('basin', 0.02531031686547665),
 ('season_recorded', 0.01543288409165785),
 ('water_quality', 0.015221474259669316),
 ('permit', 0.013251624844287048),
 ('public_meeting', 0.01105365537080478),
 ('management_group', 0.010238321152746824)]

## Visualise the Decision Tree

It turns out that visualising the decision tree is quite straight forward.

There are two things one can initially do: limit or not limit maximum depth as folling example:
model = DecisionTreeClassifier(max_depth=None)

As below, push the image out to a tree.dot file. Find it on your computer amd then cut and paste the code into a web viewer/converter. See, e.g.:
http://www.ilovefreesoftware.com/03/featured/free-online-dot-to-png-converter-websites.html https://dreampuf.github.io/GraphvizOnline/

Right-clicking the image in GraphvizOnline, I could save the png to desktop.

If you limit maximum depth, your decision tree may be small enough to fit on one page. Else, cut and paste a number of lines of code from the tree.dot file, making sure you finish as follows in the online converter, e.g.:

...
44 -> 54 ; 
}

If you do limit the max depth the tree will be different, of course, than a full tree.

In [199]:
from sklearn import tree

# visualise
tree.export_graphviz(model, out_file='tree.dot') 
# cut and paste the tree.dot file info into webgraphviz.com in a browser. 
# (You can paste as many layers of the tree as you want. See notes above.)

In [200]:
# save to a file
# df.to_csv('/Users/RAhmed/data store/Wesleyan_Capstone/all_numeric201808292240.csv', index=False)

## Quick Aside - do KNN approach

In [201]:
## Import the Classifier.
from sklearn.neighbors import KNeighborsClassifier
## Instantiate the model with 5 neighbors. 
knn_list = []
for i in range(1,15):
    knn = KNeighborsClassifier(n_neighbors=i)
    ## Fit the model on the training data.
    knn.fit(X_train, y_train)
    ## See how the model performs on the test data.
    knn.score(X_test, y_test)
    ## Append to list.
    knn_list.append({i: knn.score(X_test, y_test)})
knn_list

[{1: 0.6785353535353535},
 {2: 0.6822390572390572},
 {3: 0.6925084175084175},
 {4: 0.6855218855218855},
 {5: 0.6868686868686869},
 {6: 0.68493265993266},
 {7: 0.6835858585858586},
 {8: 0.6805555555555556},
 {9: 0.6845959595959596},
 {10: 0.6808080808080809},
 {11: 0.67996632996633},
 {12: 0.6792087542087543},
 {13: 0.6775252525252525},
 {14: 0.6772727272727272}]

## Do a global cross-validation approach on Decision Tree

https://chrisalbon.com/machine_learning/model_evaluation/cross-validaton/

In [202]:
# With k-fold cross validation. 
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score

model = DecisionTreeClassifier(max_depth=None, random_state=2)

# Create k-Fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=2)

# Do k-fold cross-validation
cv_results = cross_val_score(model, # Classifier
                             predictors, # Feature matrix
                             targets, # Target vector
                             cv=kf, # Cross-validation technique
                             scoring="accuracy", # Loss function
                             n_jobs=-1) # Use all CPU scores

# Calculate mean
cv_results.mean()

0.7565824915824916

## Do a hold out cross-validation approach on Decision Tree

Two differences: 
> i). do cross-validation on train set only (not against all), and predict against test set<br>
> ii). experiment within the decision tree with various parameters

N.B. cross_validation does not give a fitted model. There is a cross_val_predict, but I think it doesn;t test against a seprate hold out set. (?)

In [203]:
# With k-fold cross validation. 
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score


# train/test split
X_train, X_test, y_train, y_test = train_test_split(predictors, targets, test_size=.2)

# Difference for this 
model = DecisionTreeClassifier(max_depth=20, criterion='gini', random_state=2, splitter='random')

# Create k-Fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# Do k-fold cross-validation
cv_results = cross_val_score(model, # Classifier
                             X_train, # Feature matrix
                             y_train, # Target vector
                             cv=kf, # Cross-validation technique
                             scoring="accuracy", # Loss function
                             n_jobs=-1) # Use all CPU scores

# Calculate mean against train set!
print(cv_results.mean())

# So I have tweaked the 'model' (above) parameters to get the best model, 
# and can now test against the hold out set. ((Tweaked) Model has to be fitted)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)


0.7639941077441076


0.7716329966329967

^ above cross validation tweaking has improved accuracy (against hold out) by some 1.5%, great!

## Do Random Forest approach

In [204]:
# import all necessary libraries
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
# feature importance
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier

In [205]:
# run Random Forest
from sklearn.ensemble import RandomForestClassifier
# classifier (sometimes use clf) is initiated
classifier=RandomForestClassifier(n_estimators=25, random_state=2)
# next step trains the model
classifier=classifier.fit(X_train,y_train)
# now we apply the classifier to the test data
predictions=classifier.predict(X_test)

# we look at confusion matrix and accuracy of prediction on test values
print("RANDOM FOREST")
print("Confusion matrix:")
print(sklearn.metrics.confusion_matrix(y_test,predictions))
print("Accuracy score:")
print(sklearn.metrics.accuracy_score(y_test, predictions))
print()
# display the relative importance of each attribute using RandomForestClassifier
# make this more readable by having the names of the predictors and having sorted
zipped = zip(predictors, classifier.feature_importances_)
my_list = list(zipped)
my_list.sort(key=lambda tup: tup[1], reverse=True)
print('RandomForestClassifier relative feature importance:')
for item in my_list:
    print('{0:42} {1:>42}'.format(item[0], item[1]))

# fit an Extra Trees model to the data (instead of Random Forest)
model = ExtraTreesClassifier(random_state=2)
model.fit(X_train,y_train)
predictions=model.predict(X_test)
print()
print("EXTRA TREE")
print("Confusion matrix:")
print(sklearn.metrics.confusion_matrix(y_test,predictions))
print("Accuracy score:")
print(sklearn.metrics.accuracy_score(y_test, predictions))
print()
# make this more readable by having the names of the predictors and having sorted
zipped = zip(predictors, model.feature_importances_)
my_list = list(zipped)
my_list.sort(key=lambda tup: tup[1], reverse=True)
print('ExtraTreesClassifier relative feature importance:')
for item in my_list:
    print('{0:42} {1:>42}'.format(item[0], item[1]))

RANDOM FOREST
Confusion matrix:
[[5654  179  610]
 [ 397  347  133]
 [ 925  101 3534]]
Accuracy score:
0.8026094276094277

RandomForestClassifier relative feature importance:
gps_height                                                        0.18216643858276754
wpt_age                                                           0.15524415631341124
quantity_group                                                    0.13225987564905892
construction_year                                                  0.0700041725575122
installer                                                        0.061719192645792315
waterpoint_type_group                                             0.05854593477272142
extraction_type_group                                            0.047451111461484716
payment_type                                                     0.042760988871183764
region_code                                                       0.04141933999069797
population                                         

## Do Random Forest with cross-validation and hold out set

In [206]:
# Random forest with cross-validation and hold out set
model = RandomForestClassifier(n_estimators=50, criterion='entropy', max_depth=18, 
                               bootstrap=True, oob_score=True, random_state=2)

# Create k-Fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# Do k-fold cross-validation
cv_results = cross_val_score(model, # Classifier
                             X_train, # Feature matrix
                             y_train, # Target vector
                             cv=kf, # Cross-validation technique
                             scoring="accuracy", # Loss function
                             n_jobs=-1) # Use all CPU scores

# Calculate mean against train set!
cv_results.mean()

# Calculate mean against test set
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.8122895622895623

## Results

Random seed set at 2 for all except KNN (where n/a)

| Method | Cross-validation | Hold-out set | Accuracy |
| ---- | ---- | ---- | ---- |
| Naive Bayes | no | yes | 47.7% |
| KNN | no | yes | ~68.1% |
| Decision tree | no | yes | 75.7-76.5% |
| Decision tree | yes | no | 75.7% |
| Decision tree | yes | yes | 77.1% |
| Random forest | no | yes | 80.3% |
| Extra tree | no | yes | 79.0% |
| Random forest | yes | yes | 81.2% |

## How results could be improved
> Frequency encoding of variables;<br>
> Nearest neighbour for construction year?;<br>
> Smaller train set, perhaps model is over-fitted?  (Look how DTree improved with hold out set, when using cross validation)<br>

## Naive Bayes Classifier

In [253]:
df2 = df
df2.head()

Unnamed: 0,id,date_recorded,season_recorded,gps_height,installer,longitude,latitude,basin,region_code,population,...,construction_year,extraction_type_group,management_group,payment_type,water_quality,quantity_group,source_type,source_class,waterpoint_type_group,status_group
50404,53755,2013.139785,2,1465,112,34.349116,-4.581008,0,12,4,...,2006,1,2,2,6,3,3,1,1,1
38573,57452,2011.241935,1,1445,157,34.680502,-8.841369,6,10,2,...,1978,1,4,1,6,1,4,1,1,0
15574,5315,2011.241935,1,1,132,33.295568,-8.986989,2,11,0,...,1989,1,4,6,6,1,4,1,1,0
59072,2567,2011.201613,1,0,769,38.970133,-5.4199,5,3,2,...,1993,6,0,2,4,2,5,0,5,2
53219,14437,2011.225806,1,1552,157,38.372253,-4.647711,5,3,1,...,1992,6,4,2,6,1,5,0,3,0


In [254]:
predictors2 = df2[chosen_predictors]
targets2 = df2['status_group']

X_train, X_test, y_train, y_test = train_test_split(predictors2, targets2, test_size=.2)
X_train.head()

Unnamed: 0,season_recorded,gps_height,installer,basin,region_code,population,public_meeting,scheme_management,permit,wpt_age,construction_year,extraction_type_group,management_group,payment_type,water_quality,quantity_group,source_type,waterpoint_type_group
37734,0,2,576,0,16,0,1,9,1,2012.80914,0,5,4,2,6,3,5,3
27843,2,245,157,7,8,8,1,10,1,13.086022,2000,10,4,5,6,3,0,1
14394,2,1622,774,0,12,1,1,7,2,23.075269,1990,6,4,6,7,0,5,5
25458,1,500,157,5,3,8,1,7,0,21.233871,1990,1,4,2,6,3,6,1
20768,1,138,157,8,4,2,1,7,1,7.196237,2004,1,4,1,7,4,5,3


In [255]:
df2['gps_height'].value_counts()[0]

6146

In [256]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)
GaussianNB(priors=None)
y_pred = clf.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict)

0.4771043771043771