In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.neural_network import MLPClassifier
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import normalize

Create a multi-layer perceptron neural network model to predict on a labeled dataset of your choosing. Compare this model to either a boosted tree or a random forest model and describe the relative tradeoffs between complexity and accuracy. Be sure to vary the hyperparameters of your MLP!

# Cleaning

In [92]:
#i will be using a data set of world hapiness ratings. I will be trying to predict whether a country is happy or not

df= pd.read_csv('happy.csv')

In [93]:
df.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [94]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
Country                          158 non-null object
Region                           158 non-null object
Happiness Rank                   158 non-null int64
Happiness Score                  158 non-null float64
Standard Error                   158 non-null float64
Economy (GDP per Capita)         158 non-null float64
Family                           158 non-null float64
Health (Life Expectancy)         158 non-null float64
Freedom                          158 non-null float64
Trust (Government Corruption)    158 non-null float64
Generosity                       158 non-null float64
Dystopia Residual                158 non-null float64
dtypes: float64(9), int64(1), object(2)
memory usage: 14.9+ KB


In [95]:
#rename colums
df.columns = ['country','region','rank','score','standard_error','economy',\
             'family','health','freedom','trust','generosity','dystopia']
df.head()

Unnamed: 0,country,region,rank,score,standard_error,economy,family,health,freedom,trust,generosity,dystopia
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [96]:
#rename colums
happy = df.copy()
happy.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
country           158 non-null object
region            158 non-null object
rank              158 non-null int64
score             158 non-null float64
standard_error    158 non-null float64
economy           158 non-null float64
family            158 non-null float64
health            158 non-null float64
freedom           158 non-null float64
trust             158 non-null float64
generosity        158 non-null float64
dystopia          158 non-null float64
dtypes: float64(9), int64(1), object(2)
memory usage: 14.9+ KB


In [97]:
happy.describe()

Unnamed: 0,rank,score,standard_error,economy,family,health,freedom,trust,generosity,dystopia
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,79.493671,5.375734,0.047885,0.846137,0.991046,0.630259,0.428615,0.143422,0.237296,2.098977
std,45.754363,1.14501,0.017146,0.403121,0.272369,0.247078,0.150693,0.120034,0.126685,0.55355
min,1.0,2.839,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
25%,40.25,4.526,0.037268,0.545808,0.856823,0.439185,0.32833,0.061675,0.150553,1.75941
50%,79.5,5.2325,0.04394,0.910245,1.02951,0.696705,0.435515,0.10722,0.21613,2.095415
75%,118.75,6.24375,0.0523,1.158448,1.214405,0.811013,0.549092,0.180255,0.309883,2.462415
max,158.0,7.587,0.13693,1.69042,1.40223,1.02525,0.66973,0.55191,0.79588,3.60214


In [98]:
#going to turn happyness score into binary

happy.loc[:,'rank':] = happy.loc[:,'rank':].astype(float)

happy['rank']=np.where(happy['rank']> happy['rank'].median(),1,0)

In [99]:
corrmat = happy.corr()

corrmat['rank'].sort_values(ascending=False)

rank              1.000000
standard_error    0.148191
generosity       -0.136899
trust            -0.284088
dystopia         -0.403702
freedom          -0.518895
health           -0.629073
family           -0.640147
economy          -0.678905
score            -0.835430
Name: rank, dtype: float64

In [100]:
# #lets make region into dummies
dummies = pd.get_dummies(happy['region'])
features = pd.concat([dummies, happy], axis =1, join_axes =[dummies.index])



In [101]:
#drop country, standard error, generosity along with region
features = features.drop(['country','standard_error','region','rank','score','generosity'],1)
#do i need to normalize- doing so anyways
feats_norm = normalize(features)
X= feats_norm
Y= happy['rank']

## Random Forest 

In [102]:
#going to try something with grid search

param_search = [{'n_estimators': [2,3,5,10,15,20,100,200],
          'max_depth': [3,4,5,6,8,10,15],
          'min_samples_split':[2,3,5,6,8,10,15,20]}]


# Initialize and fit the model.
rfc_grid = GridSearchCV(ensemble.RandomForestClassifier(), param_grid=param_search,cv=5)

rfc_grid.fit(X,Y)

print('Best Score: ', rfc_grid.best_score_)
print('Best Parameters: ', rfc_grid.best_params_)

Best Score:  0.879746835443
Best Parameters:  {'max_depth': 8, 'min_samples_split': 3, 'n_estimators': 5}


In [105]:
rfc = ensemble.RandomForestClassifier(n_estimators=5 ,max_depth= 8, min_samples_split = 3)
rfc.fit(X, Y)
rfc.score(X, Y)

0.98101265822784811

In [106]:
RFC_score = cross_val_score(rfc, X, Y, cv=5)
print('\nEach Cross Validated Accuracy: \n', RFC_score)
print("\nOverall Random Forest Classifier Accuracy: %0.2f (+/- %0.2f)\n" % (RFC_score.mean(), RFC_score.std() * 2))


Each Cross Validated Accuracy: 
 [ 0.6875      0.8125      0.875       0.8125      0.83333333]

Overall Random Forest Classifier Accuracy: 0.80 (+/- 0.13)



## Neural Network Classifier


In [107]:
mlp = MLPClassifier(hidden_layer_sizes=(50,50,50,50 ), alpha=0.001)
mlp.fit(X, Y)
print(mlp.score(X, Y))
MLP_score = cross_val_score(mlp, X, Y, cv=5)
print('\nEach Cross Validated Accuracy: \n', MLP_score)
print("\nOverall Neural Network Classifier Accuracy: %0.2f (+/- %0.2f)\n" % (MLP_score.mean(), MLP_score.std() * 2))



1.0





Each Cross Validated Accuracy: 
 [ 0.6875      0.9375      1.          1.          0.66666667]

Overall Neural Network Classifier Accuracy: 0.86 (+/- 0.30)





In [108]:

#convergence issues, so lets add additional params
mlp = MLPClassifier(hidden_layer_sizes=(50,50,50,50 ), alpha=0.001, solver = 'lbfgs')
mlp.fit(X, Y)
print(mlp.score(X, Y))
MLP_score = cross_val_score(mlp, X, Y, cv=5)
print('\nEach Cross Validated Accuracy: \n', MLP_score)
print("\nOverall Neural Network Classifier Accuracy: %0.2f (+/- %0.2f)\n" % (MLP_score.mean(), MLP_score.std() * 2))

0.955696202532

Each Cross Validated Accuracy: 
 [ 0.65625     0.84375     1.          0.9375      0.76666667]

Overall Neural Network Classifier Accuracy: 0.84 (+/- 0.24)



In [109]:
mlp = MLPClassifier(hidden_layer_sizes=(100, ), alpha=0.001, solver = 'lbfgs')
mlp.fit(X, Y)
print(mlp.score(X, Y))
MLP_score = cross_val_score(mlp, X, Y, cv=5)
print('\nEach Cross Validated Accuracy: \n', MLP_score)
print("\nOverall Neural Network Classifier Accuracy: %0.2f (+/- %0.2f)\n" % (MLP_score.mean(), MLP_score.std() * 2))

1.0

Each Cross Validated Accuracy: 
 [ 0.75        0.9375      1.          1.          0.73333333]

Overall Neural Network Classifier Accuracy: 0.88 (+/- 0.24)



In [110]:
mlp = MLPClassifier(hidden_layer_sizes=(100, ), activation = 'identity', alpha=0.0001, solver = 'lbfgs')
mlp.fit(X, Y)
print(mlp.score(X, Y))
MLP_score = cross_val_score(mlp, X, Y, cv=5)
print('\nEach Cross Validated Accuracy: \n', MLP_score)
print("\nOverall Neural Network Classifier Accuracy: %0.2f (+/- %0.2f)\n" % (MLP_score.mean(), MLP_score.std() * 2))

1.0

Each Cross Validated Accuracy: 
 [ 0.625   0.9375  1.      1.      0.6   ]

Overall Neural Network Classifier Accuracy: 0.83 (+/- 0.36)



In [111]:
mlp = MLPClassifier(hidden_layer_sizes=(100, 100 ), activation = 'identity', alpha=0.0001, solver = 'lbfgs')
mlp.fit(X, Y)
print(mlp.score(X, Y))
MLP_score = cross_val_score(mlp, X, Y, cv=5)
print('\nEach Cross Validated Accuracy: \n', MLP_score)
print("\nOverall Neural Network Classifier Accuracy: %0.2f (+/- %0.2f)\n" % (MLP_score.mean(), MLP_score.std() * 2))

1.0

Each Cross Validated Accuracy: 
 [ 0.65625     1.          1.          1.          0.66666667]

Overall Neural Network Classifier Accuracy: 0.86 (+/- 0.33)



In [112]:

mlp = MLPClassifier(hidden_layer_sizes=(100,), activation = 'identity', alpha=0.05, solver = 'lbfgs')
mlp.fit(X, Y)
print(mlp.score(X, Y))
MLP_score = cross_val_score(mlp, X, Y, cv=5)
print('\nEach Cross Validated Accuracy: \n', MLP_score)
print("\nOverall Neural Network Classifier Accuracy: %0.2f (+/- %0.2f)\n" % (MLP_score.mean(), MLP_score.std() * 2))

0.96835443038

Each Cross Validated Accuracy: 
 [ 0.65625  0.9375   1.       1.       0.7    ]

Overall Neural Network Classifier Accuracy: 0.86 (+/- 0.30)



In [127]:
#this seems to do the best
mlp = MLPClassifier(hidden_layer_sizes=(10, ), activation = 'identity', alpha=0.07, solver = 'lbfgs')
mlp.fit(X, Y)
print(mlp.score(X, Y))
MLP_score = cross_val_score(mlp, X, Y, cv=5)
print('\nEach Cross Validated Accuracy: \n', MLP_score)
print("\nOverall Neural Network Classifier Accuracy: %0.2f (+/- %0.2f)\n" % (MLP_score.mean(), MLP_score.std() * 2))

0.955696202532

Each Cross Validated Accuracy: 
 [ 0.65625     0.9375      1.          0.96875     0.66666667]

Overall Neural Network Classifier Accuracy: 0.85 (+/- 0.30)



OK, so the MLP model does a little better than random forrest classifier, after tuning through grid search. 
For the MLP model, I added parameters that are good for smaller data sets, such as setting solver to 'lbfgs' and activation to identity. These parameters also needed to be set, as i was having a convergence issue.  However, as seen by the high accuracies, and evidence of overfitting, this might not have been the best data set to use. Rather, this data set might be better suited for supervised learning, or might require some additional feature engineering

In [None]:
|