## Ensemble methods and cross validation with college data

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier

In [0]:
# Load the college dataset from its CSV file into a DataFrame
DATA_PATH = 'college.csv'
col = pd.read_csv(DATA_PATH)

In [0]:
# Preview the first five rows of the raw data
col.head(n=5)

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


---
## Exploratory data analysis (EDA)

In [0]:
# Summarize the DataFrame: each column's name, non-null count, and dtype
col.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 19 columns):
Unnamed: 0     777 non-null object
Private        777 non-null object
Apps           777 non-null int64
Accept         777 non-null int64
Enroll         777 non-null int64
Top10perc      777 non-null int64
Top25perc      777 non-null int64
F.Undergrad    777 non-null int64
P.Undergrad    777 non-null int64
Outstate       777 non-null int64
Room.Board     777 non-null int64
Books          777 non-null int64
Personal       777 non-null int64
PhD            777 non-null int64
Terminal       777 non-null int64
S.F.Ratio      777 non-null float64
perc.alumni    777 non-null int64
Expend         777 non-null int64
Grad.Rate      777 non-null int64
dtypes: float64(1), int64(16), object(2)
memory usage: 115.4+ KB


---

#### Rename the first column to `University` so it is easier to interpret.

In [0]:
# The university names live in an unnamed first column; give it a meaningful label.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run and avoids
# silently mutating a frame that earlier cells have already displayed.
col = col.rename(columns={'Unnamed: 0': 'University'})
col.head()

Unnamed: 0,University,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


#### Change Yes/No to 1/0 in the 'Private' column to get it ready for model training.

In [0]:
# Count how many rows fall into each class of the target column
private_counts = col['Private'].value_counts()
private_counts

1    565
0    212
Name: Private, dtype: int64

In [0]:
# Replace the 'Private' labels with integer codes produced by LabelEncoder
col = col.assign(Private=LabelEncoder().fit_transform(col['Private']))

In [0]:
# Confirm the encoding by counting the rows in each class of 'Private'.
# (A notebook cell only displays its last expression, so the former leading
# `col.head()` never produced any output and has been dropped.)
col['Private'].value_counts()

1    565
0    212
Name: Private, dtype: int64

___
### Compare classification performance of different models with cross validation

In [0]:
# Target: the 'Private' column
y = col['Private']
# Features: every column except the 'University' name and the target itself
X = col.drop(['University', 'Private'], axis=1)

#### Create a function that takes a model and the number of splits for StratifiedKFold cross-validation, and prints the mean and standard deviation of the scores

In [0]:
def cross_score(model, model_name, n_splits, X, y, random_state=None):
  """Print the mean and standard deviation of stratified k-fold CV scores.

  Parameters
  ----------
  model : estimator
      Estimator handed to ``cross_val_score``.
  model_name : str
      Label used in the printed summary line.
  n_splits : int
      Number of folds passed to ``StratifiedKFold``.
  X, y
      Features and target forwarded to ``cross_val_score``.
  random_state : int or None, default None
      Seed for the shuffled fold assignment. ``None`` keeps the previous
      unseeded behaviour; pass an int to make the splits repeatable.

  Returns
  -------
  None
      The summary is printed, not returned.
  """
  cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)
  # Fixed-point formatting always shows three decimals (e.g. 0.920 ± 0.010);
  # the general "{:0.3}" format dropped trailing zeros and gave ragged output.
  print("{} with {} splits has an average score of {:.3f} ± {:.3f}".format(
      model_name, n_splits, scores.mean(), scores.std()))

### Testing the function on the Decision Tree algorithm

In [0]:
# Report the mean and standard deviation of the cross-validation score
# for every number of splits from 2 through 20.
for n_splits in range(2, 21):
  cross_score(model=DecisionTreeClassifier(),
              model_name='Decision Tree Classifier',
              n_splits=n_splits,
              X=X,
              y=y)

Decision Tree Classifier with 2 splits has an average score of 0.906 ± 0.004
Decision Tree Classifier with 3 splits has an average score of 0.885 ± 0.01
Decision Tree Classifier with 4 splits has an average score of 0.903 ± 0.006
Decision Tree Classifier with 5 splits has an average score of 0.897 ± 0.009
Decision Tree Classifier with 6 splits has an average score of 0.893 ± 0.038
Decision Tree Classifier with 7 splits has an average score of 0.896 ± 0.019
Decision Tree Classifier with 8 splits has an average score of 0.907 ± 0.037
Decision Tree Classifier with 9 splits has an average score of 0.902 ± 0.023
Decision Tree Classifier with 10 splits has an average score of 0.891 ± 0.034
Decision Tree Classifier with 11 splits has an average score of 0.901 ± 0.036
Decision Tree Classifier with 12 splits has an average score of 0.906 ± 0.029
Decision Tree Classifier with 13 splits has an average score of 0.92 ± 0.039
Decision Tree Classifier with 14 splits has an average score of 0.906 ± 0.

___
### Comparing the mean and standard deviation of cross-validation scores for decision tree and ensemble methods

Use the function you created above to print out  
the mean and standard deviation of cross-validation scores for  
decision trees, bagging, random forest, extra trees,
AdaBoost, and Gradient Boosting classifiers   
with 3-fold cross-validation.

In [0]:
# 3-split cross-validation summary for bagging
cross_score(model=BaggingClassifier(), model_name='Bagging Classifier', n_splits=3, X=X, y=y)

Bagging Classifier with 3 splits has an average score of 0.937 ± 0.013


In [0]:
# 3-split cross-validation summary for a random forest
cross_score(model=RandomForestClassifier(), model_name='Random Forest Classifier', n_splits=3, X=X, y=y)

Random Forest Classifier with 3 splits has an average score of 0.925 ± 0.005


In [0]:
# 3-split cross-validation summary for extra trees
cross_score(model=ExtraTreesClassifier(), model_name='Extra Trees Classifier', n_splits=3, X=X, y=y)

Extra Trees Classifier with 3 splits has an average score of 0.932 ± 0.015


In [0]:
# 3-split cross-validation summary for AdaBoost
cross_score(model=AdaBoostClassifier(), model_name='Ada Boost Classifier', n_splits=3, X=X, y=y)

Ada Boost Classifier with 3 splits has an average score of 0.931 ± 0.008


In [0]:
# 3-split cross-validation summary for gradient boosting
cross_score(model=GradientBoostingClassifier(), model_name='Gradient Boosting Classifier', n_splits=3, X=X, y=y)

Gradient Boosting Classifier with 3 splits has an average score of 0.942 ± 0.014


#### Show, for RandomForestClassifier, the feature importances for each variable predicting private vs. not, sorted by most important feature to least.

In [0]:
# Fit a random forest on the full dataset so its feature importances can be inspected.
# A fixed seed makes the fitted forest, and therefore the importances, repeatable across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
# Accuracy on the same data the forest was fitted on: an optimistic figure,
# not an estimate of performance on unseen data.
rf.score(X, y)



0.9961389961389961

In [0]:
# Rank the fitted forest's feature importances from largest to smallest
importance_by_feature = pd.Series(rf.feature_importances_, index=X.columns, name='Private')
feature_importances = importance_by_feature.sort_values(ascending=False).to_frame()
feature_importances

Unnamed: 0,Private
F.Undergrad,0.255992
Outstate,0.231555
P.Undergrad,0.121129
Enroll,0.10062
Room.Board,0.069497
perc.alumni,0.036504
S.F.Ratio,0.024533
Top25perc,0.024407
Expend,0.022977
Grad.Rate,0.02172


#### Show the features in the `col` dataframe ranked by the absolute value of their correlation with the target column 'Private'

In [0]:
# Absolute correlation of every numeric column with the target 'Private',
# ranked from strongest to weakest.
# - select_dtypes keeps only numeric columns: DataFrame.corr() raises on string
#   columns such as 'University' in pandas >= 2.0 instead of skipping them.
# - The target's self-correlation is removed by label, not by assuming it is
#   the first entry after sorting.
(
    col.select_dtypes(include='number')
    .corr()['Private']
    .drop('Private')
    .abs()
    .sort_values(ascending=False)
)

F.Undergrad    0.615561
Enroll         0.567908
Outstate       0.552650
Accept         0.475252
S.F.Ratio      0.472205
P.Undergrad    0.452088
Apps           0.432095
perc.alumni    0.414775
Room.Board     0.340532
Grad.Rate      0.336162
Personal       0.304485
Expend         0.258461
Top10perc      0.164132
PhD            0.156714
Terminal       0.129620
Top25perc      0.095752
Books          0.018549
Name: Private, dtype: float64