### Challenge Set 6
Topic: Decision Trees<br>
Date: 08/16/2016<br>
Name: Seth Kaufman

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.grid_search  import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
from IPython.display import Image  
from sklearn.externals.six import StringIO  
import pydotplus
import numpy as np

### Tree / Forest Challenges

You can examine the decision paths of an `sklearn` tree by generating `pydot` graphs as in the `sklearn` [documentation](http://scikit-learn.org/stable/modules/tree.html). It's sometimes tricky to get `pydot` working; see below for a possible install plan.


#### Challenge 1

For the house representatives data set, fit and evaluate a decision tree classifier. Examine the rules your tree uses.

In [2]:
!ls ../../../challenges_data/house-votes-84.data

../../../challenges_data/house-votes-84.data


In [3]:
hv = pd.read_csv('/Users/Seth/Documents/Data Science/Metis/nyc16_ds8/challenges/challenges_data/house-votes-84.data',
                 header=-1,na_values='?')
hv.replace({'y':1,'n':0},inplace=True)
hv[16]=hv[16].apply(lambda x : ''.join([i for i in x if i.isalpha()]))
hv = hv.fillna(hv.mean(axis=0))
y = hv[16]
X = hv.iloc[:,:-1]

In [4]:
crossval=StratifiedShuffleSplit(y,test_size=.3)

dt = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=10, min_samples_leaf=2,
            min_samples_split=4, min_weight_fraction_leaf=0.5,
            presort=False, random_state=None, splitter='best')
dt.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=10, min_samples_leaf=2,
            min_samples_split=4, min_weight_fraction_leaf=0.5,
            presort=False, random_state=None, splitter='best')

In [5]:
 tree.export_graphviz(dt,out_file='hv.dot',feature_names=X.columns,class_names=y.unique())

In [6]:
!dot -Tsvg hv.dot -o hv.svg

<img src='hv.svg'/>

#### Challenge 2

Fit and evaluate a decision tree classifier for your movie dataset. Examine the rules your tree uses.


In [7]:
mojo = pd.read_csv('/Users/Seth/Documents/Data Science/Metis/nyc16_ds8/challenges/challenges_data/2013_movies.csv',
                   parse_dates=['ReleaseDate'],infer_datetime_format=True)
mojo['ReleaseDate'] = mojo['ReleaseDate'].dt.month
mojo['Budget'] = mojo['Budget'].fillna(mojo['Budget'].mean(axis=0))
predictors = mojo[
    [#'Title',
     'Budget',
     'DomesticTotalGross',
     #'Director',
     'Runtime',
     'ReleaseDate']]

In [8]:
X = pd.get_dummies(predictors)
y = (mojo['Rating'].factorize()[0]==0).astype(int)
dt = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=8, min_samples_leaf=2,
            min_samples_split=4, min_weight_fraction_leaf=0.5, splitter='best')
dt.fit(X,y)
dt.feature_importances_

array([ 0.476244  ,  0.        ,  0.29609394,  0.22766206])

In [9]:
tree.export_graphviz(dt,out_file='mojo.dot',feature_names=predictors.columns,class_names=mojo['Rating'].unique())

In [10]:
!dot -Tsvg mojo.dot -o mojo.svg

<img src='mojo.svg'/>

#### Challenge 3 (Optional but recommended)

Tackle the [Titanic Survivors kaggle competition](https://www.kaggle.com/c/titanic-gettingStarted) with decision trees. Look at your splits; how does your tree decide?



In [11]:
from sklearn.metrics import classification_report, accuracy_score,roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import scale
from sklearn.cross_validation import StratifiedShuffleSplit

In [13]:
train = pd.read_csv('train.csv')
train['Age'].fillna(train['Age'].mean(),inplace=True)
train['Embarked'].fillna('S',inplace=True)
train['fam'] = train.Parch + train.SibSp
predictors = [#'PassengerId',
              #'Survived', 
              'Pclass', 
              #'Name', 
              'Sex',
              'Age', 
              #'SibSp',
                #'Parch', 
              #'Ticket',
              'Fare', 
              #'Cabin',
              'Embarked',
              'fam']

X = pd.get_dummies(train[predictors])
X.iloc[:,:3] = scale(X.iloc[:,:3])
y=train['Survived']

In [17]:
sss = StratifiedShuffleSplit(y, n_iter=10,test_size=.2)
for train_index, test_index in sss:
    xtrain, xtest = X.iloc[train_index], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

param_test1 = {'n_estimators':list(range(1,300,20))}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, 
                                        min_samples_split=4,min_samples_leaf=4,
                                        max_depth=3,max_features='sqrt',subsample=0.8), 
                                        param_grid = param_test1, scoring='roc_auc',n_jobs=10,iid=False, cv=4)
gsearch1.fit(xtrain,ytrain)
gsearch1.best_score_

0.87985141381315923

In [18]:
param_test2 = {'max_depth':list(range(1,10,1)), 'min_samples_split':list(range(10,300,10))}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=180, 
                                            max_features='sqrt', subsample=0.8), 
                                            param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=4)
gsearch2.fit(xtrain,ytrain)
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 3, 'min_samples_split': 30}, 0.8775723251189006)

In [19]:
param_test3 = {'min_samples_split':list(range(50,300,10)), 'min_samples_leaf':list(range(2,100,5))}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=180,max_depth=8,
                                                max_features='sqrt', subsample=0.8, random_state=10), 
                                                param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=4)
gsearch3.fit(xtrain,ytrain)
gsearch3.best_params_, gsearch3.best_score_

({'min_samples_leaf': 32, 'min_samples_split': 130}, 0.87567135772066385)

In [20]:
param_test4 = {'max_features':list(range(1,9,1))}
gsearch4 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=41,max_depth=8, 
                                            min_samples_split=160, min_samples_leaf=7, 
                                            subsample=0.8),
                                            param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=4)
gsearch4.fit(xtrain,ytrain)
gsearch4.best_params_, gsearch4.best_score_

({'max_features': 1}, 0.88147513166342795)

In [21]:
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=41,max_depth=8, 
                                            min_samples_split=100, min_samples_leaf=2, 
                                            subsample=0.9,max_features =5),
                                            param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=4)
gsearch5.fit(xtrain,ytrain)
gsearch5.best_params_, gsearch5.best_score_

({'subsample': 0.75}, 0.87792172336353047)

In [22]:
sss = StratifiedShuffleSplit(y, n_iter=10,test_size=.4)
for train_index, test_index in sss:
    xtrain, xtest = X.iloc[train_index], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index], y.iloc[test_index]
gbm_tuned_1 = GradientBoostingClassifier(learning_rate=0.01, n_estimators=1000,max_depth=8, 
                                            min_samples_split=20, min_samples_leaf=2, 
                                            subsample=0.9,max_features =9 )
gbm_tuned_1.fit(xtrain,ytrain)
print(roc_auc_score(ytest,gbm_tuned_1.predict_proba(xtest)[:,1]))
print(classification_report(ytest,gbm_tuned_1.predict(xtest)))

0.853301260783
             precision    recall  f1-score   support

          0       0.85      0.87      0.86       220
          1       0.78      0.75      0.77       137

avg / total       0.82      0.82      0.82       357



### Installing pydot for the challenges:

Note: Uninstall pydot if you already installed it but it's not working

    pip uninstall pydot

Otherwise, you can start here:

    pip uninstall pyparsing

    pip install -Iv
    https://pypi.python.org/packages/source/p/pyparsing/pyparsing-1.5.7.tar.gz#md5=9be0fcdcc595199c646ab317c1d9a709

    pip install pydot

    brew install graphviz
