In [1]:
import pandas as pd
import numpy as np
import os
import csv
import random
from math import sqrt, fabs, exp
from sklearn.preprocessing import label_binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plot
import matplotlib.patches as mpatches
from sklearn.linear_model import enet_path
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.cross_validation import train_test_split

In [2]:
path =r'/Users/Bruce/desktop/stat242_2015/project'
os.chdir(path)

In [3]:
sales_data = pd.read_csv("train.csv",sep=",")
scale_data = sales_data.shape


#### random sample to generate training data and test data

random.seed(1223)

sample_index = random.sample(range(0, scale_data[0]-1), int (0.8*(scale_data[0]-1)))

### generate training data

sale_train = sales_data.ix[sample_index]

labels = sale_train["target"]

In [4]:
sale_train.drop('target',axis = 1 , inplace = True)

sale_train_x = sale_train

sale_train_x.drop("id",axis = 1,inplace = True)

In [5]:
rows = sale_train_x.index
rows = list(rows)
row_count = len(rows)

train_index = random.sample(list(rows), int (0.8*row_count))
test_index = sale_train_x.index.delete(train_index)

### training sets of X variables
Xtrain = sale_train_x.ix[train_index]
### training sets of labels

Ytrain = labels.ix[train_index]
Ytrain = pd.Categorical(Ytrain)
Ytrain = np.unique(Ytrain, return_inverse=True)[1]


### test sets of X variables
Xtest = sale_train_x.ix[test_index]
Ytest = labels.ix[test_index]
Ytest = pd.Categorical(Ytest)
Ytest = np.unique(Ytest, return_inverse=True)[1]

In [11]:
nTreeList = range(1,101,1)
missCLassError_RF = []

In [13]:
for iTrees in nTreeList:
    depth = 18
    otto_classify = RandomForestClassifier(n_estimators=iTrees,
        max_depth=depth, max_features="sqrt",
        oob_score=False, random_state=1223,n_jobs = 4)


    otto_classify.fit(Xtrain,Ytrain)

    #Accumulate auc on test set
    prediction = otto_classify.predict(Xtest)

    correct = accuracy_score(Ytest, prediction)

    missCLassError_RF.append(1.0 - correct)

In [15]:
min(missCLassError_RF)

0.15504049148282606

In [10]:
min(missCLassError_RF)
min_bool = (missCLassError_RF ==min(missCLassError_RF))
min_bool
best_ntree = nTreeList[min_bool.argmax()]

In [11]:
best_ntree


25

In [None]:
## extreme tree method

In [None]:
nTreeList = range(1,101,1)

missCLassError_extreme = []

In [16]:

for iTrees in nTreeList:
    depth = 18
    otto_extreme = ExtraTreesClassifier(n_estimators=iTrees,
        max_depth=depth, max_features="sqrt",
        oob_score=False, random_state=1223,n_jobs = 4)


    otto_extreme.fit(Xtrain,Ytrain)

    #Accumulate auc on test set
    prediction = otto_extreme.predict(Xtest)

    correct = accuracy_score(Ytest, prediction)

    missCLassError_extreme.append(1.0 - correct)

In [40]:
min (missCLassError_extreme)

0.25601787210276461

In [41]:

min_bool_extre = (missCLassError_extreme ==min(missCLassError_extreme))
min_bool_extre
best_ntree_extre = nTreeList[min_bool_extre.argmax()]

In [42]:
best_ntree_extre

37

In [None]:
## Gradient Boosting

In [33]:
otto_GBM = GradientBoostingClassifier(n_estimators=100,
                    max_depth=10,learning_rate=0.05,
                    max_features="sqrt",subsample=0.5)

In [34]:
otto_GBM.fit(Xtrain, Ytrain)

GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=10, max_features='sqrt', max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2, n_estimators=100,
              random_state=None, subsample=0.5, verbose=0,
              warm_start=False)

In [35]:
 # compute auc on test set as function of ensemble size
missClassError_GB = []
missClassBest = 1.0
predictions = otto_GBM.staged_decision_function(Xtest)

In [36]:
for p in predictions:
    missClass = 0
    for i in range(len(p)):
        listP = p[i].tolist()
        if listP.index(max(listP)) != Ytest[i]:
            missClass += 1
    missClass = float(missClass)/len(p)
    missClassError_GB.append(missClass)
    #capture best predictions
    if missClass < missClassBest:
        missClassBest = missClass
pBest = p

In [37]:
idxBest = missClassError_GB.index(min(missClassError_GB))

In [22]:
print("Best Missclassification Error" )
print(missClassBest)
print("Number of Trees for Best Missclassification Error")
print(idxBest)

Best Missclassification Error
0.1357162803686121
Number of Trees for Best Missclassification Error
99


In [None]:
plot.plot(nTreeList,missCLassError_RF)
plot.plot(nTreeList,missCLassError_extreme)
plot.plot(range(1, 100 + 1), missClassError_GB, label='Test Set Error')
plot.xlabel('Number of Trees in Ensemble')
plot.ylabel('Classification Error')
plot.show()

In [44]:
pBestList = pBest.tolist()
bestPrediction = [r.index(max(r)) for r in pBestList]
confusionMat = confusion_matrix(Ytest, bestPrediction)
print('')
print("Confusion Matrix")
print(confusionMat)


Confusion Matrix
[[ 420   21    2    0    1   22    8   37   33]
 [   1 4263  235    9    0    3   10    4    1]
 [   0  618 1655    5    0    2   15    4    2]
 [   0  181   64  533    1   16    4    0    0]
 [   0    7    0    0  815    0    0    0    0]
 [   4   33    2    2    0 4115   18   32   22]
 [   5   70   29    3    2   20  694   19    5]
 [   8   17    6    0    1   23    9 2281   17]
 [  11   27    4    0    0   18    8   22 1386]]


In [None]:
### FIT THE MODEL USING ALL THE TRAINING DATA

In [51]:
## combine all the training data
X_all = sale_train_x
labels_all = sale_train["target"]
labels_all = pd.Categorical(labels_all)
Y_all = np.unique(labels_all, return_inverse=True)[1]

In [52]:
## test data after using all training data to fit the model
sale_test = sales_data.drop(sales_data.index[sample_index])
label_test = sale_test["target"]
## change to factor
label_test = pd.Categorical(label_test)
label_test = np.unique(label_test,return_inverse = True)[1]
sale_test.drop('target',axis = 1 , inplace = True)
sale_test_x  =  sale_test
sale_test_x.drop("id",axis = 1,inplace = True)

In [53]:
##fit the gradient boosting model using all the training data
otto_GBM.fit(X_all, Y_all)

GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=10, max_features='sqrt', max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2, n_estimators=100,
              random_state=None, subsample=0.5, verbose=0,
              warm_start=False)

In [54]:
predictions_test = otto_GBM.staged_decision_function(sale_test_x)
missClassError_new=[]
for p in predictions_test:
    missClass = 0
    for i in range(len(p)):
        listP = p[i].tolist()
        if listP.index(max(listP)) != label_test[i]:
            missClass += 1
    missClass = float(missClass)/len(p)
    missClassError_new.append(missClass)
    #capture best predictions
    if missClass < missClassBest:
        missClassBest = missClass
pBest = p

In [62]:
idxBest_test = missClassError_new.index(min(missClassError_new))

99

In [56]:
print("Best Missclassification Error" )
print(missClassBest)
print("Number of Trees for Best Missclassification Error")
print(idxBest)

Best Missclassification Error
0.09734710974588104
Number of Trees for Best Missclassification Error
99


In [None]:
## fit best model of random forest and predict using test data

In [59]:
otto_classify_best = RandomForestClassifier(n_estimators=25,
        max_depth=18, max_features="sqrt",
        oob_score=False, random_state=1223,n_jobs = 4)
otto_classify_best.fit(X_all, Y_all)
    #Accumulate auc on test set
prediction_best = otto_classify_best.predict(sale_test_x)
correct = accuracy_score(label_test, prediction_best)
missCLassError_best = 1.0 - correct


In [None]:
## fit best model of extra trees and predict 

In [61]:
otto_extreme = ExtraTreesClassifier(n_estimators=37,
        max_depth=18, max_features="sqrt",
        oob_score=False, random_state=1223,n_jobs = 4)
otto_extreme.fit(X_all,Y_all)
 #Accumulate auc on test set
prediction = otto_extreme.predict(sale_test_x)
correct = accuracy_score(label_test, prediction)
missCLassError_best_ET = 1.0 - correct
