In [149]:
# Import Libraries
%matplotlib inline

import pandas
import numpy as np
import re
import operator

# import sklearn algorithms
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif

In [175]:
# Load the Allstate training data from its CSV file into a DataFrame.
train_csv_path = "/Users/ryanallred/Desktop/Allstate_Data/train.csv"
allstate = pandas.read_csv(train_csv_path)

In [176]:
# Set the loss column to 0.00 for every row.
# A single column assignment already broadcasts the value to all rows, so the
# former row-by-row DataFrame.set_value loop (an accessor that later pandas
# releases no longer provide) was redundant and has been removed.
# NOTE(review): this also overwrites any loss values that were read from the
# CSV -- confirm that is intended before "loss" is used as a training target.
medianVal = 0.00
allstate['loss'] = medianVal

In [177]:
# Drop complex categorical columns for now.
# All five are removed in one call instead of dropping in place while looping
# over the frame's own columns. `axis` is passed by keyword because newer
# pandas no longer accepts it positionally, and errors="ignore" keeps the
# original tolerance for a column that is not present.
complex_categoricals = ["cat109", "cat110", "cat112", "cat113", "cat116"]
allstate = allstate.drop(complex_categoricals, axis=1, errors="ignore")

In [178]:
# Create an array holding the names of the remaining categorical columns.
# Columns are picked by their "cat" name prefix rather than by the fixed
# positional slice [1:112], so the selection does not depend on exactly which
# columns were dropped earlier or on the column order of the CSV.
is_categorical = allstate.columns.str.startswith("cat")
categorical_headers = allstate.columns[is_categorical].values

In [181]:
# Replace single-letter category labels with integer codes: A=0, B=1, ..., Z=25.
# The earlier hand-written table skipped 10 (J=9 was followed by K=11); the
# codes are now contiguous, so K..Z are each one lower than before.
# Any value that is not one of the 26 capital letters is left unchanged, which
# also makes re-running this cell harmless.
letter_codes = {chr(ord("A") + offset): offset for offset in range(26)}
for column in categorical_headers:
    allstate[column] = allstate[column].map(
        lambda value: letter_codes.get(value, value)
    )

In [180]:
# Get the column headers to be used in the analysis: every column except the
# row identifier and the target.
# Excluding "id" and "loss" by name replaces the fixed positional slice [1:126].
# NOTE(review): assumes "id" and "loss" are the only non-feature columns in the
# frame -- confirm against the CSV header.
predictors = allstate.columns.drop(["id", "loss"], errors="ignore").values
print(predictors)

['cat1' 'cat2' 'cat3' 'cat4' 'cat5' 'cat6' 'cat7' 'cat8' 'cat9' 'cat10'
 'cat11' 'cat12' 'cat13' 'cat14' 'cat15' 'cat16' 'cat17' 'cat18' 'cat19'
 'cat20' 'cat21' 'cat22' 'cat23' 'cat24' 'cat25' 'cat26' 'cat27' 'cat28'
 'cat29' 'cat30' 'cat31' 'cat32' 'cat33' 'cat34' 'cat35' 'cat36' 'cat37'
 'cat38' 'cat39' 'cat40' 'cat41' 'cat42' 'cat43' 'cat44' 'cat45' 'cat46'
 'cat47' 'cat48' 'cat49' 'cat50' 'cat51' 'cat52' 'cat53' 'cat54' 'cat55'
 'cat56' 'cat57' 'cat58' 'cat59' 'cat60' 'cat61' 'cat62' 'cat63' 'cat64'
 'cat65' 'cat66' 'cat67' 'cat68' 'cat69' 'cat70' 'cat71' 'cat72' 'cat73'
 'cat74' 'cat75' 'cat76' 'cat77' 'cat78' 'cat79' 'cat80' 'cat81' 'cat82'
 'cat83' 'cat84' 'cat85' 'cat86' 'cat87' 'cat88' 'cat89' 'cat90' 'cat91'
 'cat92' 'cat93' 'cat94' 'cat95' 'cat96' 'cat97' 'cat98' 'cat99' 'cat100'
 'cat101' 'cat102' 'cat103' 'cat104' 'cat105' 'cat106' 'cat107' 'cat108'
 'cat111' 'cat114' 'cat115' 'cont1' 'cont2' 'cont3' 'cont4' 'cont5' 'cont6'
 'cont7' 'cont8' 'cont9' 'cont10' 'cont11' 'cont

In [183]:
# Linear Regression
# Fit on each 3-fold training split, predict the held-out rows, print each
# fold's predictions, and join them into one array in fold order.
# NOTE(review): KFold is called with the (n_samples, n_folds=...) signature of
# the sklearn.cross_validation import used by this notebook and is iterated
# directly to obtain the train/test index arrays.
alg = LinearRegression()
kf = KFold(allstate.shape[0], n_folds=3, random_state=1)
feature_frame = allstate[predictors]
loss_target = allstate["loss"]
fold_predictions = []
for train_rows, test_rows in kf:
    alg.fit(feature_frame.iloc[train_rows, :], loss_target.iloc[train_rows])
    held_out = alg.predict(feature_frame.iloc[test_rows, :])
    print(held_out)
    fold_predictions.append(held_out)
predictions = np.concatenate(fold_predictions, axis=0)

# for prediction in predictions:
#     print(prediction)

[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]


In [187]:
# Random Forest
# Score a 10-tree random-forest classifier on the predictor columns with
# 20-fold cross-validation and print one score per fold.
# NOTE(review): a classifier is being fitted to the "loss" column -- confirm
# this is the intended estimator type for that target.
predictors = allstate.columns[1:126].values
alg = RandomForestClassifier(
    n_estimators=10,
    min_samples_split=10,
    min_samples_leaf=1,
    random_state=1,
)
scores = cross_val_score(alg, allstate[predictors], allstate["loss"], cv=20)

for fold_score in scores:
    print(fold_score)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [188]:
# Delete the extra columns (everything except "id" and "loss") and export to CSV.
# The two columns are selected in a single step instead of dropping every other
# column in place while looping over the frame's own columns; that loop also
# passed `axis` positionally, which newer pandas no longer accepts.
# Original column order is preserved, and a missing column is tolerated as before.
allstate = allstate.loc[:, allstate.columns.isin(["id", "loss"])]
allstate.to_csv("/Users/ryanallred/Desktop/Allstate_Data/final.csv", index=False, float_format='%.3f')