In [28]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold, cross_val_score

In [23]:
abalone = pd.read_csv('abalone.csv')
abalone.head()

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [24]:
y = abalone['Rings']
abalone_train = abalone.drop('Rings', axis = 1)
abalone_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [25]:
abalone_train['Sex'] = abalone_train['Sex'].map(lambda x: -1 if x == 'F' else(0 if x == 'I' else 1))
abalone_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,-1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [36]:
cv = KFold(y.size, n_folds = 5, shuffle = True, random_state = 1)
param_grid = {'n_estimators':range(1, 51)}
clf = RandomForestRegressor(random_state = 1)
grid = GridSearchCV(clf, param_grid, cv = cv, scoring = 'r2')
grid.fit(abalone_train, y)
grid.grid_scores_

[mean: 0.11450, std: 0.10473, params: {'n_estimators': 1},
 mean: 0.33158, std: 0.03872, params: {'n_estimators': 2},
 mean: 0.39742, std: 0.02406, params: {'n_estimators': 3},
 mean: 0.43956, std: 0.02450, params: {'n_estimators': 4},
 mean: 0.46244, std: 0.02147, params: {'n_estimators': 5},
 mean: 0.46873, std: 0.01825, params: {'n_estimators': 6},
 mean: 0.47412, std: 0.01831, params: {'n_estimators': 7},
 mean: 0.47981, std: 0.02069, params: {'n_estimators': 8},
 mean: 0.48522, std: 0.02089, params: {'n_estimators': 9},
 mean: 0.49120, std: 0.02373, params: {'n_estimators': 10},
 mean: 0.49125, std: 0.02189, params: {'n_estimators': 11},
 mean: 0.49554, std: 0.01888, params: {'n_estimators': 12},
 mean: 0.50038, std: 0.01893, params: {'n_estimators': 13},
 mean: 0.50495, std: 0.01834, params: {'n_estimators': 14},
 mean: 0.50886, std: 0.01861, params: {'n_estimators': 15},
 mean: 0.51072, std: 0.01818, params: {'n_estimators': 16},
 mean: 0.51226, std: 0.01784, params: {'n_estimat

0.52950231917710844