# This Model is a random forest - Score= .451502

### Import Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble
from IPython.display import Image
import pydotplus

### Load the datasets

In [2]:
TeamOverview = pd.read_csv(filepath_or_buffer='../TeamOverview2.csv')
TurnyResult = pd.read_csv(filepath_or_buffer='../data\\TourneyCompactResults.csv')
sampleSub = pd.read_csv('../data\\sample_submission.csv')

### Define fuctions

In [3]:
def create_train_test(data,split):
    np.random.seed(0)
    msk = np.random.rand(len(data)) < split
    train = data[msk]
    test = data[~msk]
    return train,test

def random_forest(data, features, target,n_estimators, max_depth, min_impurity_split):
    model = ensemble.RandomForestClassifier( n_estimators=n_estimators, max_depth=max_depth,
                                            random_state=0, min_impurity_split=min_impurity_split)
    model = model.fit(data[features],data[target].ravel())
    return model

def make_predictions(model, data, features):
    data.loc[:,'prediction'] = model.predict(data[features])
    #dataout['prediction_prob'] = model.predict_proba((data[features]))
    return data

def find_best_model(train_data,test_data, features, target, max_depth, min_impurity_split, estimators):
    best_accuracy=0
    best_model=None
    for estimate in estimators:
        for depth in max_depth:
            for impurity in min_impurity_split:
                model = model.fit(train[feats],train['outcome'].ravel())
                predictions = make_predictions(model, test_data, features)
                accuracy = (predictions['prediction']==test_data[target]).mean()
                if accuracy>best_accuracy:
                    print('depth:',depth,'impurity:',impurity,'trees:',int(estimate),'accuracy:',accuracy)
                    best_accuracy=accuracy
                    best_model=model
    return best_model

def model_building_data_generator():
    ##Uses TeamOverview
    toJoin = TeamOverview[['Season','Team','pointsFor','pointsAgainst',
                           'Wins','Losses','Awins','Hwins','Nwins','Hloss',
                           'Aloss','Nloss','BestWinWin%','BestWinSeed',
                           'AwinAvg','HwinAvg','NwinAvg','HlossAvg','AlossAvg','NlossAvg']]
    toJoin['BestWinSeed'].fillna(20, inplace=True)
    
    Acol =list(toJoin.columns)
    x = 0
    x = []
    for col in Acol:
        ocol = 'Away-'+col
        x.append(ocol)
    AtoJoin = toJoin.copy()
    AtoJoin.columns = x
    
    Hcol =list(toJoin.columns)
    y = 0
    y = []
    for col in Hcol:
        ocol = 'Home-'+col
        y.append(ocol)
    HtoJoin = toJoin.copy()
    HtoJoin.columns = y
    
    ##Uses TurnyResult dataset
    TurnyResult.reset_index(inplace = True)
    TurnyResult['index'] = TurnyResult['index'].apply(lambda x: x%2)
    
    WisHome = TurnyResult[TurnyResult['index']==0]
    LisHome = TurnyResult[TurnyResult['index']==1]
    WisHome['outcome'] = 'H'
    LisHome['outcome'] = 'A'
    WisHome.drop(['index','Daynum','Wscore','Lscore','Wloc','Numot'], inplace=True, axis=1)
    LisHome.drop(['index','Daynum','Wscore','Lscore','Wloc','Numot'], inplace=True, axis=1)
    WisHome.columns = ['Season','Hteam','Ateam','outcome']
    LisHome.columns = ['Season','Ateam','Hteam','outcome']
    
    NewTurnyResults = pd.concat([WisHome,LisHome])
    NewTurnyResults.columns = ['Away-Team','Home-Team','Season','outcome']
    NewTurnyResults = NewTurnyResults.merge(HtoJoin, left_on=['Season','Home-Team'],
                                            right_on=['Home-Season','Home-Team'],how='left' )
    NewTurnyResults = NewTurnyResults.merge(AtoJoin, left_on=['Season','Away-Team'], 
                                            right_on=['Away-Season','Away-Team'],how='left' )
    NewTurnyResults.drop(['Home-Season','Away-Season'], inplace=True, axis=1)
    
    train,test = create_train_test(NewTurnyResults, 0.9)
    
    ##Uses sampeSub
    idsplit = sampleSub['id'].str.split('_',expand=True)
    idsplit.columns = ['Season','predteam','oteam']
    Subtest= sampleSub.join(idsplit)
    
    Subtest['Season'] = pd.to_numeric(Subtest['Season'])
    Subtest['predteam'] = pd.to_numeric(Subtest['predteam'])
    Subtest['oteam'] = pd.to_numeric(Subtest['oteam'])
    
    Subtest = Subtest.merge(HtoJoin, left_on=['Season','oteam'], right_on=['Home-Season','Home-Team'],how='left' )
    Subtest = Subtest.merge(AtoJoin, left_on=['Season','predteam'], right_on=['Away-Season','Away-Team'],how='left' )
    Subtest.drop(['Home-Season','Away-Season'], inplace=True, axis=1)
    
    return train, test, Subtest
    

### Identify what games would be played in each round

In [4]:
feats = [
    'Home-pointsFor',
    'Home-pointsAgainst',
    'Home-Wins',
    'Home-Losses',
    'Home-Awins',
    'Home-Hwins',
    'Home-Nwins',
    'Home-Hloss',
    'Home-Aloss',
    'Home-Nloss',
    'Home-BestWinWin%',
    'Home-BestWinSeed',
    'Home-AwinAvg',
    'Home-HwinAvg',
    'Home-NwinAvg',

    'Away-pointsFor',
    'Away-pointsAgainst',
    'Away-Wins',
    'Away-Losses',
    'Away-Awins',
    'Away-Hwins',
    'Away-Nwins',
    'Away-Hloss',
    'Away-Aloss',
    'Away-Nloss',
    'Away-BestWinWin%',
    'Away-BestWinSeed',
    'Away-AwinAvg',
    'Away-HwinAvg',
    'Away-NwinAvg',

    'SeedDifH-A'
]

In [5]:
seed = TeamOverview[['Season','Team','SeedNum']]
seed.head()

Unnamed: 0,Season,Team,SeedNum
0,1985,1102,
1,1985,1103,
2,1985,1104,7.0
3,1985,1106,
4,1985,1108,


#### Build the data frame

In [6]:
train, test, modeldata = model_building_data_generator()

modeldata.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documen

Unnamed: 0,id,pred,Season,predteam,oteam,Home-Team,Home-pointsFor,Home-pointsAgainst,Home-Wins,Home-Losses,...,Away-Aloss,Away-Nloss,Away-BestWinWin%,Away-BestWinSeed,Away-AwinAvg,Away-HwinAvg,Away-NwinAvg,Away-HlossAvg,Away-AlossAvg,Away-NlossAvg
0,2013_1103_1107,0.5,2013,1103,1107,1107,2190,2050,24,10,...,4,1,0.848485,11.0,70.0,74.714,73.5,64.0,67.75,65.0
1,2013_1103_1112,0.5,2013,1103,1112,1112,2345,2038,25,7,...,4,1,0.848485,11.0,70.0,74.714,73.5,64.0,67.75,65.0
2,2013_1103_1125,0.5,2013,1103,1125,1125,2307,1939,24,6,...,4,1,0.848485,11.0,70.0,74.714,73.5,64.0,67.75,65.0
3,2013_1103_1129,0.5,2013,1103,1129,1129,2061,1927,19,10,...,4,1,0.848485,11.0,70.0,74.714,73.5,64.0,67.75,65.0
4,2013_1103_1137,0.5,2013,1103,1137,1137,2152,1846,27,5,...,4,1,0.848485,11.0,70.0,74.714,73.5,64.0,67.75,65.0


In [7]:
train = train.merge(seed, left_on=['Season','Home-Team'], right_on=['Season','Team'], how='left')\
.merge(seed, left_on=['Season','Away-Team'], right_on=['Season','Team'], how='left')
train['SeedDifH-A'] = train['SeedNum_x']-train['SeedNum_x']
train.drop(['Team_x','SeedNum_x','Team_y','SeedNum_y'], inplace=True, axis=1)

test = test.merge(seed, left_on=['Season','Home-Team'], right_on=['Season','Team'], how='left')\
.merge(seed, left_on=['Season','Away-Team'], right_on=['Season','Team'], how='left')
test['SeedDifH-A'] = test['SeedNum_x']-test['SeedNum_x']
test.drop(['Team_x','SeedNum_x','Team_y','SeedNum_y'], inplace=True, axis=1)

modeldata = modeldata.merge(seed, left_on=['Season','Home-Team'], right_on=['Season','Team'], how='left')\
.merge(seed, left_on=['Season','Away-Team'], right_on=['Season','Team'], how='left')
modeldata['SeedDifH-A'] = modeldata['SeedNum_x']-modeldata['SeedNum_x']
modeldata.drop(['Team_x','SeedNum_x','Team_y','SeedNum_y'], inplace=True, axis=1)


#train = , test, modeldata
#test.head()

In [8]:
depthset = np.linspace(3,17,15)
impurityset = np.linspace(.05,.20,10)
esti = np.linspace(100,700,7)
esti

array([ 100.,  200.,  300.,  400.,  500.,  600.,  700.])

In [9]:

bestmodel = find_best_model(train_data=train,test_data=test, 
                            features=feats, target='outcome', max_depth= depthset,
                            min_impurity_split = impurityset, estimators=esti)

depth: 3.0 impurity: 0.05 trees: 100 accuracy: 0.706896551724
depth: 3.0 impurity: 0.0666666666667 trees: 100 accuracy: 0.711206896552
depth: 6.0 impurity: 0.0666666666667 trees: 100 accuracy: 0.724137931034
depth: 9.0 impurity: 0.15 trees: 100 accuracy: 0.745689655172


In [10]:
prediction_prob = bestmodel.predict_proba(modeldata[feats])

In [11]:
prediction_prob

array([[ 0.65786519,  0.34213481],
       [ 0.44187893,  0.55812107],
       [ 0.50529313,  0.49470687],
       ..., 
       [ 0.1965955 ,  0.8034045 ],
       [ 0.53617468,  0.46382532],
       [ 0.8313417 ,  0.1686583 ]])

In [12]:
predict = pd.DataFrame(prediction_prob)
predict['id']=modeldata['id']
predict.drop(1,axis=1, inplace=True)
predict.columns = ['pred','id']
predict = predict[['id','pred']]
predict.head()

Unnamed: 0,id,pred
0,2013_1103_1107,0.657865
1,2013_1103_1112,0.441879
2,2013_1103_1125,0.505293
3,2013_1103_1129,0.699085
4,2013_1103_1137,0.609039


In [13]:
predict.to_csv('../submissions\\7sub.csv',index=False)

# Quick Run

In [14]:
'''model = ensemble.RandomForestClassifier( n_estimators=1000, max_depth=13,
                                            random_state=0, min_impurity_split=.11666666667)
model = model.fit(train[feats],train['outcome'].ravel())
prediction_prob = model.predict_proba(modeldata[feats])
predict = pd.DataFrame(prediction_prob)
predict['id']=modeldata['id']
predict.drop(1,axis=1, inplace=True)
predict.columns = ['pred','id']
predict = predict[['id','pred']]
predict.to_csv('../submissions\\Thirdsub.csv',index=False)'''

"model = ensemble.RandomForestClassifier( n_estimators=1000, max_depth=13,\n                                            random_state=0, min_impurity_split=.11666666667)\nmodel = model.fit(train[feats],train['outcome'].ravel())\nprediction_prob = model.predict_proba(modeldata[feats])\npredict = pd.DataFrame(prediction_prob)\npredict['id']=modeldata['id']\npredict.drop(1,axis=1, inplace=True)\npredict.columns = ['pred','id']\npredict = predict[['id','pred']]\npredict.to_csv('../submissions\\Thirdsub.csv',index=False)"

In [52]:
model = ensemble.RandomForestClassifier( n_estimators=1000,
                                            random_state=0)
model = model.fit(train[feats],train['outcome'].ravel())

In [53]:
prediction_prob = model.predict(test[feats])

In [54]:

(prediction_prob==test['outcome']).mean()

0.70258620689655171

In [55]:
prediction_prob = model.predict_proba(modeldata[feats])

In [56]:
predict = pd.DataFrame(prediction_prob)
predict['id']=modeldata['id']
predict.drop(1,axis=1, inplace=True)
predict.columns = ['pred','id']
predict = predict[['id','pred']]
predict.head()

Unnamed: 0,id,pred
0,2013_1103_1107,0.709
1,2013_1103_1112,0.437
2,2013_1103_1125,0.482
3,2013_1103_1129,0.709
4,2013_1103_1137,0.652


In [57]:
predict.to_csv('../submissions\\8sub.csv',index=False)

In [58]:
model = ensemble.RandomForestClassifier( n_estimators=10000,
                                            random_state=0)
model = model.fit(train[feats],train['outcome'].ravel())
prediction_prob = model.predict(test[feats])
print((prediction_prob==test['outcome']).mean())
prediction_prob = model.predict_proba(modeldata[feats])
predict = pd.DataFrame(prediction_prob)
predict['id']=modeldata['id']
predict.drop(1,axis=1, inplace=True)
predict.columns = ['pred','id']
predict = predict[['id','pred']]
print(predict.head())
predict.to_csv('../submissions\\9sub.csv',index=False)

0.693965517241
               id    pred
0  2013_1103_1107  0.7097
1  2013_1103_1112  0.4232
2  2013_1103_1125  0.4813
3  2013_1103_1129  0.7069
4  2013_1103_1137  0.6736


In [59]:
model = ensemble.RandomForestClassifier( n_estimators=20000,
                                            random_state=0)
model = model.fit(train[feats],train['outcome'].ravel())
prediction_prob = model.predict(test[feats])
print((prediction_prob==test['outcome']).mean())
prediction_prob = model.predict_proba(modeldata[feats])
predict = pd.DataFrame(prediction_prob)
predict['id']=modeldata['id']
predict.drop(1,axis=1, inplace=True)
predict.columns = ['pred','id']
predict = predict[['id','pred']]
print(predict.head())
predict.to_csv('../submissions\\10sub.csv',index=False)

0.702586206897
               id     pred
0  2013_1103_1107  0.71040
1  2013_1103_1112  0.41935
2  2013_1103_1125  0.47925
3  2013_1103_1129  0.70595
4  2013_1103_1137  0.67460


In [None]:
model = ensemble.RandomForestClassifier( n_estimators=100000,
                                            random_state=0, max_depth = 1)
model = model.fit(train[feats],train['outcome'].ravel())
prediction_prob = model.predict(test[feats])
print((prediction_prob==test['outcome']).mean())
prediction_prob = model.predict_proba(modeldata[feats])
predict = pd.DataFrame(prediction_prob)
predict['id']=modeldata['id']
predict.drop(1,axis=1, inplace=True)
predict.columns = ['pred','id']
predict = predict[['id','pred']]
print(predict.head())
predict.to_csv('../submissions\\12sub.csv',index=False)