# This Model is a random forest - Score= .451502

### Import Packages

In [19]:
import pandas as pd
import numpy as np
from sklearn import ensemble
from IPython.display import Image
import pydotplus

### Load the datasets

In [20]:
TeamOverview = pd.read_csv(filepath_or_buffer='../TeamOverview2.csv')
TurnyResult = pd.read_csv(filepath_or_buffer='../data\\TourneyCompactResults.csv')
sampleSub = pd.read_csv('../data\\sample_submission.csv')

### Define fuctions

In [21]:
def create_train_test(data,split):
    np.random.seed(0)
    msk = np.random.rand(len(data)) < split
    train = data[msk]
    test = data[~msk]
    return train,test

def random_forest(data, features, target,n_estimators, max_depth, min_impurity_split):
    model = ensemble.RandomForestClassifier( n_estimators=n_estimators, max_depth=max_depth,
                                            random_state=0, min_impurity_split=min_impurity_split)
    model = model.fit(data[features],data[target].ravel())
    return model

def make_predictions(model, data, features):
    data.loc[:,'prediction'] = model.predict(data[features])
    #dataout['prediction_prob'] = model.predict_proba((data[features]))
    return data

def find_best_model(train_data,test_data, features, target, max_depth, min_impurity_split, estimators):
    best_accuracy=0
    best_model=None
    for estimate in estimators:
        for depth in max_depth:
            for impurity in min_impurity_split:
                model = random_forest(data=train_data, features=features, target=target,
                                      n_estimators=int(estimate), max_depth= depth, min_impurity_split=impurity)
                predictions = make_predictions(model, test_data, features)
                accuracy = (predictions['prediction']==test_data[target]).mean()
                if accuracy>best_accuracy:
                    print('depth:',depth,'impurity:',impurity,'trees:',int(estimate),'accuracy:',accuracy)
                    best_accuracy=accuracy
                    best_model=model
    return best_model

def model_building_data_generator():
    ##Uses TeamOverview
    toJoin = TeamOverview[['Season','Team','pointsFor','pointsAgainst',
                           'Wins','Losses','Awins','Hwins','Nwins','Hloss',
                           'Aloss','Nloss','BestWinWin%','BestWinSeed',
                           'AwinAvg','HwinAvg','NwinAvg','HlossAvg','AlossAvg','NlossAvg']]
    toJoin['BestWinSeed'].fillna(20, inplace=True)
    
    Acol =list(toJoin.columns)
    x = 0
    x = []
    for col in Acol:
        ocol = 'Away-'+col
        x.append(ocol)
    AtoJoin = toJoin.copy()
    AtoJoin.columns = x
    
    Hcol =list(toJoin.columns)
    y = 0
    y = []
    for col in Hcol:
        ocol = 'Home-'+col
        y.append(ocol)
    HtoJoin = toJoin.copy()
    HtoJoin.columns = y
    
    ##Uses TurnyResult dataset
    TurnyResult.reset_index(inplace = True)
    TurnyResult['index'] = TurnyResult['index'].apply(lambda x: x%2)
    
    WisHome = TurnyResult[TurnyResult['index']==0]
    LisHome = TurnyResult[TurnyResult['index']==1]
    WisHome['outcome'] = 'H'
    LisHome['outcome'] = 'A'
    WisHome.drop(['index','Daynum','Wscore','Lscore','Wloc','Numot'], inplace=True, axis=1)
    LisHome.drop(['index','Daynum','Wscore','Lscore','Wloc','Numot'], inplace=True, axis=1)
    WisHome.columns = ['Season','Hteam','Ateam','outcome']
    LisHome.columns = ['Season','Ateam','Hteam','outcome']
    
    NewTurnyResults = pd.concat([WisHome,LisHome])
    NewTurnyResults.columns = ['Away-Team','Home-Team','Season','outcome']
    NewTurnyResults = NewTurnyResults.merge(HtoJoin, left_on=['Season','Home-Team'],
                                            right_on=['Home-Season','Home-Team'],how='left' )
    NewTurnyResults = NewTurnyResults.merge(AtoJoin, left_on=['Season','Away-Team'], 
                                            right_on=['Away-Season','Away-Team'],how='left' )
    NewTurnyResults.drop(['Home-Season','Away-Season'], inplace=True, axis=1)
    
    train,test = create_train_test(NewTurnyResults, 0.9)
    
    ##Uses sampeSub
    idsplit = sampleSub['id'].str.split('_',expand=True)
    idsplit.columns = ['Season','predteam','oteam']
    Subtest= sampleSub.join(idsplit)
    
    Subtest['Season'] = pd.to_numeric(Subtest['Season'])
    Subtest['predteam'] = pd.to_numeric(Subtest['predteam'])
    Subtest['oteam'] = pd.to_numeric(Subtest['oteam'])
    
    Subtest = Subtest.merge(HtoJoin, left_on=['Season','oteam'], right_on=['Home-Season','Home-Team'],how='left' )
    Subtest = Subtest.merge(AtoJoin, left_on=['Season','predteam'], right_on=['Away-Season','Away-Team'],how='left' )
    Subtest.drop(['Home-Season','Away-Season'], inplace=True, axis=1)
    
    return train, test, Subtest
    

### Identify what games would be played in each round

In [44]:
feats = [
    'Home-pointsFor',
    'Home-pointsAgainst',
    'Home-Wins',
    'Home-Losses',
    'Home-Awins',
    'Home-Hwins',
    'Home-Nwins',
    'Home-Hloss',
    'Home-Aloss',
    'Home-Nloss',
    'Home-BestWinWin%',
    'Home-BestWinSeed',
    'Home-AwinAvg',
    'Home-HwinAvg',
    'Home-NwinAvg',
#    'Home-HlossAvg',
#    'Home-AlossAvg',
#    'Home-NlossAvg',
    'Away-pointsFor',
    'Away-pointsAgainst',
    'Away-Wins',
    'Away-Losses',
    'Away-Awins',
    'Away-Hwins',
    'Away-Nwins',
    'Away-Hloss',
    'Away-Aloss',
    'Away-Nloss',
    'Away-BestWinWin%',
    'Away-BestWinSeed',
    'Away-AwinAvg',
    'Away-HwinAvg',
    'Away-NwinAvg',
#    'Away-HlossAvg',
#    'Away-AlossAvg',
#    'Away-NlossAvg'
]

#### Build the data frame

In [23]:
train, test, modeldata = model_building_data_generator()

modeldata.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documen

Unnamed: 0,id,pred,Season,predteam,oteam,Home-Team,Home-pointsFor,Home-pointsAgainst,Home-Wins,Home-Losses,...,Away-Aloss,Away-Nloss,Away-BestWinWin%,Away-BestWinSeed,Away-AwinAvg,Away-HwinAvg,Away-NwinAvg,Away-HlossAvg,Away-AlossAvg,Away-NlossAvg
0,2013_1103_1107,0.5,2013,1103,1107,1107,2190,2050,24,10,...,4,1,0.848485,11.0,70.0,74.714,73.5,64.0,67.75,65.0
1,2013_1103_1112,0.5,2013,1103,1112,1112,2345,2038,25,7,...,4,1,0.848485,11.0,70.0,74.714,73.5,64.0,67.75,65.0
2,2013_1103_1125,0.5,2013,1103,1125,1125,2307,1939,24,6,...,4,1,0.848485,11.0,70.0,74.714,73.5,64.0,67.75,65.0
3,2013_1103_1129,0.5,2013,1103,1129,1129,2061,1927,19,10,...,4,1,0.848485,11.0,70.0,74.714,73.5,64.0,67.75,65.0
4,2013_1103_1137,0.5,2013,1103,1137,1137,2152,1846,27,5,...,4,1,0.848485,11.0,70.0,74.714,73.5,64.0,67.75,65.0


In [6]:
test.head()

Unnamed: 0,Away-Team,Home-Team,Season,outcome,Home-pointsFor,Home-pointsAgainst,Home-Wins,Home-Losses,Home-Awins,Home-Hwins,...,Away-Aloss,Away-Nloss,Away-BestWinWin%,Away-BestWinSeed,Away-AwinAvg,Away-HwinAvg,Away-NwinAvg,Away-HlossAvg,Away-AlossAvg,Away-NlossAvg
8,1112,1104,1985,H,2055,1821,21,9,5,15,...,5,1,0.758621,6.0,70.429,70.182,70.429,60.667,55.2,73.0
13,1305,1301,1985,H,2146,1893,20,9,4,14,...,8,0,0.9,4.0,83.0,82.0,81.0,74.0,68.625,68.625
20,1433,1104,1985,H,2055,1821,21,9,5,15,...,4,0,0.733333,7.0,72.222,73.75,81.25,58.0,63.75,63.75
27,1246,1385,1985,H,2284,1932,27,3,7,12,...,8,1,0.766667,3.0,62.5,67.273,92.0,62.0,64.25,55.0
38,1239,1396,1986,H,1992,1682,24,5,12,10,...,6,0,0.740741,8.0,68.833,72.3,63.333,61.667,61.5,61.5


In [24]:
depthset = np.linspace(3,17,15)
impurityset = np.linspace(.05,.20,10)
esti = np.linspace(100,700,7)
esti

array([ 100.,  200.,  300.,  400.,  500.,  600.,  700.])

In [45]:

bestmodel = find_best_model(train_data=train,test_data=test, 
                            features=feats, target='outcome', max_depth= depthset,
                            min_impurity_split = impurityset, estimators=esti)

depth: 3.0 impurity: 0.05 trees: 100 accuracy: 0.719827586207
depth: 3.0 impurity: 0.1 trees: 100 accuracy: 0.724137931034
depth: 4.0 impurity: 0.05 trees: 100 accuracy: 0.728448275862
depth: 6.0 impurity: 0.1 trees: 100 accuracy: 0.737068965517


In [None]:
prediction_prob = bestmodel.predict_proba(modeldata[feats])

In [None]:
prediction_prob

In [None]:
predict = pd.DataFrame(prediction_prob)
predict['id']=modeldata['id']
predict.drop(1,axis=1, inplace=True)
predict.columns = ['pred','id']
predict = predict[['id','pred']]
predict.head()

In [None]:
predict.to_csv('../submissions\\Fithsub.csv',index=False)

# Quick Run

In [None]:
'''model = ensemble.RandomForestClassifier( n_estimators=1000, max_depth=13,
                                            random_state=0, min_impurity_split=.11666666667)
model = model.fit(train[feats],train['outcome'].ravel())
prediction_prob = model.predict_proba(modeldata[feats])
predict = pd.DataFrame(prediction_prob)
predict['id']=modeldata['id']
predict.drop(1,axis=1, inplace=True)
predict.columns = ['pred','id']
predict = predict[['id','pred']]
predict.to_csv('../submissions\\Thirdsub.csv',index=False)'''

In [18]:
model = ensemble.RandomForestClassifier( n_estimators=100, max_depth=5,
                                        random_state=0, min_impurity_split=.2)
model = model.fit(train[feats],train['outcome'].ravel())

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').