# This Model is a random forest - Score= .451502

### Import Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble
from IPython.display import Image
import pydotplus

### Load the datasets

In [2]:
TeamOverview = pd.read_csv(filepath_or_buffer='../TeamOverview.csv')
TurnyResult = pd.read_csv(filepath_or_buffer='../data\\TourneyCompactResults.csv')
sampleSub = pd.read_csv('../data\\sample_submission.csv')

### Define fuctions

In [24]:
def create_train_test(data,split):
    np.random.seed(0)
    msk = np.random.rand(len(data)) < split
    train = data[msk]
    test = data[~msk]
    return train,test

def random_forest(data, features, target,n_estimators, max_depth, min_impurity_split):
    model = ensemble.RandomForestClassifier( n_estimators=n_estimators, max_depth=max_depth,
                                            random_state=0, min_impurity_split=min_impurity_split)
    model = model.fit(data[features],data[target].ravel())
    return model

def make_predictions(model, data, features):
    data.loc[:,'prediction'] = model.predict(data[features])
    #dataout['prediction_prob'] = model.predict_proba((data[features]))
    return data

def find_best_model(train_data,test_data, features, target, max_depth, min_impurity_split):
    best_accuracy=0
    best_model=None
    for depth in max_depth:
        for impurity in min_impurity_split:
            model = random_forest(data=train_data, features=features, target=target,
                                  n_estimators=1000, max_depth= depth, min_impurity_split=impurity)
            predictions = make_predictions(model, test_data, features)
            accuracy = (predictions['prediction']==test_data[target]).mean()
            if accuracy>best_accuracy:
                print('depth:',depth,'impurity:',impurity,'accuracy:',accuracy)
                best_accuracy=accuracy
                best_model=model
    return best_model

def model_building_data_generator():
    ##Uses TeamOverview
    toJoin = TeamOverview[['Season','Team','pointsFor','pointsAgainst',
                       'Wins','Losses','Awins','Hwins','Nwins','Hloss',
                       'Aloss','Nloss','BestWinWin%','BestWinSeed']]
    toJoin['BestWinSeed'].fillna(20, inplace=True)
    
    Acol =list(toJoin.columns)
    x = 0
    x = []
    for col in Acol:
        ocol = 'Away-'+col
        x.append(ocol)
    AtoJoin = toJoin.copy()
    AtoJoin.columns = x
    
    Hcol =list(toJoin.columns)
    y = 0
    y = []
    for col in Hcol:
        ocol = 'Home-'+col
        y.append(ocol)
    HtoJoin = toJoin.copy()
    HtoJoin.columns = y
    
    ##Uses TurnyResult dataset
    TurnyResult.reset_index(inplace = True)
    TurnyResult['index'] = TurnyResult['index'].apply(lambda x: x%2)
    
    WisHome = TurnyResult[TurnyResult['index']==0]
    LisHome = TurnyResult[TurnyResult['index']==1]
    WisHome['outcome'] = 'H'
    LisHome['outcome'] = 'A'
    WisHome.drop(['index','Daynum','Wscore','Lscore','Wloc','Numot'], inplace=True, axis=1)
    LisHome.drop(['index','Daynum','Wscore','Lscore','Wloc','Numot'], inplace=True, axis=1)
    WisHome.columns = ['Season','Hteam','Ateam','outcome']
    LisHome.columns = ['Season','Ateam','Hteam','outcome']
    
    NewTurnyResults = pd.concat([WisHome,LisHome])
    NewTurnyResults.columns = ['Away-Team','Home-Team','Season','outcome']
    NewTurnyResults = NewTurnyResults.merge(HtoJoin, left_on=['Season','Home-Team'],
                                            right_on=['Home-Season','Home-Team'],how='left' )
    NewTurnyResults = NewTurnyResults.merge(AtoJoin, left_on=['Season','Away-Team'], 
                                            right_on=['Away-Season','Away-Team'],how='left' )
    NewTurnyResults.drop(['Home-Season','Away-Season'], inplace=True, axis=1)
    
    train,test = create_train_test(NewTurnyResults, 0.9)
    
    ##Uses sampeSub
    idsplit = sampleSub['id'].str.split('_',expand=True)
    idsplit.columns = ['Season','predteam','oteam']
    Subtest= sampleSub.join(idsplit)
    
    Subtest['Season'] = pd.to_numeric(Subtest['Season'])
    Subtest['predteam'] = pd.to_numeric(Subtest['predteam'])
    Subtest['oteam'] = pd.to_numeric(Subtest['oteam'])
    
    Subtest = Subtest.merge(HtoJoin, left_on=['Season','oteam'], right_on=['Home-Season','Home-Team'],how='left' )
    Subtest = Subtest.merge(AtoJoin, left_on=['Season','predteam'], right_on=['Away-Season','Away-Team'],how='left' )
    Subtest.drop(['Home-Season','Away-Season'], inplace=True, axis=1)
    
    return train, test, Subtest
    

### Identify what games would be played in each round

In [4]:
feats = [
    'Home-pointsFor',
    'Home-pointsAgainst',
    'Home-Wins',
    'Home-Losses',
    'Home-Awins',
    'Home-Hwins',
    'Home-Nwins',
    'Home-Hloss',
    'Home-Aloss',
    'Home-Nloss',
    'Home-BestWinWin%',
    'Home-BestWinSeed',
    'Away-pointsFor',
    'Away-pointsAgainst',
    'Away-Wins',
    'Away-Losses',
    'Away-Awins',
    'Away-Hwins',
    'Away-Nwins',
    'Away-Hloss',
    'Away-Aloss',
    'Away-Nloss',
    'Away-BestWinWin%',
    'Away-BestWinSeed']

#### Build the data frame

In [5]:
train, test, modeldata = model_building_data_generator()

modeldata.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documen

Unnamed: 0,id,pred,Season,predteam,oteam,Home-Team,Home-pointsFor,Home-pointsAgainst,Home-Wins,Home-Losses,...,Away-Wins,Away-Losses,Away-Awins,Away-Hwins,Away-Nwins,Away-Hloss,Away-Aloss,Away-Nloss,Away-BestWinWin%,Away-BestWinSeed
0,2013_1103_1107,0.5,2013,1103,1107,1107,2190,2050,24,10,...,25,6,7,14,4,1,4,1,0.848485,11.0
1,2013_1103_1112,0.5,2013,1103,1112,1112,2345,2038,25,7,...,25,6,7,14,4,1,4,1,0.848485,11.0
2,2013_1103_1125,0.5,2013,1103,1125,1125,2307,1939,24,6,...,25,6,7,14,4,1,4,1,0.848485,11.0
3,2013_1103_1129,0.5,2013,1103,1129,1129,2061,1927,19,10,...,25,6,7,14,4,1,4,1,0.848485,11.0
4,2013_1103_1137,0.5,2013,1103,1137,1137,2152,1846,27,5,...,25,6,7,14,4,1,4,1,0.848485,11.0


In [51]:
depthset = np.linspace(3,17,15)
impurityset = np.linspace(.05,.20,10)
impurityset

array([ 0.05      ,  0.06666667,  0.08333333,  0.1       ,  0.11666667,
        0.13333333,  0.15      ,  0.16666667,  0.18333333,  0.2       ])

In [25]:

bestmodel = find_best_model(train_data=train,test_data=test, 
                            features=feats, target='outcome', max_depth= depthset,
                            min_impurity_split = impurityset)

depth: 3.0 impurity: 0.05 accuracy: 0.715517241379
depth: 4.0 impurity: 0.1 accuracy: 0.719827586207
depth: 5.0 impurity: 0.05 accuracy: 0.724137931034
depth: 5.0 impurity: 0.0666666666667 accuracy: 0.728448275862
depth: 5.0 impurity: 0.0833333333333 accuracy: 0.73275862069
depth: 10.0 impurity: 0.133333333333 accuracy: 0.737068965517
depth: 13.0 impurity: 0.116666666667 accuracy: 0.741379310345


In [38]:
prediction_prob = bestmodel.predict_proba(modeldata[feats])

In [39]:
prediction_prob

array([[ 0.66771186,  0.33228814],
       [ 0.41447419,  0.58552581],
       [ 0.47349449,  0.52650551],
       ..., 
       [ 0.22426261,  0.77573739],
       [ 0.49682442,  0.50317558],
       [ 0.84826993,  0.15173007]])

In [49]:
predict = pd.DataFrame(prediction_prob)
predict['id']=modeldata['id']
predict.drop(1,axis=1, inplace=True)
predict.columns = ['pred','id']
predict = predict[['id','pred']]
predict.head()

Unnamed: 0,id,pred
0,2013_1103_1107,0.667712
1,2013_1103_1112,0.414474
2,2013_1103_1125,0.473494
3,2013_1103_1129,0.677751
4,2013_1103_1137,0.60147


In [50]:
predict.to_csv('../submissions\\Thirdsub.csv',index=False)

# Quick Run

In [55]:
model = ensemble.RandomForestClassifier( n_estimators=1000, max_depth=13,
                                            random_state=0, min_impurity_split=.11666666667)
model = model.fit(train[feats],train['outcome'].ravel())
prediction_prob = model.predict_proba(modeldata[feats])
predict = pd.DataFrame(prediction_prob)
predict['id']=modeldata['id']
predict.drop(1,axis=1, inplace=True)
predict.columns = ['pred','id']
predict = predict[['id','pred']]
predict.to_csv('../submissions\\Thirdsub.csv',index=False)

In [56]:
predict.head()

Unnamed: 0,id,pred
0,2013_1103_1107,0.667712
1,2013_1103_1112,0.414474
2,2013_1103_1125,0.473494
3,2013_1103_1129,0.677751
4,2013_1103_1137,0.60147
