# Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import Imputer
from sklearn import preprocessing

# To split the dataset into train and test datasets
from sklearn.cross_validation import train_test_split

# To model the Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score
import re

In [2]:
# Directory that holds the competition CSVs; change it here to relocate the data.
INPUT_DIR = '../input'


def _load_csv(file_name):
    """Read one CSV file from INPUT_DIR and return it as a DataFrame."""
    return pd.read_csv(os.path.join(INPUT_DIR, file_name))


# Massey ordinals
massey = _load_csv('MasseyOrdinals.csv')

# Remaining input files
cities = _load_csv('Cities.csv')
conferences = _load_csv('Conferences.csv')
conf_tourney = _load_csv('ConferenceTourneyGames.csv')
game_cities = _load_csv('GameCities.csv')
tourney_results = _load_csv('NCAATourneyCompactResults.csv')
detailed_tourney_results = _load_csv('NCAATourneyDetailedResults.csv')
tourney_all = _load_csv('NCAATourneySeedRoundSlots.csv')
tourney_seeds = _load_csv('NCAATourneySeeds.csv')
tourney_slots = _load_csv('NCAATourneySlots.csv')
reg_season = _load_csv('RegularSeasonCompactResults.csv')
detailed_reg_season = _load_csv('RegularSeasonDetailedResults.csv')
seasons = _load_csv('Seasons.csv')
second_tourney = _load_csv('SecondaryTourneyCompactResults.csv')
# NOTE(review): despite its name this frame is read from SecondaryTourneyTeams.csv -- confirm that is intended.
detailed_second_tourney = _load_csv('SecondaryTourneyTeams.csv')
team_coaches = _load_csv('TeamCoaches.csv')
team_conferences = _load_csv('TeamConferences.csv')
teams = _load_csv('Teams.csv')
# Same content as `teams`; an independent copy avoids parsing Teams.csv a second time.
teams_description = teams.copy()

In [3]:
# Non-null count of every column, per season, in the detailed regular-season results
(
    detailed_reg_season
    .groupby('Season')
    .count()
)

## Use the best Elo parameters from the prior grid search: K=45, home advantage (HA)=10

In [4]:
# Elo hyper-parameters: K scales every rating update, HOME_ADVANTAGE is the
# rating bonus given to a team playing at home.
K = 45
HOME_ADVANTAGE = 10

# Read the regular-season results once. `results_data` and `rs` are two names
# for the same frame (as before), so columns added to `rs` show up in both.
results_data = pd.read_csv("../input/RegularSeasonCompactResults.csv")
rs = results_data

# Every team that ever appears as a winner or a loser starts at a rating of 1500.
team_ids = set(rs.WTeamID).union(set(rs.LTeamID))
elo_dict = dict.fromkeys(team_ids, 1500)

# Winning margin of each game (winner's score minus loser's score)
rs['margin'] = rs.WScore - rs.LScore
def elo_pred(elo1, elo2):
    """Return the logistic Elo expectation for the side rated `elo1` against `elo2`.

    The value is 1 / (10 ** (-(elo1 - elo2) / 400) + 1): 0.5 for equal
    ratings, moving toward 1 as `elo1` exceeds `elo2`.
    """
    rating_gap = elo1 - elo2
    odds_against = 10. ** (-rating_gap / 400.)
    return 1. / (odds_against + 1.)

def expected_margin(elo_diff):
    """Return the linear margin term 7.5 + 0.006 * elo_diff used to scale Elo updates."""
    intercept = 7.5
    slope = 0.006
    return intercept + slope * elo_diff

def elo_update(w_elo, l_elo, margin):
    """Compute the pre-game win expectation and the rating change for one game.

    Parameters
    ----------
    w_elo, l_elo : ratings of the winner and the loser as passed by the caller.
    margin : the winner's score minus the loser's score.

    Returns
    -------
    (pred, update) : `pred` is elo_pred(w_elo, l_elo); `update` is
    K * mult * (1 - pred), where mult = (margin + 3) ** 0.8 divided by
    expected_margin(w_elo - l_elo).
    """
    win_prob = elo_pred(w_elo, l_elo)
    margin_multiplier = ((margin + 3.) ** 0.8) / expected_margin(w_elo - l_elo)
    return (win_prob, K * margin_multiplier * (1 - win_prob))
# The per-game results are collected in lists and attached as columns below,
# so the frame has to carry a plain 0..n-1 index.
assert np.all(rs.index.values == np.array(range(rs.shape[0]))), "Index is out of order."
preds = []
w_elo = []
l_elo = []

# Rate the games in file order, recording the prediction and post-game ratings.
for winner, loser, game_margin, site in zip(rs.WTeamID, rs.LTeamID, rs.margin, rs.WLoc):

    # Home-court bonus: "H" favours the winner, "A" favours the loser,
    # any other location code gives no bonus to either side.
    winner_bonus = HOME_ADVANTAGE if site == "H" else 0.
    loser_bonus = HOME_ADVANTAGE if site == "A" else 0.

    # The bonus only affects the prediction/update size, not the stored ratings
    win_prob, delta = elo_update(elo_dict[winner] + winner_bonus,
                                 elo_dict[loser] + loser_bonus,
                                 game_margin)
    elo_dict[winner] += delta
    elo_dict[loser] -= delta

    # Keep the prediction and both teams' ratings after this game
    preds.append(win_prob)
    w_elo.append(elo_dict[winner])
    l_elo.append(elo_dict[loser])

rs['w_elo'] = w_elo
rs['l_elo'] = l_elo
rs['elo_dif'] = rs['w_elo'] - rs['l_elo']

# Correlation between the post-game Elo difference and the winning margin
print(rs[['elo_dif', 'margin']].corr().loc['elo_dif', 'margin'])


In [5]:
# Post-game Elo difference against winning margin for every game; the very low
# alpha lets overlapping points build up into a density view.
fig, ax = plt.subplots()
ax.scatter(rs['elo_dif'], rs['margin'], alpha=.005)
ax.set_xlabel('Post-game Elo difference (winner - loser)')
ax.set_ylabel('Winning margin (points)')
ax.set_title('Elo difference vs. winning margin');

## Create table

In [6]:
# One row per (team, game): the winner's and the loser's post-game Elo, stacked
# under common column names.
rs_win = rs[['Season', 'DayNum', 'WTeamID', 'w_elo']].rename(columns={'WTeamID': 'TeamID', 'w_elo': 'elo'})
rs_lose = rs[['Season', 'DayNum', 'LTeamID', 'l_elo']].rename(columns={'LTeamID': 'TeamID', 'l_elo': 'elo'})
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
rs_all = pd.concat([rs_win, rs_lose])

# Final regular-season Elo = the row with the highest DayNum for each
# (TeamID, Season). One sort plus drop_duplicates replaces the nested loop that
# re-filtered the whole frame and appended one row per team and season; it also
# keeps TeamID/Season/DayNum as integers instead of upcasting them to float.
final_elo = (
    rs_all
    .sort_values(['TeamID', 'Season', 'DayNum'])
    .drop_duplicates(subset=['TeamID', 'Season'], keep='last')
    .reset_index(drop=True)
)

final_elo.head(2)

## Put together Data

In [7]:
# Reload the seeds from disk, replacing the frame read earlier in the notebook
tourney_seeds = pd.read_csv(filepath_or_buffer='../input/NCAATourneySeeds.csv')

In [8]:
def removeletters(x):
    """Return the string `x` with every character outside 0-9 removed."""
    non_digit = re.compile('[^0-9]')
    return non_digit.sub('', x)

In [9]:
# Keep only the digits of each seed and store them as integers. Casting to str
# first makes the cell safe to re-run: once Seed is already int, passing it
# straight to the regex helper would raise a TypeError.
tourney_seeds['Seed'] = tourney_seeds['Seed'].astype(str).apply(removeletters).apply(int)

In [10]:
# Left join: every seeded team is paired with the tournament games it won that
# season; a team with no matching win keeps one row with missing game columns.
df1 = tourney_seeds.merge(
    tourney_results,
    how='left',
    left_on=['TeamID', 'Season'],
    right_on=['WTeamID', 'Season'],
)

In [11]:
# TeamID1 is the winner and TeamID2 the loser of each joined game.
# NOTE(review): Win compares TeamID1 with the column it was copied from, so it
# is 1 wherever WTeamID is present and 0 only where it is missing -- confirm
# this is the intended definition.
df1 = (
    df1
    .assign(TeamID1=lambda games: games['WTeamID'],
            TeamID2=lambda games: games['LTeamID'])
    .assign(Win=lambda games: np.where(games['TeamID1'] == games['WTeamID'], 1, 0))
    .rename(columns={'Seed': 'TeamID1Seed'})
)

In [12]:
# Right join on the loser so the loser's seed is attached to each game; seeded
# teams with no matching loss keep a row with missing game columns.
# `drop(columns=...)` replaces the positional axis argument (`drop(labels, 1)`),
# which raises a TypeError from pandas 2.0 on.
df2 = (
    df1
    .merge(tourney_seeds, left_on=['LTeamID', 'Season'], right_on=['TeamID', 'Season'], how='right')
    .rename(columns={'Seed': 'TeamID2Seed', 'Win': 'Team1Win'})
    .drop(columns=['TeamID_x', 'TeamID_y', 'NumOT', 'WLoc', 'LTeamID', 'WTeamID'])
)

In [13]:
# Replace the two score columns with the winning margin.
# `drop(columns=...)` is used because the positional axis argument
# (`drop(labels, 1)`) raises a TypeError from pandas 2.0 on.
df2 = (
    df2
    .assign(Margin=df2['WScore'] - df2['LScore'])
    .drop(columns=['WScore', 'LScore'])
)

In [14]:
# Loser's view of every game: the loser becomes TeamID, the winner OppTeamID, Win is 0
lose_df = (
    df2[['TeamID2', 'TeamID1', 'TeamID2Seed', 'TeamID1Seed', 'Season']]
    .rename(columns={'TeamID2': 'TeamID', 'TeamID1': 'OppTeamID',
                     'TeamID2Seed': 'Seed', 'TeamID1Seed': 'OppSeed'})
    .assign(Win=0)
)

In [15]:
# Winner's view of every game: the winner becomes TeamID, the loser OppTeamID, Win is 1.
# Win is added with assign on the renamed frame rather than by item assignment
# on a column slice of df2, which raised SettingWithCopyWarning.
win_df = (
    df2[['TeamID1', 'TeamID2', 'TeamID1Seed', 'TeamID2Seed', 'Season']]
    .rename(columns={'TeamID1': 'TeamID', 'TeamID2': 'OppTeamID',
                     'TeamID1Seed': 'Seed', 'TeamID2Seed': 'OppSeed'})
    .assign(Win=1)
)

In [16]:
# Stack both perspectives, so each game contributes one Win=1 row and one Win=0 row.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
all_tourney_df = pd.concat([win_df, lose_df])

In [17]:
# Attach the opponent's end-of-season Elo as `Oppelo`.
# `drop(columns=...)` replaces the positional axis argument (`drop(labels, 1)`),
# which raises a TypeError from pandas 2.0 on.
all_tourney_df = (
    all_tourney_df
    .merge(final_elo, left_on=['OppTeamID', 'Season'], right_on=['TeamID', 'Season'], how='left')
    .drop(columns=['TeamID_y'])
    .rename(columns={'TeamID_x': 'TeamID', 'elo': 'Oppelo'})
)

In [18]:
# Preview the first five rows after the opponent-Elo join
all_tourney_df.head(n=5)

In [19]:
# Attach the team's own end-of-season Elo (column `elo`)
all_tourney_df = all_tourney_df.merge(
    final_elo,
    how='left',
    on=['TeamID', 'Season'],
)

In [20]:
# The DayNum columns brought in by the two Elo joins are no longer needed.
# `drop(columns=...)` replaces the positional axis argument (`drop(labels, 1)`),
# which raises a TypeError from pandas 2.0 on.
all_tourney_df = all_tourney_df.drop(columns=['DayNum_y', 'DayNum_x'])

In [21]:
# Keep only rows where both the team id and the opponent's seed are present
all_tourney_df = all_tourney_df[
    all_tourney_df['TeamID'].notnull() & all_tourney_df['OppSeed'].notnull()
]

In [22]:
# Team-minus-opponent differences in Elo and in seed. assign() returns a new
# frame, which avoids the SettingWithCopyWarning raised by item assignment on
# the boolean-filtered frame.
all_tourney_df = all_tourney_df.assign(
    elodif=all_tourney_df['elo'] - all_tourney_df['Oppelo'],
    seeddif=all_tourney_df['Seed'] - all_tourney_df['OppSeed'],
)

In [23]:
# Model inputs and label.
# NOTE(review): the raw ids (TeamID, OppTeamID) and Season are used as numeric
# features while the computed `seeddif` is not -- confirm this is intended.
features = all_tourney_df.loc[:, ['TeamID', 'OppSeed', 'Seed', 'Season', 'OppTeamID', 'elodif']]
target = all_tourney_df.loc[:, 'Win']

In [24]:
# Fixed seed so the 70/30 split, and every score derived from it, is reproducible.
# NOTE(review): each game is present twice (winner's and loser's perspective), so
# a random row split can put the two mirror images of one game on opposite sides
# of the split -- consider splitting by game or by season.
SPLIT_SEED = 42
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.3, random_state=SPLIT_SEED)

# Gaussian Naive Bayes baseline
clf = GaussianNB()
clf.fit(features_train, target_train)
target_pred = clf.predict(features_test)

# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print(accuracy_score(target_test, target_pred, normalize=True))

# Import confusion matrix functionality
from sklearn.metrics import confusion_matrix as sk_confusion_matrix

# Create and format a confusion matrix
def conf_matrix(y_test, y_predict):
    """Return the confusion matrix of `y_test` against `y_predict` as a DataFrame.

    The raw matrix from `sk_confusion_matrix` is wrapped in a DataFrame whose
    index is named 'Actual label' and whose columns are named 'Predicted label'.
    """
    table = pd.DataFrame(data=sk_confusion_matrix(y_test, y_predict))
    table.index.name = 'Actual label'
    table.columns.name = 'Predicted label'
    return table

# Confusion matrix of the held-out predictions
conf_matrix(y_test=target_test, y_predict=target_pred)

In [25]:
from sklearn import linear_model, datasets

In [26]:
# Logistic regression on the same split; `clf` now refers to this model and is
# the one used for the 2018 probabilities further down.
clf = linear_model.LogisticRegression().fit(features_train, target_train)
target_pred = clf.predict(features_test)

# Share of held-out rows predicted correctly
accuracy_score(y_true=target_test, y_pred=target_pred, normalize=True)

In [27]:
from sklearn import metrics
# Per-class precision / recall / F1 of the latest predictions on the held-out rows
print(metrics.classification_report(y_true=target_test, y_pred=target_pred))

# Creating the 2018 Predictions

In [28]:
# Seeds of the 2018 tournament field only
df2018 = tourney_seeds.loc[tourney_seeds['Season'] == 2018]

In [29]:
# Distinct team ids in the 2018 field, in order of first appearance
teams2018 = pd.unique(df2018['TeamID'])

In [30]:
# Every ordered pair of distinct 2018 teams (team, opponent). The frame is built
# in one step from a list of pairs: growing it row by row relied on
# DataFrame.append (removed in pandas 2.0) and copied the frame on every append.
matchups = pd.DataFrame(
    [(team, team2) for team in teams2018 for team2 in teams2018 if team != team2],
    columns=['TeamID', 'OppTeamID'],
)
# Keep the 1-based row labels the row-by-row construction produced
matchups.index = range(1, len(matchups) + 1)

In [31]:
# End-of-season Elo for 2018 only, reduced to the id and the rating
final_elo2018 = final_elo.loc[final_elo['Season'] == 2018, ['TeamID', 'elo']]

In [32]:
# Attach the opponent's 2018 Elo as `Oppelo`
matchups = (
    matchups
    .merge(final_elo2018, left_on=['OppTeamID'], right_on=['TeamID'])
    .drop(labels=['TeamID_y'], axis=1)
    .rename(columns={'TeamID_x': 'TeamID'})
    .rename(columns={'elo': 'Oppelo'})
)

In [33]:
# Attach the team's own 2018 Elo, then the opponent's seed (as `OppSeed`).
# `drop(columns=...)` replaces the positional axis argument (`drop(labels, 1)`),
# which raises a TypeError from pandas 2.0 on.
matchups = (
    matchups
    .merge(final_elo2018, left_on=['TeamID'], right_on=['TeamID'])
    .merge(df2018, left_on=['OppTeamID'], right_on=['TeamID'])
    .rename(columns={'TeamID_x': 'TeamID', 'Seed': 'OppSeed'})
    .drop(columns=['TeamID_y'])
)

In [34]:
# Preview the first two matchup rows
matchups.head(n=2)

In [35]:
# Attach the team's own seed and collapse the duplicated Season column.
# `drop(columns=...)` replaces the positional axis argument (`drop(labels, 1)`),
# which raises a TypeError from pandas 2.0 on.
matchups = (
    matchups
    .merge(df2018, left_on=['TeamID'], right_on=['TeamID'])
    .rename(columns={'Season_x': 'Season'})
    .drop(columns=['Season_y'])
)
# Team-minus-opponent differences, matching the training features
matchups['elodif'] = matchups['elo'] - matchups['Oppelo']
matchups['seeddif'] = matchups['Seed'] - matchups['OppSeed']

In [36]:
# Same columns, in the same order, as the training features
features = matchups.loc[:, ['TeamID', 'OppSeed', 'Seed', 'Season', 'OppTeamID', 'elodif']]

In [37]:
# Class probabilities for every 2018 matchup from the most recently fitted `clf`
probabilities = clf.predict_proba(X=features)

In [38]:
# Second entry of every probability row
# (presumably the probability of class Win=1 -- verify against clf.classes_)
predictions = [class_probs[1] for class_probs in probabilities]

In [39]:
# Store the per-matchup probability next to the matchup it belongs to
matchups = matchups.assign(prediction=predictions)

In [40]:
# Keep only the first two columns of the team table
teams_description = teams_description[teams_description.columns[:2]]

In [41]:
# Look up the opponent's name; the team's own id is still called TeamID_x after this join
output = (
    matchups
    .merge(teams_description, left_on=['OppTeamID'], right_on=['TeamID'])
    .rename(columns={'TeamName': 'OppTeamName'})
    .drop(labels=['TeamID_y'], axis=1)
)

In [49]:
# Look up the team's own name via its id column (TeamID_x)
output = output.merge(
    teams_description,
    left_on=['TeamID_x'],
    right_on=['TeamID'],
)

In [51]:
# Rename TeamID_x back to TeamID. The preceding join also left the lookup
# table's own TeamID key in the frame, so it is dropped first; renaming alone
# would produce two columns both called TeamID. The guard keeps the cell safe
# to re-run once TeamID_x is gone.
if 'TeamID_x' in output.columns:
    output = (
        output
        .drop(columns=['TeamID'], errors='ignore')
        .rename(columns={'TeamID_x': 'TeamID'})
    )

In [53]:
# Human-readable columns only, in presentation order
output = output.loc[:, ['TeamName', 'OppTeamName', 'prediction', 'Seed', 'OppSeed', 'elo', 'Oppelo']]

In [None]:
# Write the readable table to disk (row index included)
output.to_csv(path_or_buf='descriptive_output.csv')

In [None]:
# Column means per team name, ordered by mean prediction across all of that team's matchups, highest first
(
    matchups
    .merge(teams_description, left_on=['TeamID'], right_on=['TeamID'])
    .groupby('TeamName')
    .mean()
    .sort_values('prediction', ascending=False)
)

In [None]:
# Keep one row per pair: the one where TeamID is the lower of the two ids
matchups_for_submission = matchups.loc[matchups['TeamID'] < matchups['OppTeamID']]

In [None]:
# Build the submission id ("2018_<TeamID>_<OppTeamID>") and copy the probability
# into `pred`. assign() returns a new frame, which avoids the
# SettingWithCopyWarning raised by item assignment on a filtered slice of `matchups`.
matchups_for_submission = matchups_for_submission.assign(
    ID='2018_' + matchups_for_submission.TeamID.map(str) + "_" + matchups_for_submission.OppTeamID.map(str),
    pred=matchups_for_submission['prediction'],
)

### Grab sample to join on ids

In [None]:
# Load the stage-2 sample submission and discard its Pred column
sample = pd.read_csv('../input/SampleSubmissionStage2.csv')
sample = sample.drop(columns=['Pred'])

In [None]:
# (rows, columns) of the sample submission after Pred was removed
sample.shape

### Limit to fields we care about

In [None]:
# Only the id and the probability go into the submission
matchups_for_submission = matchups_for_submission.loc[:, ['ID', 'pred']]

In [None]:
# (rows, columns) of the submission frame
matchups_for_submission.shape

In [None]:
# Average predicted probability over all submitted matchups
matchups_for_submission.loc[:, 'pred'].mean()

In [None]:
# Write the submission file without the row index
matchups_for_submission.to_csv('submission.csv', index=False)

The file you submit will depend on whether the competition is in stage 1 (historical model building) or stage 2 (the 2018 tournament). Sample submission files will be provided for both stages. The format is a list of every possible matchup between the tournament teams. Since team1 vs. team2 is the same as team2 vs. team1, we only include the game pairs where team1 has the lower team id. For example, in a tournament of 68 teams (64 + 4 play-in teams), you will predict (68*67)/2 = 2278 matchups.

Each game has a unique id created by concatenating the season in which the game was played, the team1 id, and the team2 id. For example, "2013_1104_1129" indicates team 1104 played team 1129 in the year 2013. You must predict the probability that the team with the lower id beats the team with the higher id.

The resulting submission format looks like the following, where "pred" represents the predicted probability that the first team will win: