In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
from matplotlib import pyplot

from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

from xgboost import XGBClassifier
from xgboost import plot_importance

In [2]:
# Read the three input tables: tournament seeds, tournament results and
# regular-season results (in that order).
df_seeds, df_matches_NCAA, df_matches_reg = (
    pd.read_csv(csv_name)
    for csv_name in (
        'NCAATourneySeeds.csv',
        'NCAATourneyCompactResults.csv',
        'RegularSeasonCompactResults.csv',
    )
)

# The matches used downstream can be either both result tables concatenated:
#df_matches = pd.concat([df_matches_NCAA, df_matches_reg])
# ...or, as chosen here, the tournament results only.
df_matches = df_matches_NCAA

In [3]:
# Print the (rows, columns) shape of both result tables
for results_frame in (df_matches_NCAA, df_matches_reg):
    print(results_frame.shape)

# Show the top 5 rows of the seeds table (plain text) and of the
# tournament results (rich display, as the last expression of the cell)
print(df_seeds.head())
df_matches_NCAA.head()

(2117, 8)
(150684, 8)
   Season Seed  TeamID
0    1985  W01    1207
1    1985  W02    1210
2    1985  W03    1228
3    1985  W04    1260
4    1985  W05    1374


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [4]:
# Extract the numeric part of the seed (drop the region at the beginning and
# the (a,b) at the end), then convert the numeric string to an integer.
#
# - The pattern is a raw string so that `\d` is not interpreted as an
#   (invalid) string escape sequence.
# - expand=False asks str.extract for a Series (there is a single capture
#   group) instead of relying on the library default.
# - The dtype guard keeps the cell re-runnable: once Seed holds integers the
#   .str accessor is no longer available and a second run would fail.
if not pd.api.types.is_integer_dtype(df_seeds['Seed']):
    df_seeds['Seed'] = df_seeds['Seed'].str.extract(r'(\d+)', expand=False).astype(int)
df_seeds.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Season,Seed,TeamID
0,1985,1,1207
1,1985,2,1210
2,1985,3,1228
3,1985,4,1260
4,1985,5,1374


In [5]:
# Simplify the match dataframe by keeping only the columns used below.
# The columns that are left out are:
#   DayNum - the day on which the match was played
#   NumOT  - the number of overtime periods in the game
#   WLoc   - the "location" of the winning team:
#            "H" if the winning team was the home team,
#            "A" if the winning team was the visiting team,
#            "N" if the game was played on a neutral court.
#
# Selecting the wanted columns builds a new dataframe instead of dropping
# columns in place, so the dataframe that df_matches was taken from is left
# untouched and this cell can be re-run without a KeyError.
# The column order keeps the two team IDs together and the two scores together
# for better visualisation.
df_matches = df_matches[["Season", "WTeamID", "LTeamID", "WScore", "LScore"]]
print(df_matches.shape)

# Drop the matches up to and including this season if you want;
# 1984 keeps all the data.
season_cutoff = 1984
df_matches = df_matches[df_matches.Season > season_cutoff]
print(df_matches.shape)

df_matches.head()

(2117, 5)
(2117, 5)


Unnamed: 0,Season,WTeamID,LTeamID,WScore,LScore
0,1985,1116,1234,63,54
1,1985,1120,1345,59,58
2,1985,1207,1250,68,43
3,1985,1229,1425,58,55
4,1985,1242,1325,49,38


In [6]:
# Merge the matches and seeds dataframes, then compute the seed and score
# difference of each match.
# 1. In the seeds df, rename TeamID/Seed to WTeamID/WSeed and LTeamID/LSeed so
#    the keys match the columns of df_matches.
# 2. Merge in the winning team's seed.
# 3. Merge in the losing team's seed.
# 4. Compute SeedDiff and ScoreDiff (winner minus loser).
#
# Both merges are inner joins, so a match is kept only when BOTH teams have a
# seed for that season. Mixing a left join with an inner join would keep rows
# whose winner has no seed (NaN WSeed, hence NaN SeedDiff) whenever df_matches
# contains teams that are absent from df_seeds.

df_winseeds = df_seeds.rename(columns={'TeamID': 'WTeamID', 'Seed': 'WSeed'})
df_lossseeds = df_seeds.rename(columns={'TeamID': 'LTeamID', 'Seed': 'LSeed'})
df_dummy = pd.merge(left=df_matches, right=df_winseeds, how='inner', on=['Season', 'WTeamID'])
df = pd.merge(left=df_dummy, right=df_lossseeds, how='inner', on=['Season', 'LTeamID'])
df['SeedDiff'] = df.WSeed - df.LSeed
df['ScoreDiff'] = df.WScore - df.LScore
df.head()

Unnamed: 0,Season,WTeamID,LTeamID,WScore,LScore,WSeed,LSeed,SeedDiff,ScoreDiff
0,1985,1116,1234,63,54,9,8,1,9
1,1985,1120,1345,59,58,11,6,5,1
2,1985,1207,1250,68,43,1,16,-15,25
3,1985,1229,1425,58,55,9,8,1,3
4,1985,1242,1325,49,38,3,14,-11,11


In [7]:
# Build the labelled dataset.
# - Each match, seen from the winner's side, gives (SeedDiff, ScoreDiff) with
#   Result = 1.
# - The same match, seen from the loser's side, gives the negated differences
#   with Result = 0.
# This gives us a relationship of how seed and score difference can be
# classified as win or lose. SeedDiff is "own seed minus opponent seed", so a
# lower (more negative) seed difference is expected to be more likely to win.

# Columns are selected by name rather than by position, and .assign() returns
# a new dataframe, which avoids setting a value on a slice of df.
diff_columns = ['SeedDiff', 'ScoreDiff']
df_wins = df[diff_columns].assign(Result=1)
df_losses = (-df[diff_columns]).assign(Result=0)

# ignore_index=True gives the combined rows a fresh 0..n-1 index; otherwise
# every index label would appear twice (once for the win, once for the loss).
data = pd.concat((df_wins, df_losses), ignore_index=True)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,SeedDiff,ScoreDiff,Result
0,1,9,1
1,5,1,1
2,-15,25,1
3,1,3,1
4,-11,11,1


In [8]:
# Hold out half of the rows as a validation set and train on the other half.
# The validation set is what the model keeps being scored on while we try to
# improve its accuracy. random_state fixes the shuffle so the split is repeatable.
split_options = dict(train_size=0.5, test_size=0.5, random_state=11)
training_data, validation_data = train_test_split(data, **split_options)
training_data.shape

(2117, 3)

In [9]:
# The features of the model. 'Result' is the output, so it is excluded.
# Let's forget about ScoreDiff for now and only learn on SeedDiff.
#
# The list is built by filtering data.columns so the feature order follows the
# dataframe's column order; a set difference would give an arbitrary order as
# soon as more than one feature is kept.
excluded_columns = {'Result', 'ScoreDiff'}
features = [column for column in data.columns if column not in excluded_columns]

X_train = training_data[features]
y_train = training_data.Result
X_val = validation_data[features]

In [10]:
# Hyper-parameter grid: 9 values of C spaced logarithmically from 1e-5 to 1e3
params = {'C': np.logspace(-5, 3, num=9)}

# Model to tune
model = LogisticRegression()

# Grid search scored with negative log loss; refit=True retrains the best
# configuration so clf can be used directly for predictions afterwards
clf = GridSearchCV(estimator=model, param_grid=params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)

# Report the selected value of C
best_c = clf.best_params_['C']
print(best_c)

0.01


In [11]:
# Or choose a model without doing GridSearch by manually choosing the parameters

#clf = XGBClassifier(max_depth= 2, n_estimators = 20, learning_rate = 1.15)
#clf.fit(X_train, y_train)

In [12]:
# Evaluate the fitted model on the validation set: predict class probabilities
# and measure the log loss against the true results.
# This is the number to drive down while trying different models and
# parameters on the data.

proba = clf.predict_proba(X_val)

validation_loss = log_loss(validation_data.Result, proba)
print(validation_loss)

0.555357375839


In [13]:
# Next, the trained model is used to predict the sample submission data.
# Each row ID of the sample submission has the format SSSS_XXXX_YYYY:
# SSSS is the season, XXXX and YYYY are the IDs of the two teams.

df_sample = pd.read_csv('SampleSubmissionStage1.csv')

# Number of games to predict (one per row of the sample submission)
n_games = df_sample.shape[0]
print(n_games)

def get_year_t1_t2(ID):
    """Return a tuple with ints `year`, `team1` and `team2`.

    Parameters
    ----------
    ID : str
        Game identifier made of three integer fields separated by
        underscores, in the form ``SSSS_XXXX_YYYY``.

    Returns
    -------
    tuple of (int, int, int)
        ``(year, team1, team2)`` in the order they appear in ``ID``.

    Raises
    ------
    ValueError
        If ``ID`` does not have exactly three fields, or if a field is not
        a valid integer.
    """
    parts = ID.split('_')
    if len(parts) != 3:
        raise ValueError(
            "expected an ID of the form SSSS_XXXX_YYYY, got {!r}".format(ID)
        )
    # Build a real tuple (not a one-shot generator) so the result can be
    # indexed, compared and reused, as the docstring promises.
    year, team1, team2 = (int(part) for part in parts)
    return year, team1, team2

9112


In [14]:
# Build the feature matrix for the sample submission:
# for every row, parse the season and the two team IDs out of the row ID,
# look up each team's seed for that season, and store the seed difference
# (team1 seed minus team2 seed) as that row's single feature.

# Build the (Season, TeamID) -> Seed lookup once, instead of filtering
# df_seeds twice for every game. setdefault keeps the first seed found for a
# key, which is what taking `.values[0]` of a filtered frame would return.
seed_lookup = {}
for season, team_id, seed in zip(df_seeds.Season.tolist(),
                                 df_seeds.TeamID.tolist(),
                                 df_seeds.Seed.tolist()):
    seed_lookup.setdefault((season, team_id), seed)

X_test = np.zeros(shape=(n_games, 1))
# enumerate() yields the row position, so filling X_test does not depend on
# df_sample's index labels being 0..n-1.
for position, game_id in enumerate(df_sample.ID):
    year, t1, t2 = get_year_t1_t2(game_id)
    # A KeyError here means a team has no seed recorded for that season.
    X_test[position, 0] = seed_lookup[(year, t1)] - seed_lookup[(year, t2)]

In [15]:
# Based on our trained model, predict the probability of winning for the first team.
# The [:,1] selects the second probability column, i.e. the winning
# probability (Result = 1); column 0 would be the losing probability.

preds = clf.predict_proba(X_test)[:,1]
# Assign with [] rather than attribute access: df_sample.Pred = ... only
# updates a column that already exists, and otherwise sets a plain attribute
# on the object without creating the column.
df_sample['Pred'] = preds
df_sample.head()

Unnamed: 0,ID,Pred
0,2014_1107_1110,0.461443
1,2014_1107_1112,0.075154
2,2014_1107_1113,0.269767
3,2014_1107_1124,0.158582
4,2014_1107_1140,0.269767


In [16]:
# Write the predictions to the CSV file that gets uploaded on Kaggle.
# index=False leaves the dataframe index out of the file.

df_sample.to_csv(path_or_buf='submission.csv', index=False)