In [55]:
import pandas as pd
import numpy as np
import datetime as dt
from meteostat import Point, Daily, Stations

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_poisson_deviance
from sklearn.metrics import mean_gamma_deviance
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import zero_one_loss
from sklearn.metrics import hinge_loss
from sklearn.metrics import zero_one_loss
from sklearn.metrics import log_loss
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import cohen_kappa_score


## data wrangling

In [105]:
df = pd.read_csv('C:\\Users\\shann\\Documents\\GitHub\\15095-project\\data\\international_matches_cleaned.csv')
df_city_locations = pd.read_excel('C:\\Users\\shann\\Documents\\GitHub\\15095-project\\data\\worldcities_excel.xlsx')

In [106]:
df_city_locations.head(2)

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6839,139.7744,Japan,JP,JPN,Tōkyō,primary,39105000.0,1392685764
1,Jakarta,Jakarta,-6.2146,106.8451,Indonesia,ID,IDN,Jakarta,primary,35362000.0,1360771077


In [107]:
# city, city_ascii, country lowercase
df_city_locations['city'] = df_city_locations['city'].str.lower()
df_city_locations['city_ascii'] = df_city_locations['city_ascii'].str.lower()
df_city_locations['country'] = df_city_locations['country'].str.lower()

In [116]:
# extract city_ascii, lat, lng, country, population from df_city_locations
df_city_locations = df_city_locations[['city', 'lat', 'lng', 'population']]

In [109]:
df_city_locations.head(2)

Unnamed: 0,city,lat,lng,country,population
0,tokyo,35.6839,139.7744,japan,39105000.0
1,jakarta,-6.2146,106.8451,indonesia,35362000.0


In [110]:
df.head(2)

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,shoot_out,home_team_result,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
0,9/2/2011,Argentina,Venezuela,South America,South America,9,44,1016,615,1,...,No,Win,77,64,80.8,88.3,85.2,72.2,73.7,75.5
1,9/2/2011,Belarus,Bosnia and Herzegovina,Europe,Europe,41,39,645,655,0,...,No,Lose,75,76,63.2,67.3,68.0,72.0,77.7,78.0


In [111]:
# variance of shoot_out column
# make shoot_out column binary
df['shoot_out'] = df['shoot_out'].apply(lambda x: 1 if x == 'yes' else 0)
df['shoot_out'].var() # no variance so we can drop this column

0.0

In [112]:
# remove variables that may cause leakage 
# remove variables that are not useful for modelling
df.drop(['home_team_score', # leakage
        'away_team_score', # leakage
        #'date', # not useful for modelling
        'shoot_out', # no variance
        'neutral_location', # not useful for this problem
        #'tournament'  # not useful for modelling
        ],
        inplace=True, axis=1)

In [113]:
# check tournament column counts
# df['tournament'].value_counts()

In [114]:
# use country and city to find weather data for each match
df['date'] = pd.to_datetime(df['date'])
df[['city','country','tournament','date']].head(3)

Unnamed: 0,city,country,tournament,date
0,Calcutta,India,Friendly,2011-09-02
1,Minsk,Belarus,UEFA Euro qualification,2011-09-02
2,Sofia,Bulgaria,UEFA Euro qualification,2011-09-02


In [115]:
# lower case city and country
df['city'] = df['city'].str.lower()
df['country'] = df['country'].str.lower()

In [117]:
# map lat and lng to each match in df
df = df.merge(df_city_locations, how='left', left_on=['city'], right_on=['city'])

In [118]:
df[['city','lat','lng']].head(3)

Unnamed: 0,city,lat,lng
0,calcutta,,
1,minsk,53.9022,27.5618
2,sofia,42.6979,23.3217


In [134]:
# city values beginning with 't' in df_city_locations
df_city_locations[df_city_locations['city'].str.startswith('tel')]

Unnamed: 0,city,lat,lng,population
1510,tel aviv-yafo,32.08,34.78,451523.0
3751,telford,52.6766,-2.4469,142723.0
4908,telde,27.9985,-15.4167,102769.0
4975,teluk intan,4.0259,101.0213,101659.0
7455,telšiai,55.9833,22.25,22039.0
7640,telavi,41.9167,45.4833,19629.0
8987,teleneşti,47.4997,28.3656,7227.0
10855,telêmaco borba,-24.3239,-50.6158,78974.0
10876,tela,15.7833,-87.4667,78537.0
18957,teltow,52.4022,13.2706,22538.0


In [119]:
# which countries have missing lat and lng?
df[df['lat'].isnull()]['city'].value_counts()

cardiff              38
solna                22
st. gallen           13
villeneuve-d'ascq     8
rostov-on-don         7
al rayyan             6
nizhny novgorod       5
cluj                  5
suita                 4
herning               3
ploiești              3
ekaterinburg          3
kobe                  3
ismaila               3
kallang               3
cheonan               3
chōfu                 3
donets'k              3
faro-loulé            2
luque                 2
brøndby               2
osaka                 2
al ain                2
frankfurt am main     2
ramat-gan             2
hwaseong              2
puerto ordaz          2
villeneuve d'ascq     2
brondbyvester         2
vila-real             1
ivory coast           1
guayana               1
são joão da venda     1
burkina faso          1
tel aviv              1
calcutta              1
belek                 1
biel                  1
molenbeek             1
cracow                1
limbé                 1
jaffa           

In [120]:
# which countries have missing lat and lng?
df['lat'].isnull().sum()

179

In [None]:
location = Point(49.2497, -123.1193, 70)
data = Daily(location, 2019, 1)