# Imports

In [1]:
import pandas as pd
import time
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt

import utilities.densmore_v3 as dns

In [2]:
import pycaret.classification as pyc

# Data Setup

**Reading in CSVs**

In [3]:
df_stats = pd.read_csv('../data/output_aggragated/basic_stats2.csv')
df_info = pd.read_csv('../data/output_aggragated/mls_fixtures_2015_2020.csv')

**Filtering out non-regular-season  Fixtures**

In [4]:
df_info = df_info[df_info['lg_rnd'].str.contains('Regular Season')]

**Filtering out unnecessary columns from `df_info`**

In [5]:
df_info.columns

Index(['fx_id', 'fx_ref', 'fx_tz', 'fx_date', 'fx_time', 'fx_per_fst',
       'fx_per_sec', 'fx_ven_id', 'fx_ven_name', 'fx_ven_city', 'fx_sts_long',
       'fx_sts_shrt', 'fx_sts_elps', 'lg_id', 'lg_name', 'lg_ctry', 'lg_logo',
       'lg_flag', 'lg_seas', 'lg_rnd', 'tm_h_id', 'tm_h_name', 'tm_h_logo',
       'tm_h_win', 'tm_a_id', 'tm_a_name', 'tm_a_logo', 'tm_a_win', 'gl_h',
       'gl_a', 'sc_ht_h', 'sc_ht_a', 'sc_ft_h', 'sc_ft_a', 'sc_et_h',
       'sc_et_a', 'sc_pen_h', 'sc_pen_a'],
      dtype='object')

In [6]:
df_info = df_info[['fx_id', 'fx_ref', 'fx_time', 'fx_ven_name', 'fx_ven_city', 'fx_sts_elps', 
 'tm_h_id', 'tm_h_win', 'tm_a_id', 'tm_a_win']]

In [7]:
# df_info.head()
# df_info.info()

**Renaming Time Column and Unpacking it into separate Date and Time Columns**

In [8]:
df_info.rename(columns={'fx_time':'fx_utc'}, inplace=True)

In [9]:
df_info['fx_date'] = df_info['fx_utc'].map(lambda x: dt.datetime.fromtimestamp(x).strftime('%Y-%m-%d'))
df_info['fx_time'] = df_info['fx_utc'].map(lambda x: dt.datetime.fromtimestamp(x).strftime('%H:%M:%S'))

In [10]:
df_info.drop(axis=1, columns=['fx_utc'], inplace=True)

**Removing Nulls**

The target columns: 'tm_h_win' and 'tm_a_win' each have nulls in them whenever the game ends in a draw. So I'll remove those columns to simplify classification.

In [11]:
df_info.dropna(axis=0, subset=['tm_h_win', 'tm_a_win'], inplace=True)

In [12]:
df_info.fillna('unknown', inplace=True)

In [13]:
# df_info.info()

**Filtering out unnecessary columns from `df_stats`**

In [14]:
df_stats.columns

Index(['fx_id', 'tm_h_id', 'tm_h_name', 'tm_a_id', 'tm_a_name',
       'h_shots_on_goal', 'h_shots_off_goal', 'h_total_shots',
       'h_blocked_shots', 'h_shots_insidebox', 'h_shots_outsidebox', 'h_fouls',
       'h_corner_kicks', 'h_offsides', 'h_ball_possession', 'h_yellow_cards',
       'h_red_cards', 'h_goalkeeper_saves', 'h_total_passes',
       'h_passes_accurate', 'h_passes_%', 'a_shots_on_goal',
       'a_shots_off_goal', 'a_total_shots', 'a_blocked_shots',
       'a_shots_insidebox', 'a_shots_outsidebox', 'a_fouls', 'a_corner_kicks',
       'a_offsides', 'a_ball_possession', 'a_yellow_cards', 'a_red_cards',
       'a_goalkeeper_saves', 'a_total_passes', 'a_passes_accurate',
       'a_passes_%'],
      dtype='object')

There aren't any columns that I want to remove from the stats dataframe. So I'll leave them all in for now.

**Merging the two dataframes on the fixture ID column**

In [15]:
df = pd.merge(left=df_info, right=df_stats, how='left', on='fx_id')

In [16]:
df.columns

Index(['fx_id', 'fx_ref', 'fx_ven_name', 'fx_ven_city', 'fx_sts_elps',
       'tm_h_id_x', 'tm_h_win', 'tm_a_id_x', 'tm_a_win', 'fx_date', 'fx_time',
       'tm_h_id_y', 'tm_h_name', 'tm_a_id_y', 'tm_a_name', 'h_shots_on_goal',
       'h_shots_off_goal', 'h_total_shots', 'h_blocked_shots',
       'h_shots_insidebox', 'h_shots_outsidebox', 'h_fouls', 'h_corner_kicks',
       'h_offsides', 'h_ball_possession', 'h_yellow_cards', 'h_red_cards',
       'h_goalkeeper_saves', 'h_total_passes', 'h_passes_accurate',
       'h_passes_%', 'a_shots_on_goal', 'a_shots_off_goal', 'a_total_shots',
       'a_blocked_shots', 'a_shots_insidebox', 'a_shots_outsidebox', 'a_fouls',
       'a_corner_kicks', 'a_offsides', 'a_ball_possession', 'a_yellow_cards',
       'a_red_cards', 'a_goalkeeper_saves', 'a_total_passes',
       'a_passes_accurate', 'a_passes_%'],
      dtype='object')

**Re-ordering Columns**

In [17]:
df = df[['fx_id', 'fx_date', 'fx_time', 'fx_ref', 'fx_ven_name', 'fx_ven_city', 'fx_sts_elps', 
         'tm_h_id_x', 'tm_a_id_x', 'tm_h_name', 'tm_a_name', 'tm_h_win', 'tm_a_win', 
         'h_shots_on_goal', 'h_shots_off_goal', 'h_total_shots',
         'h_blocked_shots', 'h_shots_insidebox', 'h_shots_outsidebox', 'h_fouls',
         'h_corner_kicks', 'h_offsides', 'h_ball_possession', 'h_yellow_cards',
         'h_red_cards', 'h_goalkeeper_saves', 'h_total_passes',
         'h_passes_accurate', 'h_passes_%', 'a_shots_on_goal',
         'a_shots_off_goal', 'a_total_shots', 'a_blocked_shots',
         'a_shots_insidebox', 'a_shots_outsidebox', 'a_fouls', 'a_corner_kicks',
         'a_offsides', 'a_ball_possession', 'a_yellow_cards', 'a_red_cards',
         'a_goalkeeper_saves', 'a_total_passes', 'a_passes_accurate',
         'a_passes_%']]

**Converting Strings with % Symbol to Floats**

In [18]:
df['h_passes_%'] = df['h_passes_%'].map(lambda x: float(str(x).strip('%'))/100)
df['a_passes_%'] = df['a_passes_%'].map(lambda x: float(str(x).strip('%'))/100)
df['h_ball_possession'] = df['h_ball_possession'].map(lambda x: float(str(x).strip('%'))/100)
df['a_ball_possession'] = df['a_ball_possession'].map(lambda x: float(str(x).strip('%'))/100)

In [19]:
df.rename(columns={'tm_h_id_x':'tm_h_id', 'tm_a_id_x':'tm_a_id'}, inplace=True)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1605 entries, 0 to 1604
Data columns (total 45 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   fx_id               1605 non-null   int64  
 1   fx_date             1605 non-null   object 
 2   fx_time             1605 non-null   object 
 3   fx_ref              1605 non-null   object 
 4   fx_ven_name         1605 non-null   object 
 5   fx_ven_city         1605 non-null   object 
 6   fx_sts_elps         1605 non-null   float64
 7   tm_h_id             1605 non-null   int64  
 8   tm_a_id             1605 non-null   int64  
 9   tm_h_name           1605 non-null   object 
 10  tm_a_name           1605 non-null   object 
 11  tm_h_win            1605 non-null   bool   
 12  tm_a_win            1605 non-null   bool   
 13  h_shots_on_goal     1581 non-null   float64
 14  h_shots_off_goal    1589 non-null   float64
 15  h_total_shots       1587 non-null   float64
 16  h_bloc

# 
---
# 

# Modeling

In [21]:
df.columns

Index(['fx_id', 'fx_date', 'fx_time', 'fx_ref', 'fx_ven_name', 'fx_ven_city',
       'fx_sts_elps', 'tm_h_id', 'tm_a_id', 'tm_h_name', 'tm_a_name',
       'tm_h_win', 'tm_a_win', 'h_shots_on_goal', 'h_shots_off_goal',
       'h_total_shots', 'h_blocked_shots', 'h_shots_insidebox',
       'h_shots_outsidebox', 'h_fouls', 'h_corner_kicks', 'h_offsides',
       'h_ball_possession', 'h_yellow_cards', 'h_red_cards',
       'h_goalkeeper_saves', 'h_total_passes', 'h_passes_accurate',
       'h_passes_%', 'a_shots_on_goal', 'a_shots_off_goal', 'a_total_shots',
       'a_blocked_shots', 'a_shots_insidebox', 'a_shots_outsidebox', 'a_fouls',
       'a_corner_kicks', 'a_offsides', 'a_ball_possession', 'a_yellow_cards',
       'a_red_cards', 'a_goalkeeper_saves', 'a_total_passes',
       'a_passes_accurate', 'a_passes_%'],
      dtype='object')

In [22]:
df.fillna(0, inplace=True)

In [23]:
df_h = df.drop(columns=['fx_id', 'tm_a_win'])

In [24]:
df_a = df.drop(columns=['fx_id', 'tm_h_win'])

**Setting up PyCaret Modeling**

In [25]:
df['tm_h_id'] = df['tm_h_id'].map(lambda x: str(x))
df['tm_a_id'] = df['tm_a_id'].map(lambda x: str(x))

In [26]:
df_h.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1605 entries, 0 to 1604
Data columns (total 43 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   fx_date             1605 non-null   object 
 1   fx_time             1605 non-null   object 
 2   fx_ref              1605 non-null   object 
 3   fx_ven_name         1605 non-null   object 
 4   fx_ven_city         1605 non-null   object 
 5   fx_sts_elps         1605 non-null   float64
 6   tm_h_id             1605 non-null   int64  
 7   tm_a_id             1605 non-null   int64  
 8   tm_h_name           1605 non-null   object 
 9   tm_a_name           1605 non-null   object 
 10  tm_h_win            1605 non-null   bool   
 11  h_shots_on_goal     1605 non-null   float64
 12  h_shots_off_goal    1605 non-null   float64
 13  h_total_shots       1605 non-null   float64
 14  h_blocked_shots     1605 non-null   float64
 15  h_shots_insidebox   1605 non-null   float64
 16  h_shot

In [27]:
df_h.head()

Unnamed: 0,fx_date,fx_time,fx_ref,fx_ven_name,fx_ven_city,fx_sts_elps,tm_h_id,tm_a_id,tm_h_name,tm_a_name,tm_h_win,h_shots_on_goal,h_shots_off_goal,h_total_shots,h_blocked_shots,h_shots_insidebox,h_shots_outsidebox,h_fouls,h_corner_kicks,h_offsides,h_ball_possession,h_yellow_cards,h_red_cards,h_goalkeeper_saves,h_total_passes,h_passes_accurate,h_passes_%,a_shots_on_goal,a_shots_off_goal,a_total_shots,a_blocked_shots,a_shots_insidebox,a_shots_outsidebox,a_fouls,a_corner_kicks,a_offsides,a_ball_possession,a_yellow_cards,a_red_cards,a_goalkeeper_saves,a_total_passes,a_passes_accurate,a_passes_%
0,2015-03-06,19:00:00,"Jair Marrufo, USA",StubHub Center,Los Angeles,90.0,1605,1607,Los Angeles Galaxy,Chicago Fire,True,5.0,7.0,13.0,1.0,8.0,5.0,14.0,4.0,2.0,0.53,1.0,0.0,1.0,465.0,381.0,0.82,1.0,3.0,8.0,4.0,2.0,6.0,15.0,2.0,2.0,0.47,2.0,0.0,3.0,427.0,332.0,0.78
1,2015-03-07,12:00:00,"Mark Geiger, USA",RFK Stadium,Washington,90.0,1615,1614,DC United,Montreal Impact,True,4.0,5.0,0.0,0.0,0.0,0.0,6.0,4.0,3.0,0.52,0.0,0.0,3.0,0.0,0.0,0.0,3.0,6.0,0.0,0.0,0.0,0.0,13.0,4.0,3.0,0.48,0.0,0.0,2.0,0.0,0.0,0.0
2,2015-03-07,15:00:00,"Kevin Stott, USA",BC Place Stadium,Vancouver,90.0,8007,1601,Whitecaps,Toronto FC,False,7.0,6.0,13.0,0.0,8.0,5.0,18.0,8.0,2.0,0.51,2.0,0.0,2.0,397.0,294.0,0.74,5.0,4.0,15.0,6.0,6.0,9.0,12.0,2.0,0.0,0.49,1.0,0.0,6.0,397.0,307.0,0.77
3,2015-03-07,17:30:00,"Ted Unkel, USA",Toyota Stadium,Frisco,90.0,1597,1596,FC Dallas,San Jose Earthquakes,True,5.0,6.0,11.0,0.0,7.0,4.0,16.0,5.0,3.0,0.52,1.0,0.0,2.0,369.0,275.0,0.75,2.0,4.0,9.0,3.0,4.0,5.0,15.0,3.0,2.0,0.48,4.0,0.0,3.0,349.0,253.0,0.72
4,2015-03-07,17:30:00,"Allen Chapman, USA",BBVA Compass Stadium,Houston,90.0,1600,1613,Houston Dynamo,Columbus Crew,True,3.0,4.0,0.0,0.0,0.0,0.0,11.0,1.0,1.0,0.41,0.0,0.0,7.0,0.0,0.0,0.0,8.0,7.0,0.0,0.0,0.0,0.0,12.0,6.0,1.0,0.59,2.0,0.0,2.0,0.0,0.0,0.0


In [28]:
plt.figure(figsize=(5,10))
sns.heatmap(df_h.corr()[['tm_h_win']].sort_values(by = 'tm_h_win', ascending=False, key = np.abs), cmap = 'coolwarm', vmin = -1, vmax = 1, annot = True)


NameError: name 'np' is not defined

<Figure size 360x720 with 0 Axes>

In [None]:
model_setup_a = pyc.setup(data=df_h, target='tm_h_win', normalize=True, 
                        remove_multicollinearity=True, multicollinearity_threshold=.75,
                          session_id=74)

In [None]:
best_model = pyc.compare_models()

In [None]:
model_lr = pyc.create_model('lr')

```
[{'day' = 1, teams = {'Sounders':1400,'Red Bulls':1500},
 {'day' = 2, teams = {'Sounders':1305,'Red Bulls':1605}]

{'season':[1,1,1,1,1],
 'week':[1,1,1,1,1],
'day':[1,2,3,4,5],
'sounders':[(1400,played),(1400,played),(1400,didn't play)],
'red bulls':[1503,1502,1504,1506,1506]}

```