In [52]:
import sqlite3 as db 
import pandas as pd
# Module-level SQLite connection to the soccer database file; it is read through
# the global name `conn` by get_all_players_info.
conn = db.connect('soccer/database.sqlite')

from sklearn.preprocessing import MinMaxScaler

# NOTE(review): presumably a home-advantage weight; it is not referenced in the
# cells visible here -- confirm before relying on it.
home_adv_factor = 1

# Player attribute columns that are aggregated into team-level features.
# A wider candidate list was considered previously; it additionally named
# overall_rating, potential, curve, gk_diving, gk_handling and gk_kicking
# (and mentioned attacking_work_rate / defensive_work_rate). None of those
# are part of the active list below.
attr_cols = [
    'crossing',
    'finishing',
    'heading_accuracy',
    'short_passing',
    'volleys',
    'dribbling',
    'free_kick_accuracy',
    'long_passing',
    'ball_control',
    'acceleration',
    'sprint_speed',
    'agility',
    'reactions',
    'balance',
    'shot_power',
    'jumping',
    'stamina',
    'strength',
    'long_shots',
    'aggression',
    'interceptions',
    'positioning',
    'vision',
    'penalties',
    'marking',
    'standing_tackle',
    'sliding_tackle',
    'gk_positioning',
    'gk_reflexes',
]

# Non-attribute column name(s) kept alongside the attribute list.
cols = ['Position']
        

In [53]:
# Short club names (as they appear in the fixtures file handled by
# norm_team_names) mapped to the full names they are rewritten to.
norm_team_name = {
    'Man United': 'Manchester United',
    'Man City': 'Manchester City',
    'QPR': 'Queens Park Rangers',
    'West Brom': 'West Bromwich Albion',
    'West Ham': 'West Ham United',
    'Bournemouth': 'AFC Bournemouth',
}

# Season start year -> path of the per-season player statistics CSV.
year_data = dict([
    ('2014', 'statbunker-football-stats/Player Stats 2014-15.csv'),
    ('2015', 'statbunker-football-stats/Player Stats 2015-16.csv'),
    ('2016', 'statbunker-football-stats/Player Stats 2016-17.csv'),
])


In [54]:
def get_all_players_info(year):
    """Return one attribute snapshot per player for the given calendar year.

    Among a player's `player_attributes` rows whose `date` starts with `year`
    and lies after 2014-05-14, the row dated closest to 1 September of that
    year is selected and joined with the player's name.

    Parameters
    ----------
    year : str or int
        Four-digit year, e.g. '2014'.

    Returns
    -------
    pandas.DataFrame
        `player_name` followed by every column of `player_attributes`.
    """
    # The year is supplied as a bound named parameter instead of being spliced
    # into the SQL text, so the statement itself is constant and no '%'
    # escaping is needed for strftime's '%s' or the LIKE wildcard.
    #
    # The inner query relies on SQLite's rule that, next to a single min()
    # aggregate, bare columns (pa.date here) are taken from the row that
    # produced the minimum.
    query_str = """select  p.player_name,pa2.* from player_attributes pa2, player p where p.player_api_id = pa2.player_api_id and
    (pa2.player_api_id, pa2.date) in (
    select z.player_api_id, z.date from (
    select pa.player_api_id, pa.date, min(abs(strftime('%s',pa.date) - strftime('%s', :year || '-09-01 00:00:00'))) from player_attributes pa, player p 
                                    where pa.player_api_id = p.player_api_id
                                    and strftime('%s',pa.date) > strftime('%s','2014-05-14 00:00:00') 
                                    and pa.date like (:year || '%')
                                    group by pa.player_api_id ) z )"""

    return pd.read_sql_query(query_str, conn, params={'year': str(year)})


In [55]:
def norm_team_names(team_info):
    """Rewrite abbreviated club names in a fixtures table to their full form.

    Every key of the module-level `norm_team_name` mapping that occurs in the
    'HomeTeam' or 'AwayTeam' column is overwritten with the mapped value.
    The frame is modified in place and the same object is returned.
    """
    for short_name, full_name in norm_team_name.items():
        for side in ('HomeTeam', 'AwayTeam'):
            uses_short_name = team_info[side] == short_name
            team_info.loc[uses_short_name, side] = full_name
    return team_info

In [56]:
def get_full_data(year):
    """Join the season's player statistics with the players' attribute rows.

    Reads the CSV registered for `year` in `year_data`, keeps rows whose
    'Type Of Goal' is 'Player' and whose 'League' is 'Premier League', and
    inner-joins them (on the 'Player' column) with the result of
    get_all_players_info(year) (on 'player_name'). The returned frame is
    indexed by player name.
    """
    season_stats = pd.read_csv(year_data[year], dtype={'overall_rating': float})
    is_player_row = season_stats['Type Of Goal'] == 'Player'
    is_premier_league = season_stats['League'] == 'Premier League'
    season_stats = season_stats[is_player_row & is_premier_league]

    attrs_by_name = get_all_players_info(year).set_index('player_name')
    stats_by_name = season_stats.set_index('Player')
    return attrs_by_name.join(stats_by_name, how='inner')


In [57]:
def create_team(data, name):
    """Select the rows of `data` whose 'Team' value starts with `name`.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a string column 'Team'.
    name : str
        Prefix to match against 'Team'.

    Returns
    -------
    pandas.DataFrame
        The matching rows; rows with a missing 'Team' never match.
    """
    # na=False: a missing 'Team' would otherwise put NaN into the mask, and
    # boolean indexing refuses masks that contain NaN.
    on_team = data['Team'].str.startswith(name, na=False)
    return data[on_team]


In [58]:
def agg_attrs_all(team):
    """Collapse a squad into a single row of team-level attribute values.

    For each column listed in the module-level `attr_cols`, the squad is
    sorted by that column in descending order and the mean of the leading
    values is taken: the first 2 for columns whose name starts with 'gk',
    the first 4 for all others. The result is a one-row float DataFrame with
    one column per attribute.
    """
    def mean_of_best(column):
        how_many = 2 if column.startswith('gk') else 4
        ranked = team.sort_values(by=[column], ascending=False)
        return ranked[column].head(how_many).mean()

    summary = {column: mean_of_best(column) for column in team[attr_cols].keys()}
    return pd.DataFrame([summary]).astype(float)


def agg_attributes(team):
    """Average the numeric columns of a squad per playing position.

    Rows whose 'Position' is 'Goalkeeper' are counted as 'Defender'. The
    relabelling is applied to a copy, so the caller's frame is left untouched.

    Parameters
    ----------
    team : pandas.DataFrame
        Squad table with a 'Position' column.

    Returns
    -------
    pandas.DataFrame
        Transposed group means: one row per column ('Position' first, then
        each numeric column) and one column per position group.
    """
    merged_positions = team['Position'].mask(team['Position'] == 'Goalkeeper', 'Defender')
    regrouped = team.assign(Position=merged_positions)

    # numeric_only=True: text columns (names, clubs, ...) cannot be averaged
    # and make mean() raise on current pandas; older releases dropped them
    # silently, which is the behaviour kept here.
    per_position = regrouped.groupby(['Position'], as_index=False).mean(numeric_only=True)
    return per_position.transpose()

In [59]:
def get_fixtures(year):
    """Load the results file and return the fixtures of one season.

    The CSV is read in full, club names are normalised with norm_team_names,
    and only rows whose 'Season' string starts with `year` are returned.
    """
    all_fixtures = norm_team_names(pd.read_csv('epl-results-19932018/EPL_Set.csv'))
    in_season = all_fixtures['Season'].str.startswith(year)
    return all_fixtures[in_season]
    


In [60]:
def combine_metrics_4(i,row,home_attrs,away_attrs,team_form):
    """Build the one-row feature frame for a single fixture and update form.

    Parameters
    ----------
    i : int
        Position of the fixture within its season; used as the row label
        (as a string) and to derive 'Game_Week' as ``i // 10 + 1``
        (assumes ten fixtures per game week -- TODO confirm).
    row : mapping
        Fixture record providing 'HomeTeam', 'AwayTeam', 'FTHG' and 'FTAG'.
    home_attrs, away_attrs : pandas.DataFrame or mapping
        Team attribute values keyed by attribute name. A one-row DataFrame
        (as produced by agg_attrs_all) or a mapping of plain numbers.
    team_form : dict
        Running form score per team name. Mutated in place.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``str(i)`` with 'h_<attr>' / 'a_<attr>' columns, the
        team names, 'Result' (1 home win, 2 draw, 3 away win), 'Game_Week',
        the goals, and each side's form as it stood BEFORE this fixture.
    """
    def as_float(value):
        # Indexing a one-row DataFrame yields a single-element Series; calling
        # float() on such a Series is deprecated in pandas, so take the
        # element explicitly. Plain scalars are passed through unchanged.
        if hasattr(value, 'iloc'):
            return float(value.iloc[0])
        return float(value)

    home_team = row['HomeTeam']
    away_team = row['AwayTeam']
    goal_diff = row['FTHG'] - row['FTAG']
    if goal_diff > 0:
        result = 1
    elif goal_diff < 0:
        result = 3
    else:
        result = 2

    norm_attrs = {}
    for key in home_attrs.keys():
        norm_attrs['h_'+key] = as_float(home_attrs[key])
        norm_attrs['a_'+key] = as_float(away_attrs[key])

    norm_attrs['Home_Team'] = home_team
    norm_attrs['Away_Team'] = away_team
    norm_attrs['Result'] = result
    norm_attrs['Game_Week'] = (i//10) + 1
    norm_attrs['FTHG'] = row['FTHG']
    norm_attrs['FTAG'] = row['FTAG']
    # Form is recorded before it is updated, so the features never contain
    # information from the fixture they describe.
    norm_attrs['h_form'] = team_form[home_team]
    norm_attrs['a_form'] = team_form[away_team]

    # (home delta, away delta) per result. The table is asymmetric: an away
    # win moves form by 3, a home win by 2, and a draw costs the home side 1.
    form_change = {1: (2, -2), 3: (-3, 3), 2: (-1, 1)}
    home_delta, away_delta = form_change[result]
    team_form[home_team] += home_delta
    team_form[away_team] += away_delta

    return pd.DataFrame(norm_attrs, index = [str(i)])

In [61]:
def get_all_team_data(year):
    """Collect squad rows and aggregated attributes for every club of a season.

    The clubs are the distinct 'HomeTeam' values of get_fixtures(year). For
    each club the result maps its name to a pair
    ``(squad rows from get_full_data(year), agg_attrs_all(squad rows))``.
    """
    season_players = get_full_data(year)
    club_names = get_fixtures(year)['HomeTeam'].unique()

    team_store = {}
    for club in club_names:
        squad = create_team(season_players, club)
        team_store[club] = (squad, agg_attrs_all(squad))
    return team_store

In [62]:

def construct_training_data(years):
    """Build the per-fixture feature table for the given seasons.

    For each year the fixtures are walked in file order; every fixture is
    turned into a one-row frame by combine_metrics_4, using the aggregated
    team attributes from get_all_team_data and a form score that starts at
    60 for every club at the beginning of each season.

    Parameters
    ----------
    years : iterable of str
        Season start years, e.g. ['2014', '2015'].

    Returns
    -------
    pandas.DataFrame
        All fixture rows in processing order. Row labels restart at '0' for
        each season, so the index is not unique across seasons. An empty
        frame is returned when there is nothing to combine.
    """
    # Rows are gathered in a list and concatenated once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    match_frames = []

    for year in years:
        team_store = get_all_team_data(year)
        fixtures = get_fixtures(year)
        team_form = {name: 60 for name in fixtures['HomeTeam'].unique()}

        for i, (_, row) in enumerate(fixtures.iterrows()):
            home_attrs = team_store[row['HomeTeam']][1]
            away_attrs = team_store[row['AwayTeam']][1]
            match_frames.append(combine_metrics_4(i, row, home_attrs, away_attrs, team_form))

    if not match_frames:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(match_frames)

In [63]:
# Build the feature table for the three seasons, then divide both form
# columns by 100.
tr_data = construct_training_data(['2014','2015','2016'])
tr_data = tr_data.assign(
    h_form=tr_data['h_form']/100,
    a_form=tr_data['a_form']/100,
)

In [64]:
# Write the assembled feature table (row labels included) to a CSV file in the
# current working directory.
tr_data.to_csv('Full_Data_feats_ha.csv')