In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy import stats

In [39]:
fifa = pd.read_csv('../data/EA_FIFA19.csv')

In [40]:
	def value_to_int(df_value):
		try:
			value = float(df_value[1:-1])
			suffix = df_value[-1:]

			if suffix == 'M':
				value = value * 1000000
			elif suffix == 'K':
				value = value * 1000
		except ValueError:
			value = 0
		return value

fifa['Value'] = fifa['Value'].apply(value_to_int)
fifa['Wage'] = fifa['Wage'].apply(value_to_int)

In [41]:
	fifa['Release Clause'] = fifa['Release Clause'].fillna('0')
	fifa['Release Clause'] = fifa['Release Clause'].apply(value_to_int)

	fifa.loc[fifa['Release Clause'] == 0, 'Release Clause'] = fifa[fifa['Release Clause'] > 0]['Release Clause'].mean()



In [42]:
	def check_contract(row):
		"""
		Creates new variable num_contract days remaining from contract expiry date information
		:param row: each row/obheservation of the dataframe
		:return: updated row with new variable 'contract_days'
		"""
		month_list = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
		ref_date = datetime(2018, 5, 31, 0, 0, 0)
		contract = row['Contract Valid Until']
		try:
			match = re.findall('(\w{3}) \d{1,2}, (\d{4})', contract)
			if len(match) != 0:
				month_str = match[0][0]
				month = month_list.index(month_str) + 1
				year = int(match[0][1])
				dt = datetime(year, month, 1, 0, 0, 0)
				a = dt - ref_date
				row['contract_days'] = a.days
			else:
				match = re.findall('(\d{4})', contract)
				month = month_list.index('Jun') + 1
				year = int(match[0])
				dt = datetime(year, month, 1, 0, 0, 0)
				a = dt - ref_date
				row['contract_days'] = a.days
			return row
		except:
			year = 2020
			month = month_list.index('Jun')
			dt = datetime(year, month, 1, 0, 0, 0)
			a = dt - ref_date
			row['contract_days'] = a.days
			return row

	fifa = fifa.apply(check_contract,axis=1)


In [47]:
	def right_footed(df):
		"""
		Turn Preferred Foot into a binary indicator variable
		"""
		if (df['Preferred Foot'] == 'Right'):
			return 1
		else:
			return 0
	#
	#
	def simple_position(df):
		"""
		Create a simplified position varaible to account for all player positions
		"""
		if (df['Position'] == 'GK'):
			return 'GK'
		elif ((df['Position'] == 'RB') | (df['Position'] == 'LB') | (df['Position'] == 'CB') | (
				df['Position'] == 'LCB') | (df['Position'] == 'RCB') | (df['Position'] == 'RWB') | (
					  df['Position'] == 'LWB')):
			return 'DF'
		elif ((df['Position'] == 'LDM') | (df['Position'] == 'CDM') | (df['Position'] == 'RDM')):
			return 'DM'
		elif ((df['Position'] == 'LM') | (df['Position'] == 'LCM') | (df['Position'] == 'CM') | (
				df['Position'] == 'RCM') | (df['Position'] == 'RM')):
			return 'MF'
		elif ((df['Position'] == 'LAM') | (df['Position'] == 'CAM') | (df['Position'] == 'RAM') | (
				df['Position'] == 'LW') | (df['Position'] == 'RW')):
			return 'AM'
		elif ((df['Position'] == 'RS') | (df['Position'] == 'ST') | (df['Position'] == 'LS') | (
				df['Position'] == 'CF') | (df['Position'] == 'LF') | (df['Position'] == 'RF')):
			return 'ST'
		else:
			return df.Position
	#
	#
	nat_counts = fifa['Nationality'].value_counts()
	nat_list = nat_counts[nat_counts > 250].index.tolist()
    
    
	def major_nation(df):
		"""
		Replace Nationality with a binary indicator variable for 'Major Nation'
		"""

		if (df.Nationality in nat_list):
			return 1
		else:
			return 0

	# Create a copy of the original dataframe to avoid indexing errors
	df = fifa.copy()

	# Apply changes to dataset to create new column
	df['Right_Foot'] = df.apply(right_footed, axis=1)
	df['Simple_Position'] = df.apply(simple_position, axis=1)
	df['Major_Nation'] = df.apply(major_nation, axis=1)


In [48]:
	tempwork = df["Work Rate"].str.split("/ ", n=1, expand=True)
	# Create new column for first work rate
	df["WorkRate1"] = tempwork[0]
	# Create new column for second work rate
	df["WorkRate2"] = tempwork[1]

	df.drop(columns=['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
					  'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
					  'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB'], inplace=True)

	df.drop(['Work Rate', 'Preferred Foot', 'Real Face', 'Position', 'Nationality'], axis=1, inplace=True)

In [49]:
	df['Wage'] = np.log10(df['Wage'] + 1)
	df['Value'] = np.log10(df['Value'] + 1)



In [65]:
	#Remove NA and Unlabelled observations
	df_final = df[df['Value'] != 0]
	df_final = df_final[~df_final['Agility'].isnull()]

In [66]:

adhoc = df_final[['ID', 'Photo', 'Flag', 'Club Logo', 'Jersey Number', 'Joined', 'Special', 'Loaned From',
				 'Body Type','Weight', 'Height', 'Contract Valid Until', 'Name', 'Club']]



In [67]:
df_final.columns

Index(['ID', 'Name', 'Age', 'Photo', 'Flag', 'Overall', 'Potential', 'Club',
       'Club Logo', 'Value', 'Wage', 'Special', 'International Reputation',
       'Weak Foot', 'Skill Moves', 'Body Type', 'Jersey Number', 'Joined',
       'Loaned From', 'Contract Valid Until', 'Height', 'Weight', 'Crossing',
       'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
       'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes', 'Release Clause',
       'contract_days', 'Right_Foot', 'Simple_Position', 'Major_Nation',
       'WorkRate1', 'WorkRate2'],
      dtype='object')

In [68]:
	#Drop unnecessary columns
	df_final.drop(
		columns=['ID', 'Photo', 'Flag', 'Club Logo', 'Jersey Number', 'Joined', 'Special', 'Loaned From',
				 'Body Type','Weight', 'Height', 'Contract Valid Until', 'Name', 'Club',
				 'WorkRate2'], inplace=True)

	
	df_final = pd.get_dummies(df_final)
    	df_final.rename(columns={'WorkRate1_High':'WorkRate_High',
       'WorkRate1_Low':'WorkRate_Low', 'WorkRate1_Medium':'WorkRate_Medium'},inplace=True)


In [69]:
df_final.columns

Index(['Age', 'Overall', 'Potential', 'Value', 'Wage',
       'International Reputation', 'Weak Foot', 'Skill Moves', 'Crossing',
       'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
       'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes', 'Release Clause',
       'contract_days', 'Right_Foot', 'Major_Nation', 'Simple_Position_AM',
       'Simple_Position_DF', 'Simple_Position_DM', 'Simple_Position_GK',
       'Simple_Position_MF', 'Simple_Position_ST', 'WorkRate1_High',
       'WorkRate1_Low', 'WorkRate1_Medium'],
      dtype='object')

In [3]:
processed = pd.read_csv('../data/processed_fifa.csv')
adhoc = pd.read_csv('../data/adhoc.csv')

In [4]:
position_data = processed.loc[processed['Simple_Position_ST'] == 1, :]

In [22]:
features_list = ['Reactions','Potential','Age','BallControl','StandingTackle','Composure','Dribbling','Positioning','Finishing','GKReflexes']+[col for col in processed.columns if col.startswith('Simple_')]
X_train = position_data[features_list]

In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


In [25]:
X_train[1]

array([3.71471666, 3.7338224 , 1.70489507, 3.49333427, 0.476029  ,
       3.34055461, 2.79102131, 3.15196185, 3.61858916, 0.16404044,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [26]:
inp = {'Reactions':96,'Potential':94,'Age':33,'BallControl':94,'StandingTackle':31,'Composure':95,
 'Dribbling':88,'Positioning':95,'Finishing':94,'GKReflexes':11,'Position':'ST'}


In [27]:
	df = pd.DataFrame(inp, index=[0])
	df['Simple_Position'] = df.apply(simple_position, axis=1)

In [28]:
def simple_position(df):
    if df['Position'] == 'GK':
        return 'GK'
    elif (df['Position'] == 'RB') | (df['Position'] == 'LB') | (df['Position'] == 'CB') | (df['Position'] == 'LCB') | (df['Position'] == 'RCB') | (df['Position'] == 'RWB') | (df['Position'] == 'LWB'):
        return 'DF'
    elif (df['Position'] == 'LDM') | (df['Position'] == 'CDM') | (df['Position'] == 'RDM'):
        return 'DM'
    elif (df['Position'] == 'LM') | (df['Position'] == 'LCM') | (df['Position'] == 'CM') | (df['Position'] == 'RCM') | (df['Position'] == 'RM'):
        return 'MF'
    elif (df['Position'] == 'LAM') | (df['Position'] == 'CAM') | (df['Position'] == 'RAM') | (df['Position'] == 'LW') | (df['Position'] == 'RW'):
        return 'AM'
    elif (df['Position'] == 'RS') | (df['Position'] == 'ST') | (df['Position'] == 'LS') | (df['Position'] == 'CF') | (df['Position'] == 'LF') | (df['Position'] == 'RF'):
        return 'ST'
    else:
        return df.Position

In [29]:
df

Unnamed: 0,Reactions,Potential,Age,BallControl,StandingTackle,Composure,Dribbling,Positioning,Finishing,GKReflexes,Position,Simple_Position
0,96,94,33,94,31,95,88,95,94,11,ST,ST


In [30]:
my_cols_list = ['Simple_Position_AM','Simple_Position_DF','Simple_Position_DM','Simple_Position_GK','Simple_Position_MF','Simple_Position_ST']

In [31]:
	df = df.reindex(columns=[*df.columns.tolist(), *my_cols_list], fill_value=0)
	req_simp = df['Simple_Position'].values[0]
	col_name = 'Simple_Position_' + req_simp
	df[col_name] = 1
	df.drop(labels=['Position', 'Simple_Position'], axis=1, inplace=True)


In [32]:
df

Unnamed: 0,Reactions,Potential,Age,BallControl,StandingTackle,Composure,Dribbling,Positioning,Finishing,GKReflexes,Simple_Position_AM,Simple_Position_DF,Simple_Position_DM,Simple_Position_GK,Simple_Position_MF,Simple_Position_ST
0,96,94,33,94,31,95,88,95,94,11,0,0,0,0,0,1


In [33]:
	df = scaler.transform(df)


  """Entry point for launching an IPython kernel.


In [34]:
df

array([[3.71471666, 3.7338224 , 1.70489507, 3.49333427, 0.476029  ,
        3.34055461, 2.79102131, 3.15196185, 3.61858916, 0.16404044,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [35]:
X_train[1]

array([3.71471666, 3.7338224 , 1.70489507, 3.49333427, 0.476029  ,
       3.34055461, 2.79102131, 3.15196185, 3.61858916, 0.16404044,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [36]:
	y_train = position_data['Value']

In [37]:
from sklearn.neighbors import KNeighborsRegressor

In [38]:
	regressor = KNeighborsRegressor(n_neighbors=5)
	regressor.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [39]:
regressor.kneighbors(df)[1][0]

array([ 1,  0,  8,  4, 10], dtype=int64)

In [42]:
nneighbors = position_data.iloc[regressor.kneighbors(df)[1][0], :]

In [43]:
nneighbor_id = nneighbors['ID'].tolist()

In [44]:
nname = adhoc.loc[adhoc['ID'].isin(nneighbor_id),'Name'].tolist()


In [45]:
nname

['L. Messi', 'Cristiano Ronaldo', 'R. Lewandowski', 'S. Agüero', 'G. Higuaín']