# Data Wrangling



### Step 1 - Importing Necessary Modules and Loading the Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("xG-shots-master.csv")
df.head()

Unnamed: 0,1,R Grounded Pass,53:14:13,53:16:13,53:17:13,Unnamed: 5,TG,d7,TG.1,a9,...,Unnamed: 146,Unnamed: 147,Unnamed: 148,Unnamed: 149,Unnamed: 150,Unnamed: 151,Unnamed: 152,Unnamed: 153,Unnamed: 154,Unnamed: 155
0,2.0,R Grounded Pass,53:16:19,53:18:19,53:19:19,,G6,a10,G3,e12,...,,,,,,,,,,
1,3.0,R Grounded Pass,53:19:03,53:21:03,53:22:03,,G3,e12,G2,g6,...,,,,,,,,,,
2,4.0,R Grounded Pass,53:26:15,53:28:15,53:29:15,,B5,c3,B2,d2,...,,,,,,,,,,
3,5.0,R Grounded Pass,53:30:02,53:32:02,53:33:02,,B2,e2,TB,a5,...,,,,,,,,,,
4,6.0,R Grounded Pass,53:32:04,53:34:04,53:35:04,,TB,a5,B5,c2,...,,,,,,,,,,


### Step 2 - Data Cleaning

In [3]:
# There are some issues with this dataframe:
# 1) there is no header row [The headers can be taken from one of the collected match files]
# 2) the last ~150 columns are useless
# 3) column headers need to be changed to make them more usable

# Let's deal with that. 

df2 = pd.read_csv("C:\\Users\\91953\\Desktop\\InTheGame\\Match Files\\020120\\2200 - 2300\\22020120 - Part 4.csv")
df.columns = df2.columns
df3 = df.iloc[:,:-144]
df3.columns = df3.columns.str.replace(' ', '_')
df3.head()

Unnamed: 0,N,Category,Start,Click,End,Descriptors,Des_1,Des_2,Des_3,Des_4,Des_5,Des_6
0,2.0,R Grounded Pass,53:16:19,53:18:19,53:19:19,,G6,a10,G3,e12,,
1,3.0,R Grounded Pass,53:19:03,53:21:03,53:22:03,,G3,e12,G2,g6,,
2,4.0,R Grounded Pass,53:26:15,53:28:15,53:29:15,,B5,c3,B2,d2,,
3,5.0,R Grounded Pass,53:30:02,53:32:02,53:33:02,,B2,e2,TB,a5,,
4,6.0,R Grounded Pass,53:32:04,53:34:04,53:35:04,,TB,a5,B5,c2,,


### Step 3 - Data Manipulation

#### Turning the above dataframe to the final shots dataset that we will use.

In [4]:
# Things that need to be done:

# 1) New columns: isShot, shotDist, shotAng, isOnTarget, goalLoc, isGoal, isHeader, isThroughball, isBigChance, isCounter,isTapIn.
# 2) remove the unnecessary rows (passes) and columns.

df2.columns = df2.columns.str.replace(' ', '_')
df4 = df2.iloc[:,:-144]
df4.head()

Unnamed: 0,N,Category,Start,Click,End,Descriptors,Des_1,Des_2,Des_3,Des_4,Des_5,Des_6
0,1,Goal Kick,53:11:14,53:13:14,53:14:14,,G3,e12,TG,d7,,
1,1,R Grounded Pass,53:14:13,53:16:13,53:17:13,,TG,d7,TG,a9,,
2,2,R Grounded Pass,53:16:19,53:18:19,53:19:19,,G6,a10,G3,e12,,
3,3,R Grounded Pass,53:19:03,53:21:03,53:22:03,,G3,e12,G2,g6,,
4,1,Duel - Won,53:19:24,53:21:24,53:22:24,,G2,g6,B3,,,


### Step 4.1 - Calculating shot distance and shot angle using Des_2.

In [5]:
# How:
# 1) splitting the des_2 column, into separate alphabet and numeral columns.
# 2) changing the alphabets to respective numbers for the X co-ords.
# 3) finding out which goal (e0 or e13) the ball was shot at:
    # shots are generally shot to the goal closer to them, unless they are marked with a long range tag.
# 4) calculating the shot distance using pythagoras theorem.
# 5) calculating the angle using numpy arcsin.

ShotDict = {'a':'1','b':'2','c':'3','d':'4','e':'5','f':'6','g':'7','h':'8','i':'9'}
ShotDict1 = {'12':'1','11':'2','10':'3','9':'4','8':'5','7':'6','6':'7','5':'8','4':'9','3':'10','2':'11','1':'12'}
df3['shotXPrelim'] = df3.Des_2.str[0]
df3['shotYPrelim'] = df3.Des_2.str[1:]
df3['shotX'] = ''
df3['shotY'] = ''

for val in range(len(df3)):
    for key in df3.shotXPrelim.loc[val]:
        try:
            df3['shotX'].loc[val] = ShotDict[key]
        except:
            df3['shotX'].loc[val] = '0'

for val in range(len(df3)):
    for key in df3.shotYPrelim.loc[val]:
        if df3.Des_4.loc[val] == 'Long Range' or df3.Des_5.loc[val] == 'Long Range':
            try:
                df3['shotY'].loc[val] = ShotDict1[key]
            except:
                df3['shotY'].loc[val] = '0'
        else:
            try:
                df3['shotY'].loc[val] = df3['shotYPrelim'].loc[val]
            except:
                df3['shotY'].loc[val] = '0'

# changing columns to ideal dtypes
df3 = df3.astype({'shotYPrelim':'int64','shotY':'int64','shotX':'int64', 'Des_4':'str', 'Des_5':'str'})

# marking the correct goalpoint
df3['goalpoint'] = df3.apply(lambda row: 0 if row.shotYPrelim <=6 else 13, axis=1)

# finding shotDist and shotAng
df3['distY'] = abs(df3['goalpoint'] - df3['shotY'])
df3['distX'] = abs(5 - df3['shotX'])
df3['shotDist'] = df3.apply(lambda row: (row.distX**2 + row.distY**2)**0.5, axis=1)
df3['shotAng'] = df3.apply(lambda row: np.degrees(np.arcsin(row.distY/row.shotDist)), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [6]:
# dftest = df3[df3['shotDist'] > 7 ]
# dftest.head()

### Step 4.2 - Creating the other columns.

In [7]:
# Creating the other columns
# New columns: isShot, isOnTarget, goalLoc, isGoal, isHeader, isThroughball, isBigChance, isCounter,isTapIn.

df3['isShot'] = df3.apply(lambda row: 1 if 'Shot' in row.Category or 'Goal' in row.Category else 0, axis=1)
df3['isOnTarget'] = df3.apply(lambda row: 1 if 'On' in row.Category or 'Goal' in row.Category else 0, axis=1)
df3['isGoal'] = df3.apply(lambda row: 1 if 'Goal' in row.Category else 0, axis=1)
df3['isHeader'] = df3.apply(lambda row: 1 if 'H ' in row.Category else 0, axis=1)
df3['isBigChance'] = df3.apply(lambda row: 1 if 'Big' in row.Des_4 or 'Big' in row.Des_5 else 0, axis=1)
df3['isCounter'] = df3.apply(lambda row: 1 if 'Counter' in row.Des_4 or 'Counter' in row.Des_5 else 0, axis=1)
df3['isTapIn'] = df3.apply(lambda row: 1 if 'Tap' in row.Des_4 or 'Tap' in row.Des_5 else 0, axis=1)

# since pass types are collected in the previous event, we use the shift() function to obtain whether the pass was a throughball
df3['prevD5'] = df3['Des_5'].shift(1).astype(str)
df3['prevD6'] = df3['Des_6'].shift(1).astype(str)
df3['prevCat'] = df3['Category'].shift(1).astype(str)
df3['isThroughball'] = df3.apply(lambda row: 1 if 'Through' in row.prevD5 or 'Through' in row.prevD6 else 0, axis=1)
df3['isGround'] = df3.apply(lambda row: 1 if 'Ground' in row.prevCat else 0, axis=1)

# getting the location of the goal
df3['goalLoc'] = df3.Des_3
df3.head()

# adding a 'minute' feature to add another layer of analysis
df3['minute']=df3.Click.str[:2].astype(int)


# Test code to see if the changes are made
# dftest = df3[df3['Des_4'] == 'Counter Attack']
# dftest.head()

### Step 5 - Removing excess columns and rows.

In [8]:
# Removing non-shots for training.

dffinal = df3[df3['isShot']==1]
dffinal.head()
dffinal.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 479 entries, 9 to 2049
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N              479 non-null    float64
 1   Category       479 non-null    object 
 2   Start          479 non-null    object 
 3   Click          479 non-null    object 
 4   End            479 non-null    object 
 5   Descriptors    0 non-null      float64
 6   Des_1          479 non-null    object 
 7   Des_2          479 non-null    object 
 8   Des_3          479 non-null    object 
 9   Des_4          479 non-null    object 
 10  Des_5          479 non-null    object 
 11  Des_6          1 non-null      object 
 12  shotXPrelim    479 non-null    object 
 13  shotYPrelim    479 non-null    int64  
 14  shotX          479 non-null    int64  
 15  shotY          479 non-null    int64  
 16  goalpoint      479 non-null    int64  
 17  distY          479 non-null    int64  
 18  distX    

In [15]:
# removing the columns that won't be used for training.
dffinalexp = dffinal.drop(dffinal.columns[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,21,28,29,30]], axis=1)  # df.columns is zero-based pd.Index 

In [16]:
# saving the training datasets

dffinal.to_csv('xgdata.csv')
dffinalexp.to_csv('finalxgdata.csv', index = False)

In [17]:
# test
xgdata = pd.read_csv('finalxgdata.csv')
xgdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479 entries, 0 to 478
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   shotDist       479 non-null    float64
 1   shotAng        479 non-null    float64
 2   isOnTarget     479 non-null    int64  
 3   isGoal         479 non-null    int64  
 4   isHeader       479 non-null    int64  
 5   isBigChance    479 non-null    int64  
 6   isCounter      479 non-null    int64  
 7   isTapIn        479 non-null    int64  
 8   isThroughball  479 non-null    int64  
 9   isGround       479 non-null    int64  
 10  goalLoc        479 non-null    object 
 11  minute         479 non-null    int64  
dtypes: float64(2), int64(9), object(1)
memory usage: 43.1+ KB


In [18]:
from sklearn.model_selection import GridSearchCV