# Data Cleaning, converting to prepared data set

In [2]:
# All import insertions
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import isnull
from sklearn import preprocessing
import os    # For file paths
import re    # Regular Expressions

import matplotlib.pyplot as plt
%matplotlib inline

import datetime as dt

In [3]:
# Load the raw training data from the path relative to this notebook.
# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, so each column gets one consistently inferred dtype.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, so the
# CSV appears to carry a saved index - consider index_col=0 if it is not needed.
df = pd.read_csv("../data/raw/train.csv", low_memory=False)

In [4]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,...,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.4,81.99,177.18,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
2,2017090700,20170907000118,away,74.0,33.2,1.22,0.59,0.31,3.01,202.73,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
3,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,359.77,105.64,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,12.63,164.31,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW


In [None]:
# Number of records in the dataframe, as a (rows, columns) tuple.
df.shape

In [None]:
# Show the dtype of every column, then the column index.
# Only the last expression of a cell is rendered automatically, so the dtypes
# are printed explicitly; otherwise they would be computed and discarded.
print(df.dtypes)
df.columns

## Creating the new predictor variable

In [None]:
# New predictor variable: 1 when the first down was met, i.e. when Yards is at
# least Distance, and 0 otherwise.
df['isFirstDown'] = (df['Yards'] >= df['Distance']).astype(int)

## Fixing the Wind Speed and Wind Direction fields

In [None]:
# List the distinct raw WindSpeed values to see which formats need cleaning.
df['WindSpeed'].unique()

In [None]:
# Fixing the fields that are in the wrong columns.
# Rows whose WindSpeed contains a compass letter (N/E/S/W) are treated as having
# the two wind fields transposed. na=False keeps missing values unflagged.
dfindex = df['WindSpeed'].str.contains("[NESW]+", regex=True, na=False)

# Swap both columns for the flagged rows in one label-based assignment.
# Writing through df.loc (instead of df[col].iloc[...] = ...) guarantees the
# change reaches df itself and does not confuse index labels with positions.
# .values drops the column labels so pandas does not re-align the data and
# silently undo the swap.
df.loc[dfindex, ['WindSpeed', 'WindDirection']] = (
    df.loc[dfindex, ['WindDirection', 'WindSpeed']].values
)

# Filling the missing fields
df['WindSpeed'] = df['WindSpeed'].fillna(0)
df['WindDirection'] = df['WindDirection'].fillna('No Wind')

# Correcting the fields that have spare text.
# Keep the first number found in the string ("10mph" -> "10",
# "15 gusts up to 25" -> "15", "14-23" -> "14"). The previous pattern started
# with a greedy ".*", which swallowed every digit but the last one
# ("15" -> "5"). Results are assigned back rather than using inplace=True on
# a column selection, which is not guaranteed to modify df.
df['WindSpeed'] = df['WindSpeed'].replace(
    to_replace=r"^\D*(\d+(?:\.\d+)?).*$", value=r"\1", regex=True
)
df['WindSpeed'] = df['WindSpeed'].replace(to_replace="Calm", value=r"0", regex=True)


In [None]:
# Persist the dtype fixes. DataFrame.astype returns a new frame, so calling it
# without assigning the result leaves df unchanged.
df['WindSpeed'] = df['WindSpeed'].astype('float64')

# pd.to_datetime is used instead of astype('datetime64'): pandas 2 rejects the
# unit-less 'datetime64' dtype string.
df['TimeHandoff'] = pd.to_datetime(df['TimeHandoff'])
df['TimeSnap'] = pd.to_datetime(df['TimeSnap'])

# NOTE(review): GameClock used to be part of the (discarded) datetime cast. It
# is left untouched here: presumably it holds a game-clock reading rather than
# a full timestamp - confirm its format and convert deliberately
# (e.g. with pd.to_timedelta) if a numeric clock is needed.
df[['WindSpeed', 'TimeHandoff', 'TimeSnap', 'GameClock']].dtypes

## Creating a numerical field for home/away

In [None]:
# Numeric home/away flag: 1 where Team equals 'home', 0 for every other value.
df['isHomeTeam'] = (df['Team'] == 'home').astype(int)

In [None]:
# Spot-check the new flag against the Team column on the first 22 rows.
df[['Team','isHomeTeam']].head(22)

## Fixing the abbreviated team names

In [None]:
# Sorted list of the distinct home-team abbreviations.
df['HomeTeamAbbr'].sort_values().unique()

In [None]:
# Harmonise the team abbreviations used in FieldPosition, some of which differ
# from the spelling used in other columns. Rows without a field position are
# labelled 'Midfield' first.
df['FieldPosition'] = df['FieldPosition'].fillna('Midfield').replace(
    {'ARZ': 'ARI', 'BLT': 'BAL', 'CLV': 'CLE', 'HST': 'HOU'}
)

# Show the resulting distinct values to verify the clean-up.
df['FieldPosition'].sort_values().unique()

In [None]:
# Sorted distinct home-team abbreviations, for comparison with the other team columns.
df['HomeTeamAbbr'].sort_values().unique()

In [None]:
# Sorted distinct visitor-team abbreviations.
df['VisitorTeamAbbr'].sort_values().unique()

In [None]:
# Harmonise the team abbreviations used in PossessionTeam, some of which differ
# from the spelling used in other columns.
df['PossessionTeam'] = df['PossessionTeam'].replace(
    {'ARZ': 'ARI', 'BLT': 'BAL', 'CLV': 'CLE', 'HST': 'HOU'}
)

# Show the resulting distinct values to verify the clean-up.
df['PossessionTeam'].sort_values().unique()

In [None]:
# Build an integer code for every team: the sorted distinct home-team
# abbreviations are numbered from 1 upwards.
teamabtxt = df['HomeTeamAbbr'].sort_values().unique()
dctTeamab = {abbr: code for code, abbr in enumerate(teamabtxt, start=1)}

# Rows without a field position are labelled 'Midfield', which gets code 0.
df['FieldPosition'] = df['FieldPosition'].fillna('Midfield')
dctTeamab['Midfield'] = 0  # Mid-field

# The dict is indexed inside a lambda (rather than passed to Series.map
# directly) so that an abbreviation missing from dctTeamab raises KeyError
# instead of silently becoming NaN.
df['FieldPositionCode'] = df['FieldPosition'].map(lambda x: dctTeamab[x])
df['HomeTeamAbbrCode'] = df['HomeTeamAbbr'].map(lambda x: dctTeamab[x])
# Fixed: this column was previously derived from HomeTeamAbbr, which made it an
# exact copy of HomeTeamAbbrCode.
df['VisitorTeamAbbrCode'] = df['VisitorTeamAbbr'].map(lambda x: dctTeamab[x])
df['PossessionTeamAbbrCode'] = df['PossessionTeam'].map(lambda x: dctTeamab[x])


## Calculating the number of yards to the end zone, based on the yard line and which team has the ball

In [None]:
# Create the column filled with missing values; it is computed in a later step.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['YardsToTouchdown'] = np.nan

In [None]:
# Where FieldPosition differs from PossessionTeam the yard line itself is the
# distance; where they match, the distance is 50 + (50 - YardLine).
df['YardsToTouchdown'] = np.where(
    df['PossessionTeam'] != df['FieldPosition'],
    df['YardLine'],
    50 + (50 - df['YardLine']),
)

In [None]:
# Distinct YardLine values, to check the range feeding YardsToTouchdown.
df['YardLine'].unique()

In [None]:
# Sorted distinct YardsToTouchdown values, as a sanity check of the computed column.
df['YardsToTouchdown'].sort_values().unique()

## Calculating the number of seconds from snap to handoff

In [None]:
# Preview the two timestamp columns used for the snap-to-handoff duration.
df[['TimeHandoff','TimeSnap']].head()

In [None]:
# Seconds elapsed between TimeSnap and TimeHandoff: parse both columns as
# datetimes, subtract, and express the resulting timedelta in seconds.
df['TimeSnapDiff'] = (
    pd.to_datetime(df['TimeHandoff']) - pd.to_datetime(df['TimeSnap'])
).dt.total_seconds()

In [None]:
# Preview the source timestamps next to the derived duration in seconds.
df[['TimeHandoff','TimeSnap','TimeSnapDiff']].head()

In [None]:
# Number of rows per distinct TimeSnapDiff value.
df['TimeSnapDiff'].sort_values().value_counts()

## Converting the player height to inches

In [None]:
# Distinct raw PlayerHeight strings, to see the format that has to be parsed.
df['PlayerHeight'].unique()

In [None]:
# Split PlayerHeight into its leading digit group (stored as PlayerHeightFt)
# and its trailing digit group (stored as PlayerHeightIn). The values are still
# strings at this point.
# Raw string literals avoid the invalid '\d' escape warning, and expand=False
# returns a Series (one capture group) instead of a one-column DataFrame.
df['PlayerHeightFt'] = df['PlayerHeight'].str.extract(r'^(\d+)', expand=False)
df['PlayerHeightIn'] = df['PlayerHeight'].str.extract(r'(\d+)$', expand=False)

In [None]:
# Turn the extracted digit strings into 64-bit integers so they can be used in
# arithmetic.
df['PlayerHeightFt'] = df['PlayerHeightFt'].astype(np.int64)
df['PlayerHeightIn'] = df['PlayerHeightIn'].astype(np.int64)

In [None]:
# Total height in inches: twelve inches per foot plus the remaining inches.
df['PlayerHeightInches'] = (df['PlayerHeightFt'] * 12 + df['PlayerHeightIn']).astype('int64')

In [None]:
# Show the raw height next to the derived feet, inches and total-inches columns.
df[['PlayerHeight', 'PlayerHeightFt', 'PlayerHeightIn', 'PlayerHeightInches']]

In [None]:
# Number of rows per raw PlayerHeight value.
df['PlayerHeight'].sort_values().value_counts()

In [None]:
# Number of rows per PlayerHeightInches value; compare with the raw-height counts.
df['PlayerHeightInches'].sort_values().value_counts()

## Filling the empty fields

In [None]:
# Replace missing OffenseFormation values with the label 'EMPTY', then list the
# distinct values.
# NOTE(review): if 'EMPTY' is also a genuine formation name in this data, the
# filled rows become indistinguishable from it - confirm, or use a dedicated
# label for missing values.
df['OffenseFormation']=df['OffenseFormation'].fillna('EMPTY')
df['OffenseFormation'].unique()

In [None]:
# Replace missing DefendersInTheBox values with 0, then list the distinct values.
# NOTE(review): 0 acts as the "unknown" marker here; verify that downstream
# models should treat it as a real count.
df['DefendersInTheBox']=df['DefendersInTheBox'].fillna(0)
df['DefendersInTheBox'].unique()

In [None]:
# Replace missing Orientation values with 0, then list the distinct values.
# NOTE(review): presumably Orientation is an angle, in which case 0 is also a
# valid reading - confirm that this fill value is acceptable.
df['Orientation']=df['Orientation'].fillna(0)
df['Orientation'].unique()

In [None]:
# Replace missing Dir values with 0, then list the distinct values.
# NOTE(review): presumably Dir is an angle, in which case 0 is also a valid
# reading - confirm that this fill value is acceptable.
df['Dir']=df['Dir'].fillna(0)
df['Dir'].unique()

In [None]:
# Stadiums that have at least one row with a missing Humidity value.
df[df['Humidity'].isnull()]['Stadium'].unique()

In [None]:
# Humidity and Temperature for the Mercedes-Benz Stadium rows whose Humidity is
# missing. The commented lines are alternative look-ups for other stadiums.
df[(df['Stadium'] == 'Mercedes-Benz Stadium') & (df['Humidity'].isnull())][['Humidity','Temperature']]
#df[df['Stadium'] == 'AT&T Stadium'][['Humidity','Temperature']]
#df[df['Stadium'] == 'State Farm Stadium'][['Humidity','Temperature']]
#df[df['Stadium'] == 'U.S. Bank Stadium'][['Humidity','Temperature']]

In [None]:
# Write every row with a missing Humidity value to tmp.csv in the working
# directory, presumably for manual inspection. Other cells write to the same
# file name, so the file only reflects whichever of them ran last.
df[df['Humidity'].isnull()].to_csv("tmp.csv")

In [None]:
# Normalise the StadiumType spellings with a single mapping:
#   - misspellings and variants of "outdoor" become 'Outdoor'
#   - 'Indoor' / 'indoor' become 'Indoors'
#   - the stray values 'Heinz Field' and 'Cloudy' are treated as 'Outdoor'
# No replacement value is itself a key, so applying the mapping at once gives
# the same result as applying the replacements one after another.
df['StadiumType'] = df['StadiumType'].replace({
    'Oudoor': 'Outdoor',
    'Outdoors': 'Outdoor',
    'Outddors': 'Outdoor',
    'Outdor': 'Outdoor',
    'Ourdoor': 'Outdoor',
    'Outside': 'Outdoor',
    'OUTDOOR': 'Outdoor',
    'Indoor': 'Indoors',
    'indoor': 'Indoors',
    'Heinz Field': 'Outdoor',
    'Cloudy': 'Outdoor',
})


# Remaining categories and how many rows each one has.
df['StadiumType'].value_counts()

In [None]:
# Write all rows for 'TIAA Bank Field' to tmp.csv (overwriting any earlier
# tmp.csv). The commented line is an earlier variant of the same dump for the
# rows whose StadiumType was 'Cloudy'.
#df[df['StadiumType'] == 'Cloudy'].to_csv("tmp.csv")
df[df['Stadium'] == 'TIAA Bank Field'].to_csv("tmp.csv")


In [None]:
# For rows with a missing StadiumType: how many rows each stadium contributes.
df[df['StadiumType'].isnull()]['Stadium'].value_counts()

In [None]:
# Distinct StadiumType values recorded for MetLife Stadium. The commented lines
# are the same look-up for other stadiums.
df[df['Stadium'] == 'MetLife Stadium']['StadiumType'].unique()
#df[df['Stadium'] == 'StubHub Center']['StadiumType'].unique()
#df[df['Stadium'] == 'Dignity Health Sports Park']['StadiumType'].unique()
#df[df['Stadium'] == 'TIAA Bank Field']['StadiumType'].unique()

In [None]:
# Columns that still contain at least one missing value, with the number of
# missing entries in each.
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

# Extracting the numbers from the offensive and defensive line

In [None]:
# Write the row count per distinct OffensePersonnel value to tmp.csv
# (overwriting any earlier tmp.csv).
df['OffensePersonnel'].value_counts().to_csv("tmp.csv")

In [None]:
#df[df['OffensePersonnel'] == '1 RB, 2 TE, 3 WR'].to_csv("tmp.csv")
#df['Position'].value_counts()
#df['PlayId'].nunique()
#df[df['Position'] == 'QB']['PlayId'].nunique()


In [None]:
# Row count per distinct DefensePersonnel value.
df['DefensePersonnel'].value_counts()

## Removing unnecessary fields

In [1]:
# Remove the JerseyNumber column.
# columns= is required: DataFrame.drop looks for row labels by default, so
# drop(['JerseyNumber']) raises KeyError instead of removing the column. The
# result is assigned back because drop returns a new frame.
df = df.drop(columns=['JerseyNumber'])

NameError: name 'df' is not defined

### Save the new files to the appropriate directories

In [None]:
#df.to_csv("../data/interim/data_techcorr.csv")

In [None]:
# Keep only the rows where the player's NflId equals NflIdRusher, then report
# the size of that subset.
dfrun = df.loc[df["NflId"] == df["NflIdRusher"]]
dfrun.shape

In [None]:
# Save the filtered frame as the processed data set.
# NOTE(review): to_csv writes the row index as an extra unnamed column by
# default; pass index=False if consumers should not see it - confirm what the
# downstream loaders expect before changing it.
dfrun.to_csv("../data/processed/data_cleaned.csv")

In [None]:
# Final list of columns after cleaning.
df.columns

In [None]:
# Final check: columns that still contain missing values, with the number of
# missing entries in each.
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()