In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

# Instructions
1. We will be conducting the entire assignment through this notebook. You will be entering your code in the cells provided, and any explanation and details asked in markdown cells. 
2. You are free to add more code and markdown cells for describing your answer, but make sure they are below the question asked and not somewhere else. 
3. The notebook needs to be submitted on LMS. You can find the submission link [here](https://lms.iiitb.ac.in/moodle/mod/assign/view.php?id=13932). 
4. The deadline for submission is **5th October, 2020 11:59PM**.

# Data import
The data required for this assignment can be downloaded from the following [link](https://www.kaggle.com/dataset/e7cff1a2c6e29e18684fe6b077d3e4c42f9a7ae6199e01463378c60fe4b4c0cc), it's hosted on kaggle. Do check directory paths on your local system.  

In [None]:
# Root folder of the assignment data. Change this single constant when the
# notebook is run somewhere the data lives under a different directory.
DATA_DIR = "/kaggle/input/iiitb-ai511ml2020-assignment-1/Assignment"

# Part 1: student alcohol consumption survey.
alcdata = pd.read_csv(f"{DATA_DIR}/alcoholism/student-mat.csv", low_memory=False)
# Part 2: FIFA player profiles.
fifadata = pd.read_csv(f"{DATA_DIR}/fifa18/data.csv", low_memory=False)
# Part 3: UK road accidents, delivered as three files that are merged later.
accidata1 = pd.read_csv(f"{DATA_DIR}/accidents/accidents_2005_to_2007.csv", low_memory=False)
accidata2 = pd.read_csv(f"{DATA_DIR}/accidents/accidents_2009_to_2011.csv", low_memory=False)
accidata3 = pd.read_csv(f"{DATA_DIR}/accidents/accidents_2012_to_2014.csv", low_memory=False)

# Part - 1
## Alcohol Consumption Data
The following data was obtained in a survey of students' math course in secondary school. It contains a lot of interesting social, gender and study information about students. 


In [None]:
# Column names, dtypes and non-null counts of the alcohol-consumption data.
alcdata.info()

In [None]:
# Number of missing values in every column.
alcdata.isnull().sum()

From the above output we can see that all the data is present and there is no missing value. Hence we need not check for missing values. Now we shall work on encoding the categorical data.

In [None]:
# Replace the text labels of the listed columns with integer codes.
le = LabelEncoder()

le_cols = ['sex', 'address', 'famsize', 'Pstatus']

# The encoder is re-fitted on each column in turn, so every column gets its
# own label -> code mapping.
alcdata[le_cols] = alcdata[le_cols].apply(le.fit_transform)
alcdata[le_cols].head()

In [None]:
bin_cols = ['schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']

# Each listed column becomes 1 where the answer contains 'yes' and 0 otherwise.
alcdata = alcdata.assign(**{
    name: np.where(alcdata[name].str.contains('yes'), 1, 0) for name in bin_cols
})

alcdata[bin_cols].head()

In [None]:
# One-hot encode the columns that pd.get_dummies still treats as categorical;
# the new indicator columns are named <column>_<category>.
alcdata = pd.get_dummies(alcdata, prefix_sep='_')
alcdata.head()

In [None]:
# Column names after encoding, used to pick the features plotted below.
alcdata.columns

### 1. Try to visualize correlations between various features and grades and see which features have a significant impact on grades. 
Try to engineer the three grade parameters (G1, G2 and G3) as one feature for such comparisons.

Let's first introduce a new column, `G`, holding the average of the three grades, which can be used as the single grade feature.

In [None]:
# Average of the three grade columns, used as the single grade feature 'G'.
alcdata['G'] = alcdata[['G1', 'G2', 'G3']].mean(axis=1)

cat_vars1 = ['age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu']

# One split violin per feature: grade distribution by feature value, by sex.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(24, 16))
for feature, panel in zip(cat_vars1, axes.ravel()):
    sns.violinplot(data=alcdata, x=feature, y='G', hue='sex', split=True, ax=panel)
plt.show()

**Age**: We see that for both males and females grades fall as age increases and then rise again at the highest ages, hence we can tell that they are mildly correlated.

**Address, Famsize**: The violin plot seems to be evenly distributed, hence not that much correlation

**Pstatus**: Males, whose parents are apart tend to perform better. If parents are together even violin plot is observed

**Medu**: The violin plot seems to show some correlation as the bump seems to be increasing with increase in education of mother.

**Fedu**: The violin plot seems to show some correlation as the bump seems to be increasing with increase in education of father.

In [None]:
cat_vars2 = ['traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid']

# Grade distribution per feature value, split by sex, on a 2x3 grid.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(24, 16))
for feature, panel in zip(cat_vars2, axes.ravel()):
    sns.violinplot(data=alcdata, x=feature, y='G', hue='sex', split=True, ax=panel)
plt.show()

**traveltime**: A minor correlation is seen as with an increase in travel time the mean has gone down

**studytime**: Correlation is observed as with increase in study time, grades went up

**failures**: Heavy correlation is observed as the student with past failures tends to perform poorly

**schoolsup**: Minor correlation as number of people with less grades have reduced on having school support

**famsup**: No correlation

**paid**: Minor correlation as number of people with less grades have reduced on taking paid extra tutions

In [None]:
cat_vars3 = ['activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel']

# Grade distribution per feature value, split by sex, on a 2x3 grid.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(24, 16))
for feature, panel in zip(cat_vars3, axes.ravel()):
    sns.violinplot(data=alcdata, x=feature, y='G', hue='sex', split=True, ax=panel)
plt.show()

**activities**: Not much correlation is observed

**nursery**: Not much correlation is observed

**higher**: Students who want to take higher education tend to mostly perform better than the ones who dont want to take it. Correlation observed

**internet**: Students with internet tend to perform a bit better than the ones who dont

**romantic**: No correlation is observed

**famrel**: The curve looks parabolic hence a minor correlation is observed

Similarly from the other plots we can deduce out interpretations on how much they are correlated

In [None]:
cat_vars4 = ['freetime', 'goout', 'Dalc', 'Walc', 'health']

# Grade distribution per feature value, split by sex.
fig, axes = plt.subplots(2, 3, sharex=False, sharey=False, figsize=(24, 16))
panels = axes.flatten()
for i, ax in zip(cat_vars4, panels):
    sns.violinplot(split=True, data=alcdata, x=i, y='G', hue='sex', ax=ax)
# Five features on a 2x3 grid leave the last panel unused; hide it so the
# figure does not show an empty set of axes.
for ax in panels[len(cat_vars4):]:
    ax.set_visible(False)
plt.show()

**freetime**: It is a bit unpredictable as the mean keeps fluctuating

**goout**: Not much changes are observed hence no correlation

**Dalc**: Students with higher daily alcohol consumption tend to perform worse than the ones who don't drink as much.

**Walc**: Students with less weekly alcohol consumption especially males tend to perform better than others.

**Health**: Students with poor health seem to have better grades compared to the others, and correlation can be observed here.

In [None]:
cat_vars5 = ['school_GP', 'school_MS']

# Grade distribution for each school indicator column, split by sex.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
for feature, panel in zip(cat_vars5, axes.ravel()):
    sns.violinplot(data=alcdata, x=feature, y='G', hue='sex', split=True, ax=panel)
plt.show()

School is not correlated with grades as there is almost no difference in the plots.

In [None]:
cat_vars6 = ['Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher']

# Grade distribution for each mother's-job indicator column, split by sex.
fig, axes = plt.subplots(2, 3, sharex=False, sharey=False, figsize=(24, 16))
panels = axes.flatten()
for i, ax in zip(cat_vars6, panels):
    sns.violinplot(split=True, data=alcdata, x=i, y='G', hue='sex', ax=ax)
# Five indicators on a 2x3 grid leave the last panel unused; hide it.
for ax in panels[len(cat_vars6):]:
    ax.set_visible(False)
plt.show()

Mother's job seems to be heavily correlated as the peak of the violin plot keeps shifting

In [None]:
cat_vars7 = ['Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher']

# Grade distribution for each father's-job indicator column, split by sex.
fig, axes = plt.subplots(2, 3, sharex=False, sharey=False, figsize=(24, 16))
panels = axes.flatten()
for i, ax in zip(cat_vars7, panels):
    sns.violinplot(split=True, data=alcdata, x=i, y='G', hue='sex', ax=ax)
# Five indicators on a 2x3 grid leave the last panel unused; hide it.
for ax in panels[len(cat_vars7):]:
    ax.set_visible(False)
plt.show()

Father's job is heavily correlated with grades, as we can clearly interpret from violin plots

In [None]:
cat_vars8 = ['reason_course', 'reason_home', 'reason_other', 'reason_reputation']

# Grade distribution for each school-choice-reason indicator, split by sex.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(24, 16))
for feature, panel in zip(cat_vars8, axes.ravel()):
    sns.violinplot(data=alcdata, x=feature, y='G', hue='sex', split=True, ax=panel)
plt.show()

Reason of school choice is heavily correlated as the mean as well the bump is shifting

In [None]:
cat_vars9 = ['guardian_father', 'guardian_mother', 'guardian_other']

# Grade distribution for each guardian indicator column, split by sex.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(24, 10))
for feature, panel in zip(cat_vars9, axes.ravel()):
    sns.violinplot(data=alcdata, x=feature, y='G', hue='sex', split=True, ax=panel)
plt.show()

Minor correlation observed for a guardian

### 2. If there is a need for encoding some of the features,  how would you go  about it? 
Would you consider combining certain encodings together ?


As the encodings were already performed earlier: binaries of the form Yes/No were encoded as 1/0 respectively by simple binary encoding, and the other categorical data (for example Father's/Mother's job, reason to join the school etc.) was one hot encoded. I found that creating new indicator columns from the dummies of a column and checking their truth values brought more insight into the data. In cases where we would like to preserve an order among the categories we would use Ordinal Encoding instead.

You can surely combine encodings. In the previous question we used `sex` as a hue, so for example, taking `address` and `sex`, we can combine their encodings to give rise to a new encoding which shows us where a person lives and whether they are male or female. But I would mostly refrain from doing that, as such data can be plotted visually by using hues, and adding it to the given dataframe does not add any value for now.

But just as an example:

In [None]:
# Work on a copy so the example columns are not silently added to `alcdata`.
alcdata_enc = alcdata.copy()

# LabelEncoder numbers the labels in sorted order. Assuming the raw labels were
# 'F'/'M' for sex and 'R'/'U' for address (TODO confirm against the raw CSV),
# that gives sex: F -> 0, M -> 1 and address: R -> 0, U -> 1.
is_urban = alcdata_enc['address'] == 1
is_male = alcdata_enc['sex'] == 1

# One indicator column per (address, sex) combination.
alcdata_enc['U_M'] = np.where(is_urban & is_male, 1, 0)
alcdata_enc['U_F'] = np.where(is_urban & ~is_male, 1, 0)
alcdata_enc['R_M'] = np.where(~is_urban & is_male, 1, 0)
alcdata_enc['R_F'] = np.where(~is_urban & ~is_male, 1, 0)


### 3. Try to find out how family relation(famrel) and parents cohabitation(Pstatus) affect grades of students. 

In [None]:
# Grade distribution per family-relationship score, one violin per Pstatus code.
# Pstatus codes: 0 = Away, 1 = Together
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(x='famrel', y='G', hue='Pstatus', data=alcdata, ax=ax)

The above violin plot gives us a clear idea on how Famrel and Pstatus affect grades. We can infer the following:

- Famrel 1: It is observed that among all the observations most of the students whose parents are apart tend to perform in the average range, and the curve is equally distributed from lowest to highest grade. This indicates that, if the relation with family is very bad, then your grade is fluctuating depending of your Pstatus.

- Famrel 2: The mean appears to be same for both the Pstatus, but for students whose parents are together, tend to have a minute bimodal curve towards the lower grade. But overall in this case students whose parents are apart tend to slightly perform better than the others.

- Famrel 3: This is a really odd curve. On one hand we observe that for students whose parents are away, most students have a much better grade average compared to the others, but on the other hand for the students whose parents are together, there is a small chance that they score higher (evident from the top of the violin).

- Famrel 4: In this case the students whose parents are together perform better and many of them are in the given average range

- Famrel 5: Here the students whose parents are apart completely outshine the ones whose parents are together and have a better average, higher and a lower grade than the others.

Looking at the average grades we see that students whose parents are apart tend to perform better in most cases. On the other hand, we see that the healthy relationship with family does bring a small change initially, but doesn't matter if it is high.


### 4. Figure out which features in the data are skewed, and propose a way to remove skew from all such columns. 


As we know skew is plotted for columns which have continuous data. And some of them in our df are
- age
- absences

Let's check if the data is skewed and let's see if we can remove it.

In [None]:
# Distribution of student ages (default binning).
plt.hist(alcdata['age'])

In [None]:
# Distribution of the number of absences, in 20 bins.
plt.hist(alcdata['absences'], bins=20)

In [None]:
# Compare three skew-reducing transforms of `absences` side by side.
absences = alcdata['absences'].astype(float)

# log(0) is -inf, so zeros are replaced by 0.000001 for the log panel only.
# This is done on a derived Series: `alcdata` itself is left untouched and the
# sqrt / cbrt panels see the original values.
absences_for_log = absences.replace(0.0, 0.000001)

transformed = [
    ('log-skew-remove', np.log(absences_for_log)),
    ('sqrt-skew-remove', np.power(absences, 1/2)),
    ('cbrt-skew-remove', np.power(absences, 1/3)),
]

fig, axes = plt.subplots(1, 3, figsize=(20, 5))
for ax, (title, values) in zip(axes, transformed):
    ax.set_title(title)
    ax.hist(values, bins=20)

plt.show()

We can see that we get a normalised graph with the skew removed in the log applied graph. The bar on the left is present as the value of 0 has been replaced by 0.000001 to prevent log from going to negative infinity. The other graphs dont have much of skew removed and they are mostly right skewed

# Part - 2
## FIFA 2019  Data


In [None]:
# Column names, dtypes and non-null counts of the FIFA player data.
fifadata.info()

From the above command we see that there are 89 columns in each player's profile. We can now proceed with the questions given to us.

### 1. Which clubs are the most economical? How did you decide that?

#### Preprocessing

The initial analysis will begin with preprocessing the given data to us. A few columns/metrics that might affect economy of a club are as follows:

- Overall performance of player
- Potential of the player
- Wage of the player
- Value of the player

We shall first begin with Data Sanitisation and filling missing value data.

In [None]:
# Missing-value counts for the four columns used in the economy analysis.
# They are selected by name rather than by column position (7, 8, 11, 12), so
# the check keeps working if the column order of the file changes.
fifadata[['Overall', 'Potential', 'Value', 'Wage']].isnull().sum()

From this we can clearly infer that there is no missing data in the selected columns. We now go ahead for outlier detection if there is any. 

Before going ahead, we must check that the given data is of the right format, and if there are any wrong data, we must sanitise them.

In [None]:
import re

# Print every 'Overall' entry that is not made up of digits only.
digits_only = re.compile(r"^\d+$")
for value in fifadata['Overall']:
    if digits_only.match(str(value)) is None:
        print(value)

In [None]:
# Print every 'Potential' entry that is not made up of digits only.
for value in fifadata['Potential']:
    is_integer_text = re.match(r"^\d+$", str(value)) is not None
    if not is_integer_text:
        print(value)

In [None]:
# Print every 'Value' entry that is not of the form €<number> with an optional
# M or K suffix. The entry is converted with str() first so that a missing
# (non-string) value is reported instead of raising a TypeError in re.match.
for i in fifadata['Value']:
    if not re.match(r"^€\d+\.?\d*(M|K)?$", str(i)):
        print(i)

In [None]:
# Print every 'Wage' entry that is not of the form €<number> with an optional
# M or K suffix.
for value in fifadata['Wage']:
    if re.match(r"^€\d+\.?\d*(M|K)?$", value) is None:
        print(value)

Before we proceed let's remove the € and M/K from the Wages and Values column, so that it will make tasks easier for us later.

In [None]:
def curr_clear(inp):
    """Convert a currency string such as "€110.5M" or "€565K" to a float.

    Parameters
    ----------
    inp : str or number
        Text with an optional leading/trailing "€" and an optional "M"
        (millions) or "K" (thousands) suffix. A value that is not a string is
        treated as already converted and returned as a float, so the
        conversion cell can be re-run safely.

    Returns
    -------
    float
        The amount in plain units; NaN when the text holds no amount
        (for example "" or "€").

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    if not isinstance(inp, str):
        return float(inp)

    amount = inp.strip("€")
    if not amount:
        # Nothing left after removing the currency sign.
        return float("nan")

    multipliers = {'M': 10**6, 'K': 10**3}
    suffix = amount[-1]
    if suffix in multipliers:
        return float(amount[:-1]) * multipliers[suffix]
    return float(amount)

def curr_clear_1(inp):
    """Convert a currency string to a float, leaving non-string values as-is.

    Variant of the currency conversion for columns that contain missing
    values: anything that is not a string (e.g. a NaN float, or a number that
    was already converted) is returned unchanged.

    Parameters
    ----------
    inp : str or any
        Text with an optional leading/trailing "€" and an optional "M"
        (millions) or "K" (thousands) suffix, or a non-string value.

    Returns
    -------
    float or the input itself
        The amount in plain units for string input (NaN when the text holds no
        amount, e.g. "€"); the untouched input otherwise.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    if not isinstance(inp, str):
        return inp

    amount = inp.strip("€")
    if not amount:
        # Nothing left after removing the currency sign.
        return float("nan")

    multipliers = {'M': 10**6, 'K': 10**3}
    suffix = amount[-1]
    if suffix in multipliers:
        return float(amount[:-1]) * multipliers[suffix]
    return float(amount)

In [None]:
# Convert the three money columns from text to numbers. 'Release Clause' goes
# through curr_clear_1, the variant that passes float entries through unchanged.
money_converters = (
    ('Release Clause', curr_clear_1),
    ('Value', curr_clear),
    ('Wage', curr_clear),
)
for column, converter in money_converters:
    fifadata[column] = fifadata[column].apply(converter)

In [None]:
# Wage and Value of every player against the row-number column, side by side,
# as a quick visual check for implausible values.
fig, (wage_ax, value_ax) = plt.subplots(1, 2, figsize=(12, 8))
sns.scatterplot(x="Unnamed: 0", y="Wage", data=fifadata, ax=wage_ax)
sns.scatterplot(x="Unnamed: 0", y="Value", data=fifadata, ax=value_ax)

Since all the columns in consideration pass the `regex` check and the scatter plots look correct which suggests minimal presence of outliers in the data, we shall go ahead by answering the given question.

First we look at how the Wages are distributed among clubs. Remember that visualisation here would be a bit difficult due to the presence of many clubs.

Hence we look at the sorted club-wise sums of wages and of values.

In [None]:
# Number of players in each club (one entry per club name).
club_count = fifadata.groupby("Club").size()

# Wage column grouped by club; the grouping is reused by later cells.
club_wage = fifadata.groupby("Club")['Wage']
# Total wage bill per club, highest first.
club_wage.sum().sort_values(ascending=False)

In [None]:
# Value column grouped by club.
club_value = fifadata.groupby("Club")['Value']

# Total player value per club, highest first.
club_value.sum().sort_values(ascending=False)

On reading a bit about the player value I have understood the following. This is not that great a metric for this question, as value mostly deals with the player transfer rates and player actual worth. I have understood these details from [this website](https://www.espn.in/football/blog/espn-fc-united/68/post/3287187/fc-100-new-measurement-player-value-by-football-whispers). Our main aim is to find how much a player is currently doing in the club and hence anything related to Value need not be considered anymore.

In [None]:
# Potential column grouped by club; the grouping is reused by later cells.
club_potential = fifadata.groupby("Club")['Potential']

# Sum of player Potential per club, highest first.
club_potential.sum().sort_values(ascending=False)

In [None]:
# Overall column grouped by club; the grouping is reused by later cells.
club_overall = fifadata.groupby("Club")['Overall']

# Sum of player Overall per club, highest first.
club_overall.sum().sort_values(ascending=False)

#### Analysis

What could be the metric to consider how economic a club is. A simple and the most intuitive way is to check how much `Wage` a player gets for 1 unit of `Overall` skill. We can similarly compare `Wage` vs `Potential` too

In [None]:
# Per-club ratios: total Wage per point of total Overall ('WO') and per point
# of total Potential ('WP').
# NOTE(review): club_count is the Series returned by groupby("Club").size(), so
# each assignment below appears to store a whole Series under a single label of
# that Series rather than adding a column. Later cells read the ratios back with
# club_count['WO'] / club_count['WP']; confirm before using club_count itself in
# arithmetic.
club_count['WO'] = club_wage.sum()/club_overall.sum()
club_count['WP'] = club_wage.sum()/club_potential.sum()

In [None]:
# Five clubs with the lowest wage per point of Overall.
club_count['WO'].sort_values().head()

The above data shows that for 1 unit of skill the club pays the Player the given amount of money. This could be a **good** metric see how economical a club is. We consider other factors too in the analysis below.

In [None]:
# Five clubs with the lowest wage per point of Potential.
club_count['WP'].sort_values().head()

The above data shows how teams are planning to choose their players in terms of skill development. This is a metric of whether a club chooses a player on the basis of his previous performances and is willing to make a future investment. This is comparatively a poorer metric than the previous one, but it will still do well.

In [None]:
# Bar charts of the five lowest wage-per-Overall (left) and wage-per-Potential
# (right) clubs.
fig = plt.figure(figsize=(15, 3))

for position, ratio in enumerate(('WO', 'WP'), start=1):
    ax = fig.add_subplot(1, 2, position)
    club_count[ratio].sort_values().head().plot(kind='bar', ax=ax)

#### Conclusion 1

The above plots show that in terms of the current club performance, the most economical club is _**Spartak Moscow**_, but when it comes to the club's future investment, the most economical club is _**Shakhtar Donetsk**_.

To see how player's performance is related to the wage the player earns, we shall plot a graph, and check how the data looks like, so that we can determine a metric to analyse how economical the club is.

In [None]:
fig, (data_ax, model_ax) = plt.subplots(1, 2, figsize=(13, 8))

# Left: observed Wage as a function of Overall.
sns.lineplot(x="Overall", y="Wage", data=fifadata, ax=data_ax)

# Right: the hand-picked approximation Wage = 1.15 ** Overall on Overall 50..94.
x = np.arange(50, 95)
y = np.power(1.15, x)
model_ax.plot(x, y)
plt.show()

After doing a quick trial and error we observe that the function **Wage=1.15<sup>Overall</sup>** approximately behaves like the plotted curve. We shall use this for our economy estimation. For a more accurate result we could have built a model to understand the relation in a much better way.

From this we can clearly infer that Wage and Overall performance of a player are not linearly dependent.

So from this we predict a `club average wage` for a given `club average overall`. We will then find the difference between the actual club average wage and the predicted one. The lesser it is, more the economical it is.

In [None]:
# Players per club, recomputed here so that nothing stored on `club_count` by
# earlier cells can leak into the arithmetic.
club_size = fifadata.groupby("Club").size()

# Expected wage bill of a club: 1.15 ** (average Overall of its players) per
# player, times the number of players.
club_pred_wage = (1.15 ** (club_overall.sum() / club_size)) * club_size

# Actual wage bill minus the expected one. The more negative the value, the
# less the club pays relative to the skill of its squad.
club_economy = club_wage.sum() - club_pred_wage

# The five most economical clubs as {club: economy}, smallest value first.
prediction = club_economy.nsmallest(5).to_dict()
prediction

#### Conclusion 2

From the above answer we find almost the same teams in the top 5 economical clubs. Here economical means, how much the club saved as a whole by hiring players as per their Overall skill. Negative value implies that the clubs have saved a lot in these terms. And hence in this way too we can conclude that these are the most economical teams.

So based on how much a team should have spent per the skill of a player, the most economical club is _**SL Benfica**_

### 2. What is the relationship between age and individual potential of the player? How does age influence the players' value? At what age does the player exhibit peak pace ?

#### Preprocessing

In [None]:
# Number of missing values in the Age column.
fifadata['Age'].isnull().sum()

In [None]:
# Outlier detection: box plot of player ages.
sns.boxplot(x=fifadata['Age'])

There seem to be no missing/redundant data over here, hence we shall go ahead.

In [None]:
# Distribution of Potential for every age.
fig, ax = plt.subplots(figsize=(18, 8))
sns.violinplot(data=fifadata, x="Age", y="Potential", ax=ax)

#### Analysis

We have used a violin-plot as it would be easy to understand on how a player's age is related to their potential

We know that the width of violin plot gives us an idea on frequency of being in that region. From the plot we can infer the following

Potential is the measure of how good a player will be in upcoming times. So it means that the potential curve must not vary greatly according to age as the potential of a player at current age will be usually similar to the potential of the player in future unless there are major gameplay improvements in player.

- As the player crosses 35 years we see that the violin plots become thinner, indicating that the probability of reaching the average potential is low.

- Players aged 44 are shown to have very low potential, because age plays a huge role in the decrease of a player's potential.

- Till age 33 a player is observed to reach the maximum potentials, and after that the maximum potential and minimum potential start decreasing, indicating that potential does decrease after a certain age.

Lets us now analyse how for a given potential what age bracket people come in.

In [None]:
# Age range of the players at every Potential value.
fig, ax = plt.subplots(figsize=(18, 8))
sns.boxplot(data=fifadata, x="Potential", y="Age", palette='viridis_r', ax=ax)

- From the above box plot we get a clarity that most of the people in the lower potential are older compared to most of the players. This is evident in the potential range 48-54

- It is very evident that high potential players are usually in the range 23-27 years

- Between potential ranges 62-86 there is an almost steady drop in age with increase in potential, which implies that younger players show more potential compared to the older players.

- Potential range 86-95 seems otherwise. The age of the player seems to be high during this age. Which implies that a player might achieve their peak potential if it is > 90 during their 25-30's

Preprocessing for player's age and value has been done in the previous cells, so we will jump right into data analysis.

In [None]:
# Player Value for every age. catplot is a figure-level function that creates
# its own figure (sized through height/aspect), so no plt.figure() call is made
# beforehand; one would only produce an extra empty figure.
sns.catplot(x="Age", y="Value", data=fifadata, height=6, aspect=1.9, palette='viridis_r')

The `catplot` above gives us a clear idea on how the `Value` of the player changes with their `Age`. 

- Younger players(Age<21) tend to have lesser value, and the players seem to have peak value during the ages 23-28.
- The value of the player reduces as they grow older, which is clearly evident during the ages 31-45.
- The value of the players increases till 27 and then starts decreasing. This means that older players are less valued.

The third part of this question is a bit vague, as I felt it would be hard to predict the pace of the players at certain age without making a model. Anyways here is the plot on how player's `SprintSpeed` is affected with respect to `Age`.

#### Preprocessing

First let us check for any presence of missing values

In [None]:
# Number of players without a SprintSpeed value.
fifadata['SprintSpeed'].isnull().sum()

We see that 48 Players don't have a `SprintSpeed` value. We shall fill this with the median `SprintSpeed`.

In [None]:
# Fill missing SprintSpeed values with the column median. The result is
# assigned back to the column: calling fillna(inplace=True) on a selected
# column is not guaranteed to update the DataFrame itself.
fifadata['SprintSpeed'] = fifadata['SprintSpeed'].fillna(fifadata['SprintSpeed'].median())

Let's check for the presence of any outliers. We know that all the values of `SprintSpeed` must be between 1-100 and anything greater or lesser than that means incorrect value. 

In [None]:
# SprintSpeed of every player against the row-number column, to spot values
# outside the expected range.
sns.scatterplot(x="Unnamed: 0", y = 'SprintSpeed', data = fifadata)

We can clearly see that player speed is within the desired value, hence there are no outliers. Now let's analyse the data.

#### Analysis

In [None]:
# SprintSpeed distribution for every age.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=fifadata, x='Age', y='SprintSpeed', palette='viridis_r', ax=ax)

The box plot clearly shows a trend that the younger players have a really good SprintSpeed and as time passes, the average SprintSpeed of the players reduces, as expected.

### 3. What skill sets are helpful in deciding a player's potential? How do the traits contribute to the players' potential? 

According to me a player's potential is determined on various factors. This is a mixture of skill sets and traits .A few of them are 

- Age
- Body Mass Index (Using the height and Weight)
- Dribbling
- Finishing
- ShortPassing
- BallControl
- SprintSpeed
- Agility
- Stamina
- Vision
- Work Rate
- Body Type

It might not be dependant on 

- Release Clause
- Preferred Foot

In [None]:
def height(inp):
    """Convert a feet'inches string such as "5'7" to centimetres.

    Parameters
    ----------
    inp : str or number
        Height written as <feet>'<inches>. A value that is not a string is
        assumed to be a height that was already converted and is returned as a
        float, so the conversion cell can be re-run safely.

    Returns
    -------
    float
        Height in centimetres, rounded to one decimal place.
    """
    if not isinstance(inp, str):
        return float(inp)

    parts = inp.split("\'")
    h_foot = float(parts[0])
    h_inch = float(parts[1])
    # 12 inches per foot, 2.54 cm per inch.
    h_inch += (h_foot)*12
    h_cm = round(h_inch * 2.54, 1)
    return float(h_cm)
    
def weight(inp):
    """Convert a weight string such as "159lbs" to a float (same unit).

    Parameters
    ----------
    inp : str or number
        Weight text with an optional "lbs" suffix. A value that is not a string
        is assumed to be already converted and is returned as a float, so the
        conversion cell can be re-run safely.

    Returns
    -------
    float
        The numeric part of the weight.

    Raises
    ------
    ValueError
        If the text, after removing a trailing "lbs", is not a number. This
        replaces blindly cutting the last three characters, which turned e.g.
        "70kg" into 7.0 without any error.
    """
    if not isinstance(inp, str):
        return float(inp)

    text = inp.strip()
    if text.endswith("lbs"):
        text = text[:-3]
    return float(text)

In [None]:
# Number of missing values in the Height and Weight columns.
fifadata[['Height', 'Weight']].isnull().sum()

In [None]:
# Replace missing Height / Weight entries with the most frequent value of the
# respective column.
for column in ('Height', 'Weight'):
    most_common = fifadata[column].mode().iloc[0]
    fifadata[column] = fifadata[column].fillna(most_common)

In [None]:
# Data sanitisation: turn the Height and Weight text into numbers using the
# helper functions defined above.
for column, converter in (('Height', height), ('Weight', weight)):
    fifadata[column] = fifadata[column].apply(converter)

In [None]:
# First rows of the converted Height and Weight columns.
fifadata[['Height', 'Weight']].head()

In [None]:
# BMI = weight [kg] / height [m] ** 2, rounded to two decimals. Weight is
# multiplied by 0.453592 (kg per lb) and Height divided by 100 (cm -> m). The
# computation runs on whole columns instead of row by row.
weight_kg = fifadata['Weight'] * 0.453592
height_m = fifadata['Height'] / 100
fifadata['bmi'] = (weight_kg / height_m ** 2).round(2)
fifadata['bmi'].head()

In [None]:
# Potential plus the candidate features examined for their relation to it.
pot_cols = ['Potential', 'Age', 'Release Clause', 'bmi', 'Preferred Foot', 'Dribbling', 'Finishing', 'ShortPassing', 'BallControl', 'SprintSpeed', 'Agility', 'Stamina', 'Vision', 'Work Rate', 'Body Type']
# Missing values per selected column.
fifadata[pot_cols].isnull().sum()

In [None]:
# Rows without a Preferred Foot are dropped and a missing Release Clause is
# treated as 0. The fill is applied to pot_df only (a copy), so fifadata keeps
# its original Release Clause values for the later wage analysis.
pot_df = fifadata.dropna(subset=["Preferred Foot"]).copy()
pot_df['Release Clause'] = pot_df['Release Clause'].fillna(0)
pot_df[pot_cols].isnull().sum()

In [None]:
# Potential against BMI with a fitted regression of order 1 (linear) and 2.
point_style = {'alpha': 0.3, 'color': 'y'}
for degree in (1, 2):
    sns.lmplot(x='bmi', y='Potential', data=pot_df, order=degree, scatter_kws=point_style)

So from this we can infer that BMI doesn't play a drastic role in the Potential of the player. This is because most of the players tend to be fit, and a BMI of anywhere between 18 and 25 is considered fit. Hence we don't observe a correlation.

In [None]:
# Correlation is only defined for numeric columns. The non-numeric columns in
# pot_cols are dropped explicitly, because DataFrame.corr() raises on them in
# recent pandas versions instead of silently skipping them.
numeric_pot = pot_df[pot_cols].select_dtypes(include='number')
corr = numeric_pot.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr, annot=True, vmin=-1, cmap="coolwarm")

The correlation plot completely refutes what we had assumed.

- Age: Age is negative correlated with potential stating that more the age less the potential, but the correlation magnitude is low.
- Release Clause: This was completely unexpected, among all the considered parameters the magnitude of correlation is highest which implies that higher the Potential of the player, more the release clause and vice versa.
- BMI has been discussed in previous cells
- The other parameters are moderately correlated and lie between 0.22 to 0.37

### 4. Which features directly contribute to the wages of the players?

These are the set of 4 features which I believe might directly contribute to wages:

- Overall
- Potential
- International Reputation
- Age

These are the set of 4 features which I believe might not directly contribute to wages:

- Release Clause
- Height
- Weight
- Skill Moves

Credits: I have looked at [this notebook](https://www.kaggle.com/chirag02/fifa19-player-wage-prediction) and found the visualisations used here to be beautiful, hence I am going with it

In [None]:
# Wage against Overall with a fitted regression of order 1 (linear) and 2.
point_style = {'alpha': 0.3, 'color': 'y'}
for degree in (1, 2):
    sns.lmplot(x='Overall', y='Wage', data=fifadata, order=degree, scatter_kws=point_style)

From the above graphs we can see that the second order fit matches the given scatter plot much better. In fact the scatter plot looks more exponential, so we can see a clear relation between the Overall performance of a player and their wage.

In [None]:
# Wage against Potential with a fitted regression of order 1, 2 and 3.
point_style = {'alpha': 0.3, 'color': 'y'}
for degree in (1, 2, 3):
    sns.lmplot(x='Potential', y='Wage', data=fifadata, order=degree, scatter_kws=point_style)

We can also see how the Wage is correlated with the potential. It is almost in third degree relationship with potential

In [None]:
# Wage against International Reputation with a linear regression fit.
point_style = {'alpha': 0.3, 'color': 'y'}
sns.lmplot(x='International Reputation', y='Wage', data=fifadata, scatter_kws=point_style)

We can see that with an increasing International Reputation, the wages of the player also tend to increase. Hence they are correlated too.

In [None]:
# Wage against Age with a fitted regression of order 1 (linear) and 2.
point_style = {'alpha': 0.3, 'color': 'y'}
for degree in (1, 2):
    sns.lmplot(x='Age', y='Wage', data=fifadata, order=degree, scatter_kws=point_style)

Well a slight correlation is observed in order 2, the wages increase as a player grows and then after a peak age, it starts decreasing.

In [None]:
# Wage against Release Clause with a linear regression fit.
point_style = {'alpha': 0.3, 'color': 'y'}
sns.lmplot(x='Release Clause', y='Wage', data=fifadata, scatter_kws=point_style)

We can clearly see that there is a heavy correlation as with increase in Release Clause of a player, their Wage also seems to increase in most cases.

**Wage vs Height and Wage vs Weight**

In [None]:
# Wage against Weight with a fitted regression of order 1 (linear) and 2.
point_style = {'alpha': 0.3, 'color': 'y'}
for degree in (1, 2):
    sns.lmplot(x='Weight', y='Wage', data=fifadata, order=degree, scatter_kws=point_style)

From the above graphs we can clearly find no correlation, hence we can confirm that Weight does not add any value to the player Wage

In [None]:
# Wage against Height with a fitted regression of order 1 (linear) and 2.
point_style = {'alpha': 0.3, 'color': 'y'}
for degree in (1, 2):
    sns.lmplot(x='Height', y='Wage', data=fifadata, order=degree, scatter_kws=point_style)

Similar to Weight we can find a similar trend, the plot remains same in multiple orders too, hence we can conclude any absence of relation between Wage and Height

In [None]:
# Wage against Skill Moves with a fitted regression of order 1 (linear) and 2.
point_style = {'alpha': 0.3, 'color': 'y'}
for degree in (1, 2):
    sns.lmplot(x='Skill Moves', y='Wage', data=fifadata, order=degree, scatter_kws=point_style)

Well our hypothesis has been refuted over here, as we can see that with an increase in Skill Moves, there is a rise in Wage too. So this might be slightly correlated. 

In [None]:
# Columns included in the wage correlation matrix below.
num_cols = ['Age', 'Overall', 'Potential', 'Value', 'Wage', 'Special','International Reputation', 'Release Clause']

In [None]:
# Pairwise correlation of the selected columns, drawn as an annotated heatmap.
corr = fifadata[num_cols].corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, vmin=-1, cmap="coolwarm", ax=ax)

As per our assumptions, Release Clause and Value are correlated very well with Wage

### 5. What is the age distribution in different clubs? Which club has most players young?

Given this huge amount of data, plotting it would be a really hectic task. 

- For finding the age distribution in the clubs, we can first group the players according to clubs and analyse them.
- Once we have distribution data, things become simple as we can directly find the _mode_ age of the club and the one with the *least mode*, has the most youngest club members

#### Preprocessing

From the preprocessing done on the Age column in Q.2.2 it looks like the Age data has no missing or redundant values; hence, considering preprocessing for Age done, we shall move ahead.

#### Analysis

In [None]:
# Age column grouped by club; summary statistics of player age per club.
club_age = fifadata.groupby("Club")['Age']
club_age.describe()

In [None]:
# Summary statistics of the per-club median age, taken across all clubs.
club_age.median().describe()

#### Conclusion 1

From the above data we can infer most of the details. We have considered the median age of the players of a club as the age of the club.

A few important details we observe are:
- Mean age of a player in a club is 25.14 years
- The median youngest player in a club is approximately around 20 years
- The oldest player on the club is approximately around 32 years

In [None]:
# Clubs ordered by median player age, youngest first; show the five youngest.
sorted_age = club_age.median().sort_values()
sorted_age.head()

The data above shows the teams with the least club age

In [None]:
# The five clubs with the highest median player age.
sorted_age.tail()

The data above shows the teams with the most club age

In [None]:
# Players aged `youth` or younger count as young.
youth = 23
# Boolean flag per player, computed on the whole column at once.
is_young = fifadata['Age'] <= youth
# Number of young players per club, largest first.
fifadata.assign(Count=is_young.astype(int)).groupby('Club')['Count'].sum().sort_values(ascending=False)

#### Conclusion 2

I have made the assumption that all players aged 23 or under can be considered young. From this we can clearly see that the club _**FC Nordsjælland**_ has the largest number of young players in its team.

# Part - 3
## UK Road Accidents Data


The UK government amassed traffic data between 2000 and 2016, recording over 1.6 million accidents in the process and making this one of the most comprehensive traffic data sets out there. It's a huge picture of a country undergoing change.

### 1. The very first step should be to merge all the 3 subsets of the data.

 To begin with let's analyse the columns and details in the 3 data sets.

In [None]:
# Columns, dtypes and non-null counts of the first accidents file.
accidata1.info()

In [None]:
# Columns, dtypes and non-null counts of the second accidents file.
accidata2.info()

In [None]:
# Columns, dtypes and non-null counts of the third accidents file.
accidata3.info()

Looks like all the columns above have the same attributes regarding which the data has been collected. So we can directly go on to merge.

In [None]:
# Stack the three subsets into one DataFrame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every subset keeps its own row
# numbers and the merged frame ends up with duplicate index labels.
frames = [accidata1, accidata2, accidata3]
accidata = pd.concat(frames, ignore_index=True)
accidata.info()

### 2. What are the number of casualties in each day of the week? Sort them in descending order. 

#### Preprocessing

Let us check for the missing values in the columns that we are going to analyse

In [None]:
# Missing values in the two columns used for this question.
accidata[['Number_of_Casualties', 'Day_of_Week']].isnull().sum()

Looks like there are no missing values. Let us now check for outliers in the given data.

In [None]:
# Box plot of casualties per accident. The Series is passed as `x` explicitly
# (as in the Age box plot earlier) because the meaning of the first positional
# argument of sns.boxplot differs between seaborn versions.
sns.boxplot(x=accidata['Number_of_Casualties'])

From the above boxplot we can see that many outliers do exist, but then they are well within a valid range, and there are no negative values, so this doesn't look like an issue.

In [None]:
# Accidents grouped by day of the week; the grouping is reused by later cells.
day_grp = accidata.groupby('Day_of_Week')
# Summary statistics of casualties per accident for each day.
day_grp['Number_of_Casualties'].describe()

In [None]:
# Total number of casualties per day of the week. The casualties are summed per
# day first; counting rows (len) would give the number of accident records
# instead, since one record can involve several casualties.
casualties_per_day = accidata.groupby('Day_of_Week')['Number_of_Casualties'].sum()

fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(x=casualties_per_day.index, y=casualties_per_day.values, ax=ax)
ax.set_xlabel('Day_of_Week')
ax.set_ylabel('Number_of_Casualties')
plt.show()

From the above graph it is evident that Day 1 has the least number of casualties.

- Day 2 to Day 5 have almost a flat line plot.
- Day 6 sees a spike in number of cases and the cases drop down on Day 7

I believe that Day 2 to Day 6 are weekdays, where Day 2 is Monday and Day 6 is Friday. As they are weekdays, people might commute to work or get other work done, and hence travel a lot on these days, which is also linked to the number of casualties. Day 1 might be Sunday and Day 7 might be Saturday.

Since Day 1 and Day 7 are weekend days, people tend to stay at home more and do not travel much, and that might lead to fewer casualties.

In [None]:
# Total casualties per weekday, sorted in descending order.
# 'sum' is the number of casualties; 'count' is the number of accident
# records and is kept alongside 'min'/'max' for comparison.
(
    day_grp['Number_of_Casualties']
    .agg(['sum', 'count', 'min', 'max'])
    .sort_values(by='sum', ascending=False)
)

### 3. On each day of the week, what is the maximum and minimum speed limit on the roads the accidents happened?

In [None]:
# Number of missing speed-limit entries.
speed_limit = accidata['Speed_limit']
speed_limit.isna().sum()

Seeing no missing values we shall go ahead by answering the question

In [None]:
# Per-weekday speed-limit summary, narrowed to the statistics of interest.
speed_summary = day_grp['Speed_limit'].describe()
speed_summary[['min', 'max', 'mean']]

### 4. What is the importance of Light and Weather conditions in predicting accident severity? What does your intuition say and what does the data portray?

In [None]:
# List the distinct lighting categories present in the data.
accidata['Light_Conditions'].unique()

In [None]:
# List the distinct weather categories present in the data.
accidata['Weather_Conditions'].unique()

#### Intuitions 

Intuitively, darkness with no street lighting should increase the severity of accidents, as people might not be able to see the road, obstacles or vehicles ahead. The severity should reduce as the lighting gets better.

I believe snow or rain with high winds, and fog or mist, will lead to more accidents than other conditions, as the driver's view might get obstructed, which in turn leads to more severe accidents.

#### Preprocessing

In [None]:
# Report how many entries are missing in each of the two condition columns.
for condition_col in ('Light_Conditions', 'Weather_Conditions'):
    missing = accidata[condition_col].isnull().sum()
    print(f'Number of missing values in {condition_col} column is, ', missing)

In [None]:
# Fill the missing values in Weather_Conditions with 'Unknown'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is not guaranteed to update the parent frame under
# pandas' Copy-on-Write behaviour.
accidata['Weather_Conditions'] = accidata['Weather_Conditions'].fillna('Unknown')

In [None]:
# Number of accidents per weather condition, split by severity level.
plt.figure(figsize=(20, 6))
sns.countplot(
    x='Weather_Conditions',
    hue='Accident_Severity',
    data=accidata,
    palette="rocket_r",
)

Contrary to what we had assumed most of the severe accidents seemed to happen when the weather was fine without high winds. This might show that most of them might be happening due to various other reasons. One of the major reasons could be that it does not rain always and most of the times the climate is perfect.

The above claim has been made by looking at the [following data](https://weatherspark.com/y/45062/Average-Weather-in-London-United-Kingdom-Year-Round#Sections-Summary). I have seen that most of the time in UK the climate has fairly good conditions, hence due to the presence of large timeframe, more accidents might be happening during that period.

But leaving that behind we also observe that Raining without winds has the second highest accident probability, even though it is way lower. From the above plot we can also see that Snowing with high winds has the least amount of accidents, including their severity. This might be because due to the presence of snow on roads people might tend to drive slowly and hence this might have led to lesser accidents and severity.

In [None]:
# Number of accidents per lighting condition, split by severity level.
plt.figure(figsize=(18, 6))
sns.countplot(
    x='Light_Conditions',
    hue='Accident_Severity',
    data=accidata,
    palette="rocket_r",
)

Just like the previous plot, our assumptions have proved to be wrong: most of the accidents tend to happen in proper lighting conditions, be it daylight or darkness with street lights present. One thing that did match our expectation is that on streets with no lighting, the severity is really high compared to the others.

Another reason is the presence of a proper lighting system in the UK, where most of the motorways are lit. This information has been taken from the [following page](https://www.whatdotheyknow.com/request/number_of_street_lights). Hence this gives us an idea of why the distribution is high in proper lighting conditions.

### 5. To predict the severity of the accidents which columns do you think are unnecessary and should be dropped before implementing a regression model. Support your statement using relevant plots and hypotheses derived from them.

Some of the most useless columns that can be dropped directly are:

- Accident_Index: It is just indexing and has no requirement with the severity
- Location_Easting_OSGR: [Can be retrieved from latitude and longitude](https://www.latlong.net/lat-long-utm.html)
- Location_Northing_OSGR: [Can be retrieved from latitude and longitude](https://www.latlong.net/lat-long-utm.html)
- Junction_Detail: 1504150 null values which is almost empty, hence doesn't add much to the model.
- Junction_Control: 602835 null values, approximately 1/3rd rows are null, hence doesn't add much to the model.
- LSOA_of_Accident_Location: 108238 null values, hence we can drop this too.
- Year: Because it is already present in date
- Weather_Conditions: As observed in the previous question
- Light_Conditions: As observed in the previous question

In [None]:
# Columns judged unnecessary for predicting severity (see the list above).
redundant_cols = [
    'Accident_Index',
    'Location_Easting_OSGR',
    'Location_Northing_OSGR',
    'Junction_Detail',
    'Junction_Control',
    'LSOA_of_Accident_Location',
    'Year',
]
accidata.drop(columns=redundant_cols, inplace=True)

In [None]:
# Re-inspect the frame (columns, non-null counts, dtypes) after the drop.
accidata.info()

In [None]:
# Rows without coordinates are removed instead of imputed: filling them with
# the mode would move the accident to a completely different location.
accidata.dropna(subset=['Latitude', 'Longitude'], inplace=True)

# The remaining gaps in these columns are filled with each column's mode.
acci_cols = [
    'Weather_Conditions',
    'Light_Conditions',
    'Road_Surface_Conditions',
    'Did_Police_Officer_Attend_Scene_of_Accident',
    'Time',
    'Pedestrian_Crossing-Human_Control',
    'Pedestrian_Crossing-Physical_Facilities',
    'Special_Conditions_at_Site',
    'Carriageway_Hazards',
]
column_modes = accidata[acci_cols].mode().iloc[0]
accidata[acci_cols] = accidata[acci_cols].fillna(column_modes)

# Confirm what is still missing after imputation.
accidata.isnull().sum()

In [None]:
# Plot every accident at its longitude/latitude, coloured by severity,
# which gives a rough map of where accidents of each severity occur.
plt.figure(figsize=(5, 10))
sns.scatterplot(
    data=accidata,
    x='Longitude',
    y='Latitude',
    hue='Accident_Severity',
)

In [None]:
# Point plot of accident severity for each Police_Force value.
sns.catplot(
    data=accidata,
    x='Police_Force',
    y='Accident_Severity',
    kind="point",
    height=5,
    aspect=2,
)

Due to the presence of large force, the average severity decreased. Hence this might be helpful in correlation

In [None]:
# Distribution of accident severity for each number of vehicles involved.
plt.figure(figsize=(20, 5))
sns.violinplot(
    data=accidata,
    x='Number_of_Vehicles',
    y='Accident_Severity',
)

As the number of vehicles increases, the severity of the accident tends to decrease.

In [None]:
# Convert the Date column from strings to datetime64.
# NOTE(review): assumes the raw strings are day-first (DD/MM/YYYY), as is
# usual for UK data - confirm against the CSV. Without dayfirst=True an
# ambiguous value such as 04/01/2005 would be read as April 1st.
accidata['Date'] = pd.to_datetime(accidata['Date'], dayfirst=True)

In [None]:
# Number of distinct values per column, used to decide how to encode each one.
accidata.nunique()

There are lots of unique values thrown at us. In this situation all that we can do is label encode any column that has more than 5 unique values, and one-hot encode the columns that have fewer than 5.

In [None]:
# Encode police attendance as 1 when the answer contains 'Yes', otherwise 0.
# na=False makes a missing answer count as 0; without it the NaN returned by
# str.contains() is truthy inside np.where and would be encoded as 1.
attended = accidata['Did_Police_Officer_Attend_Scene_of_Accident'].str.contains('Yes', na=False)
accidata['Did_Police_Officer_Attend_Scene_of_Accident'] = np.where(attended, 1, 0)

In [None]:
# Check the column dtypes after converting the police-attendance column to 0/1.
accidata.info()

In [None]:
# Categorical columns that are replaced by integer codes.
le_acc_cols = [
    'Weather_Conditions',
    'Light_Conditions',
    'Pedestrian_Crossing-Human_Control',
    'Road_Surface_Conditions',
    'Local_Authority_(Highway)',
    'Road_Type',
    'Pedestrian_Crossing-Physical_Facilities',
    'Special_Conditions_at_Site',
    'Carriageway_Hazards',
]

# A fresh LabelEncoder is fitted for every column, so this cell does not
# depend on a shared `le` instance created somewhere else in the notebook.
# The fitted encoders are not kept, so the codes cannot be mapped back to
# the original labels from here.
accidata[le_acc_cols] = accidata[le_acc_cols].apply(lambda col: LabelEncoder().fit_transform(col))
accidata[le_acc_cols].head()

In [None]:
# Work on an independent copy so that further encoding of accidata_enc does
# not also modify accidata (a plain assignment would only create an alias
# to the same frame).
accidata_enc = accidata.copy()
accidata_enc.dtypes