In [None]:
!pip install ppscore

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
import random
import math
import ppscore as pps


# Instructions
1. We will be conducting the entire assignment through this notebook. You will be entering your code in the cells provided, and any explanation and details asked in markdown cells. 
2. You are free to add more code and markdown cells for describing your answer, but make sure they are below the question asked and not somewhere else. 
3. The notebook needs to be submitted on LMS. You can find the submission link [here](https://lms.iiitb.ac.in/moodle/mod/assign/view.php?id=13932). 
4. The deadline for submission is **5th October, 2020 11:59PM**.

# Data import
The data required for this assignment can be downloaded from the following [link](https://www.kaggle.com/dataset/e7cff1a2c6e29e18684fe6b077d3e4c42f9a7ae6199e01463378c60fe4b4c0cc), it's hosted on kaggle. Do check directory paths on your local system.  

File absolute paths

/kaggle/input/iiitb-ai511ml2020-assignment-1/Assignment/ML Assignment 1.pdf

/kaggle/input/iiitb-ai511ml2020-assignment-1/Assignment/alcoholism/student-mat.csv

/kaggle/input/iiitb-ai511ml2020-assignment-1/Assignment/accidents/accidents_2005_to_2007.csv

/kaggle/input/iiitb-ai511ml2020-assignment-1/Assignment/accidents/accidents_2012_to_2014.csv

/kaggle/input/iiitb-ai511ml2020-assignment-1/Assignment/accidents/accidents_2009_to_2011.csv

/kaggle/input/iiitb-ai511ml2020-assignment-1/Assignment/fifa18/data.csv

In [None]:
def import_csv(filename, base_dir="/kaggle/input/iiitb-ai511ml2020-assignment-1/Assignment"):
    """Load one of the assignment CSV files into a DataFrame and report its shape.

    Parameters
    ----------
    filename : str
        Path of the CSV relative to ``base_dir``. It is appended to
        ``base_dir`` verbatim, so it must start with "/"
        (e.g. "/fifa18/data.csv").
    base_dir : str, optional
        Directory holding the assignment data. Defaults to the Kaggle input
        location; pass another directory to run the notebook elsewhere.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    df = pd.read_csv(base_dir + filename)
    print(filename + ' loaded...')
    print(filename + ' shape: ',df.shape)
    return df

#pass df.T for better view
def display_all(df):
    """Render ``df`` with temporarily enlarged pandas display limits.

    Up to 1000 rows and 20 columns are shown for this one call; the previous
    display options are restored when the call returns. Wide frames read
    better when passed transposed (``df.T``).
    """
    temporary_options = (
        "display.max_rows", 1000,
        "display.max_columns", 20,
    )
    with pd.option_context(*temporary_options):
        display(df)

In [None]:
# The student survey and the FIFA player table come from one file each.
alcdata = import_csv("/alcoholism/student-mat.csv")
fifadata = import_csv("/fifa18/data.csv")

# The accident records are split over three files; load them in file order.
accidata1, accidata2, accidata3 = [
    import_csv(accident_file)
    for accident_file in (
        "/accidents/accidents_2005_to_2007.csv",
        "/accidents/accidents_2009_to_2011.csv",
        "/accidents/accidents_2012_to_2014.csv",
    )
]

# Part - 1
## Alcohol Consumption Data
The following data was obtained in a survey of students' math course in secondary school. It contains a lot of interesting social, gender and study information about students. 


In [None]:
#common space for alcohol data

#G1 and G2 are weighted as 0.25 as they are period marks. G3 is given 0.5 weight as it represents final marks.
def addGrades(df):
    """Add a weighted ``Grades`` column to ``df`` (in place) and return ``df``.

    The period marks G1 and G2 are weighted 0.25 each and the final mark G3
    is weighted 0.5.
    """
    period_marks = 0.25 * df['G1'] + 0.25 * df['G2']
    final_mark = 0.5 * df['G3']
    df['Grades'] = period_marks + final_mark
    return df


### 1. Try to visualize correlations between various features and grades and see which features have a significant impact on grades. 
Try to engineer the three grade parameters (G1, G2 and G3) as one feature for such comparisons.



In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer.
alcdata = addGrades(alcdata)

# Pearson correlation is only defined for numeric columns. Selecting them
# explicitly keeps this working on pandas versions where DataFrame.corr no
# longer drops text columns silently.
alc_corr_data = alcdata.select_dtypes(include='number').corr()['Grades']

# rc tick settings only affect ticks created afterwards, and plt.xticks()
# already creates the axes, so the sizes are configured first.
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
plt.figure(figsize=(30,10))
plt.xticks(rotation=90)
sns.lineplot(data=alc_corr_data)
plt.show()

There is not much correlation of features with G1, G2, G3 combination.

Let's look at the ppscore of the features as well, to reveal any non-linear relationships and further patterns.


In [None]:
# Predictive Power Score table with 'Grades' as the target.
# NOTE(review): alcdata still contains G1, G2 and G3, the columns 'Grades' is
# computed from — confirm they are meant to be scored as predictors.
predictors_df = pps.predictors(alcdata, y='Grades')

In [None]:
# rc tick settings only apply to ticks created after the call, and
# plt.xticks() already creates the axes, so configure the sizes first.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.figure(figsize=(35,8))
plt.xticks(rotation=90)
sns.barplot(data=predictors_df, x="x", y="ppscore")

Not much action here as well. We will see in below cells how grades are affected due to famrel, Pstatus etc in more detail.

### 2. If there is a need for encoding some of the features,  how would you go  about it? 
Would you consider combining certain encodings together ?


In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 

# Correlation matrix of the integer/float columns, annotated cell by cell.
numeric_alcdata = alcdata.select_dtypes(include=['int64','float64'])
plt.figure(figsize=(20,20))
for tick_axis in ('xtick', 'ytick'):
    plt.rc(tick_axis, labelsize=10)
sns.heatmap(numeric_alcdata.corr(), cmap="YlGnBu", cbar_kws={"aspect": 40}, annot=True)


Can't seem to figure out which features to combine. ALL features look like they are not related to each other and looking for ways to combine them would be more time consuming than working on something else that might help. 

Also combining different features will depend a lot on what we are looking to achieve from it, what we are going to predict, what relationship we want to explore or what business need we want to solve for our product.

We can combine the G1, G2, G3 as (0.25G1 + 0.25G2 + 0.5G3) for weighted average.


### 3. Try to find out how family relation(famrel) and parents cohabitation(Pstatus) affect grades of students. 


**(a) FAMREL VS GRADES**

We will convert famrel and grades to same scale

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Tick sizes must be configured before plt.xticks() creates the axes/ticks.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.figure(figsize=(25,7))
plt.xticks(rotation=0)
alc_grades_fam = alcdata[['Grades', 'famrel']].copy()
#convert grades to same range as family relation
# Min-max rescale onto [1, 5] using the observed spread. Dividing by a fixed
# 20 only reaches 5 when the grades happen to span exactly 20 points.
grade_min = alc_grades_fam['Grades'].min()
# `or 1` avoids a division by zero if every grade is identical.
grade_span = (alc_grades_fam['Grades'].max() - grade_min) or 1
alc_grades_fam['Grades'] = (alc_grades_fam['Grades'] - grade_min) * 4 / grade_span + 1
sns.lineplot(data=alc_grades_fam)
#could have used scatter plot here but it would make no sense as it would be too messy. Still is

It's all over the place: plotted row by row, no pattern is visible.

In [None]:
#randomly see variation with 1/8th sample size
random.seed(10)
# Tick sizes must be configured before plt.xticks() creates the axes/ticks.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.figure(figsize=(25,7))
plt.xticks(rotation=0)
# pandas does not draw from the stdlib `random` generator seeded above, so the
# sample is pinned with random_state to make the plot reproducible.
sns.scatterplot(data=alc_grades_fam.sample(frac=1/8, random_state=10))

Let's see mean and confidence of grades with each family relation

In [None]:
#Mean and confidence of grades with each family relation
# rc tick settings only apply to ticks created after the call, and
# plt.xticks() already creates the axes, so configure the sizes first.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.figure(figsize=(15,4))
plt.xticks(rotation=0)
sns.lineplot(data=alcdata, x= 'famrel', y = 'Grades')
#Passing the entire dataset in long-form mode will aggregate over repeated values (each famrel) to show the mean and 95% confidence interval

General trends show grades don't vary too much across famrel

**(b) PSTATUS VS GRADES**

In [None]:
# One-hot encode the text columns so Pstatus becomes Pstatus_A / Pstatus_T.
alcdata_dummy = pd.get_dummies(alcdata)
# rc tick settings only apply to ticks created after the call, and
# plt.xticks() already creates the axes, so configure the sizes first.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.figure(figsize=(15,4))
plt.xticks(rotation=0)
sns.lineplot(data=alcdata_dummy, x= 'Pstatus_T', y = 'Grades')

In [None]:
# rc tick settings only apply to ticks created after the call, and
# plt.xticks() already creates the axes, so configure the sizes first.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.figure(figsize=(15,4))
plt.xticks(rotation=0)
sns.lineplot(data=alcdata_dummy, x= 'Pstatus_A', y = 'Grades')

The general trend shows that grades tend to be very slightly higher when the parents live apart.


### 4. Figure out which features in the data are skewed, and propose a way to remove skew from all such columns. 

We will remove columns which are categorical in nature at their core and try measuring skew on remaining columns.

We will use pairplot to visualize the skew and also skew values.

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Columns the notebook treats as categorical at their core; they are left out
# of the skew measurement.
dropCol = ['famrel', 'failures', 'Medu', 'Fedu', 'traveltime', 'studytime', 'freetime', 'goout', 'Dalc', 'Walc', 'health']
num_alcdata = (
    alcdata
    .select_dtypes(include=['int64','float64'])
    .drop(columns=dropCol)
)

print("Skew of continuous columns:\n\n", num_alcdata.skew())


sns.pairplot(num_alcdata)


In [None]:
# Distinct values observed in the 'absences' column.
alcdata['absences'].unique()

absences column has skew which needs to be corrected. This method can be used for other features as well as the need is.

Right skew can be removed using log, square root, cuberoot, boxcox etc.
Left skew can be removed using power function

In [None]:
# log2(x + 1) keeps zero absences at 0 while compressing the large values.
num_alcdata['absences'] = np.log2(alcdata['absences']+1)
# rc tick settings only apply to ticks created after the call, and
# plt.xticks() already creates the axes, so configure the sizes first.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.figure(figsize=(15,4))
plt.xticks(rotation=0)
sns.distplot(num_alcdata['absences'], kde_kws={'bw':0.1})
# Skew after the transform, for comparison with the value printed earlier.
print(num_alcdata['absences'].skew())

# Part - 2
## FIFA 2019  Data


We will use clean_currency function to clean features which have M, K etc in them

In [None]:
#fifadata common space
def clean_currency(df, feature):
    """Convert a currency column such as '€110.5M' / '€565K' to plain floats.

    The column ``df[feature]`` is overwritten in place and ``df`` is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean.
    feature : str
        Name of the column with values like '€0', '€565K' or '€110.5M'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``df[feature]`` as floats. Missing values
        (NaN / None / empty text) become NaN; values that are already numeric
        are passed through as floats.
    """
    multipliers = {'K': 1000, 'M': 1000000}

    def to_number(raw):
        # Missing or already-numeric entries have no text to strip; calling
        # str methods on them would raise.
        if raw is None:
            return float('nan')
        if not isinstance(raw, str):
            return float(raw)
        text = raw.replace('€', '').strip()
        if not text:
            return float('nan')
        suffix = text[-1].upper()
        if suffix in multipliers:
            return float(text[:-1]) * multipliers[suffix]
        return float(text)

    df[feature] = [to_number(value) for value in df[feature]]
    return df

# Transposed so that each feature becomes a row (see the hint above display_all).
display_all(fifadata.T)

### 1. Which clubs are the most economical? How did you decide that?

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Format check done once by hand: every Wage string starts with '€' and ends
# with '0' or 'K', which is what clean_currency strips / expands.
df = fifadata.copy()
df = clean_currency(df, 'Wage')
df = clean_currency(df, 'Value')

plt.figure(figsize=(4,4))
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
sns.heatmap(df[['Wage','Value']].corr(), cmap="YlGnBu",cbar_kws={"aspect": 40}, annot=True)


Wage and Value look correlated

Economical means giving good value or return in relation to the money.

Let's calculate, for each club, the mean Value spent per potential point and the mean Wage spent per potential point.

The club with least mean should indicate the most economical one.

In [None]:

df['ValuePerPotential'] = df['Value'] / df['Potential']
df['WagePerPotential'] = df['Wage'] / df['Potential']

# Average only the two ratio columns. Averaging the whole frame would also
# touch the text columns, which current pandas refuses to do, and it computes
# many means that are never used.
club_groups = df.groupby('Club')
print("Most economical club in terms of Value: ", club_groups['ValuePerPotential'].mean().idxmin())
print("Most economical club in terms of Wage: ", club_groups['WagePerPotential'].mean().idxmin())


Bray Wanderers is most economical having least value per potential mean = 1353.438

Shakhtar Donetsk is most economical having least wage per potential mean = 12.888

### 2. What is the relationship between age and individual potential of the player? How does age influence the players' value? At what age does the player exhibit peak pace ?

**(a) Age v/s Potential**

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# rc tick settings only apply to ticks created after the call, and
# plt.xticks() already creates the axes, so configure the sizes first.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.figure(figsize=(15,4))
plt.xticks(rotation=0)
sns.boxplot(data=df, x= 'Age')

Age has some outliers, so keep these in mind while viewing the relationship between Age and Potential.

In [None]:
# rc tick settings only apply to ticks created after the call, so they are
# set once up front; both figures below pick them up.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)

# Distribution of Potential for every age.
plt.figure(figsize=(15,4))
plt.xticks(rotation=0)
sns.boxplot(data=df, x= 'Age', y = 'Potential')

# Mean Potential per age with its confidence band.
plt.figure(figsize=(15,4))
plt.xticks(rotation=0)
sns.lineplot(data=df, x= 'Age', y = 'Potential')

Not much variation is observed in potential due to age.

Also above two images show the relation between boxplot and lineplot beautifully.

**(b) Age v/s Value**

In [None]:
# rc tick settings only apply to ticks created after the call, and
# plt.xticks() already creates the axes, so configure the sizes first.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.figure(figsize=(15,4))
plt.xticks(rotation=0)
sns.lineplot(data=df, x= 'Age', y = 'Value')

Players in their prime (24-31) get good Valuation 


**At what age does the player exhibit peak pace ?**

In [None]:
# rc tick settings only apply to ticks created after the call, and
# plt.xticks() already creates the axes, so configure the sizes first.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.figure(figsize=(15,4))
plt.xticks(rotation=0)
# SprintSpeed is the column used here as the measure of pace.
sns.lineplot(data=df, x= 'Age', y = 'SprintSpeed')

Around 25 years of age peak pace is observed in most players

### 3. What skill sets are helpful in deciding a player's potential? How do the traits contribute to the players' potential? 

In [None]:
# Predictive Power Score table with 'Potential' as the target, computed on the
# FIFA frame whose Wage and Value columns were converted to numbers above.
predictors_df = pps.predictors(df, y='Potential')

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
print(predictors_df)
# rc tick settings only apply to ticks created after the call, and
# plt.xticks() already creates the axes, so configure the sizes first.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.figure(figsize=(35,8))
plt.xticks(rotation=90)
sns.barplot(data=predictors_df, x="x", y="ppscore")

Top skills which help in deciding player Potential are **ballcontrol** and **reactions**

The PPS is an asymmetric score that can detect linear or non-linear relationships between two columns irrespective of their type

In [None]:
# NOTE(review): rows 8 and 9 are selected by position, so the result depends on
# the row order pps.predictors returns — presumably these are the two skills
# named in the text above; confirm, or select by the "x" column instead.
print(predictors_df.iloc[8])
print("\n\n",predictors_df.iloc[9])

### 4. Which features directly contribute to the wages of the players?

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Predictive Power Score table with 'Wage' as the target; this overwrites the
# predictors_df computed for 'Potential'.
predictors_df = pps.predictors(df, y='Wage')

In [None]:
print(predictors_df)
# rc tick settings only apply to ticks created after the call, and
# plt.xticks() already creates the axes, so configure the sizes first.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.figure(figsize=(35,8))
plt.xticks(rotation=90)
sns.barplot(data=predictors_df, x="x", y="ppscore")

Among the given features, Overall and Value seems to contribute the most to Wage of the players

In [None]:
# NOTE(review): row 1 is selected by position and therefore depends on the row
# order pps.predictors returns — confirm it is the intended predictor.
predictors_df.iloc[1]

### 5. What is the age distribution in different clubs? Which club has most players young?

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer.
#only plotting distribution for 10 clubs
# rc tick settings only apply to ticks created after the call, and
# plt.xticks() already creates the axes, so configure the sizes first.
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
plt.figure(figsize=(20,8))
plt.xticks(rotation=0)
# Number of clubs (in groupby order) whose age distribution is drawn.
count = 10
for shown, (club, club_players) in enumerate(df[['Club', 'Age']].groupby(by='Club')):
    if shown == count:
        break
    sns.distplot(club_players['Age'], hist=False, label=club)
# Draw the legend explicitly so each curve can be matched to its club
# regardless of whether seaborn adds one automatically.
plt.legend()

In [None]:
#Summary of age distribution for all clubs
#display_all(df[['Club', 'Age']].groupby(by='Club').describe())

In [None]:
#Considering young means less than or equal to 22 years
# Count the players aged 22 or younger per club once, then show the largest
# count and the club it belongs to.
young_counts = df.loc[df['Age'] <= 22, ['Age','Club']].groupby(by='Club').count()
display_all(young_counts.max())
display_all(young_counts.idxmax())


Ajax has maximum young players (<=22 years of age) = 21 players

# Part - 3
## UK Road Accidents Data


The UK government amassed traffic data from 2000 and 2016, recording over 1.6 million accidents in the process and making this one of the most comprehensive traffic data sets out there. It's a huge picture of a country undergoing change.

In [None]:
#common space for accident data
# At this point `df` still holds the FIFA table from Part 2, so inspect an
# accident file directly.
# NOTE(review): the three accident files presumably share one schema — confirm.
accidata1.dtypes


### 1. The very first step should be to merge all the 3 subsets of the data.

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Each file is read with its own 0..n-1 row labels; ignore_index gives the
# merged frame a single unique, continuous index instead of repeated labels.
accidata = pd.concat([accidata1, accidata2, accidata3], ignore_index=True)
df = accidata.copy()
df.shape

### 2. What are the number of casualties in each day of the week? Sort them in descending order. 

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# rc tick settings only apply to ticks created after the call, and
# plt.xticks() already creates the axes, so configure the sizes first.
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.figure(figsize=(10,4))
plt.xticks(rotation=0)
# Total casualties per weekday, largest first; `order` keeps that ranking on
# the x axis instead of the numeric weekday order.
plotdata = (
    df[['Day_of_Week', 'Number_of_Casualties']]
    .groupby('Day_of_Week')
    .sum()
    .sort_values(by='Number_of_Casualties', ascending=False)
    .reset_index()
)
sns.barplot(x= 'Day_of_Week', y = 'Number_of_Casualties', data=plotdata, order=plotdata['Day_of_Week'])


Here only ordering and plotting is asked. We are not asked to find out which number corresponds to which day of the week like monday, tuesday etc.

But if it is needed, it can be easily calculated using date field of one of the row and matching it against the day_of_week value and assigning correct value of like monday, tuesday etc.

### 3. On each day of the week, what is the maximum and minimum speed limit on the roads the accidents happened?

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Speed limits grouped by weekday; reused for both extremes.
speed_by_day = df.groupby(by='Day_of_Week')['Speed_limit']
print('Min speed limits\n', speed_by_day.min().reset_index())
print("\n\nMax speed limits\n", speed_by_day.max().reset_index())

### 4. What is the importance of Light and Weather conditions in predicting accident severity? What does your intuition say and what does the data portray?

Let's do some data cleaning first

In [None]:
# Number of missing Junction_Detail entries.
missing_junction_detail = df['Junction_Detail'].isna().sum()
print("Junction_Detail null values: ", missing_junction_detail)
#print("\nDuplicate index \n", df.groupby('Accident_Index').count())
# Number of rows whose 2nd_Road_Class is -1.
second_road_minus_one = (df['2nd_Road_Class'] == -1).sum()
print("\n2nd_Road_Class with -1 values: ", second_road_minus_one)

Junction_Detail has all null values.

Accident_Index contains duplicated values that belong to different records, so it is better to drop the column.

In [None]:
plt.figure(figsize=(20,20))
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
# Restrict to numeric columns explicitly: the accident frame also holds text
# columns, which DataFrame.corr no longer drops silently on current pandas.
sns.heatmap(df.select_dtypes(include='number').corr(), cmap="YlGnBu",cbar_kws={"aspect": 40}, annot=True)

High correlation is observed in 

Location_Easting_OSGR : Longitude

Location_Northing_OSGR : Latitude

Police_Force : Local_Authority_(District)

We can keep one among these pairs. We will also create a new epoch feature using existing features.

In [None]:
df = accidata.copy()
#feature engineering to create epoch timestamp
# NOTE(review): UK source, so 'Date' is presumably written day-first
# (dd/mm/yyyy) — confirm against the raw files. Without dayfirst=True pandas
# reads ambiguous dates such as 04/01/2005 month-first.
date_part = pd.to_datetime(df['Date'], dayfirst=True).apply(str).str[:10]
# Only the clock part of the parsed time is kept; a missing time yields an
# empty string, which makes the combined timestamp fall back to midnight.
time_part = pd.to_datetime(df['Time']).apply(str).str[11:]
# Nanoseconds since the epoch -> whole seconds.
df['epoch'] = pd.to_datetime(date_part + ' ' + time_part).astype('int64')//1e9

to_drop = ['Junction_Detail', 'Accident_Index', 'Local_Authority_(District)',
           'Location_Easting_OSGR', 'Location_Northing_OSGR', '2nd_Road_Class', 'Time', 'Date', 'Year' ]
# Name the axis via `columns=`: passing it positionally is rejected by
# current pandas.
df = df.drop(columns=to_drop)

I am trying some more feature engineering

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer.
df_t = df.copy()
cols = ['Weather_Conditions', 'Light_Conditions', 'Number_of_Casualties']

#weather preferred
weather_p= ['Raining without high winds', 'Fine without high winds','Snowing without high winds',
                     'Fine with high winds' ]
#weather not preferred
weather_np = ['Raining with high winds', 'Fog or mist','Snowing with high winds']
#weather other
weather_o = ['Unknown', 'Other']

# NOTE(review): the label spellings (including 'Darkeness') are kept exactly
# as written — presumably they mirror the raw data; verify before "fixing".
light_p = ['Daylight: Street light present', 'Darkness: Street lights present and lit', ]
light_np = ['Darkness: Street lights present but unlit', 'Darkeness: No street lighting','Darkness: Street lighting unknown']

# Collapse the raw weather labels into 'p' / 'np' / 'o'; missing weather
# values are counted as 'o' as well.
for bucket_code, raw_labels in (('p', weather_p), ('np', weather_np), ('o', weather_o)):
    df_t.loc[df_t['Weather_Conditions'].isin(raw_labels), 'Weather_Conditions'] = bucket_code
df_t.loc[df_t['Weather_Conditions'].isna(), 'Weather_Conditions'] = 'o'

# Same for light conditions; labels in neither list are left untouched.
for bucket_code, raw_labels in (('p', light_p), ('np', light_np)):
    df_t.loc[df_t['Light_Conditions'].isin(raw_labels), 'Light_Conditions'] = bucket_code

encoded_conditions = pd.get_dummies(df_t[['Weather_Conditions', 'Light_Conditions', 'Accident_Severity']])
encoded_conditions.corr()['Accident_Severity']



The data does not show correlation of Accident_Severity with weather and light conditions.
I am aware correlation should not be calculated after encoding for categories.

Intuition tells us that it should affect.

But accident severity has only 3 unique value (practically makes it a category column), this might be the reason it is not showing any correlation.

### 5. To predict the severity of the accidents which columns do you think are unnecessary and should be dropped before implementing a regression model. Support your statement using relevant plots and hypotheses derived from them.

Already done above please check. 

There is not much relation being observed in order to derive conclusions for my intuitions.

### 6. Implement a basic Logistic Regression Model using scikit learn with cross validation = 5, where you predict the severity of the accident (Accident_Severity). Note that here your goal is not to tune appropriate hyperparameters, but to figure out what features will be best to use.

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Fixed random_state so the 1000-row sample (and therefore the plot) is the
# same on every run.
sns.pairplot(
            df.select_dtypes(include = ['int64', 'float64'])
             .dropna(subset = ['Longitude', 'Latitude'])
             .sample(1000, random_state=10)[['Longitude', 'Latitude','Urban_or_Rural_Area','Accident_Severity']]
            )


In [None]:
#Accident_Severity
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

# Feature columns passed to the model. The 'Weather_Conditions_*' entries are
# presumably the one-hot columns that pd.get_dummies creates in get_X_y —
# verify the names against X.columns.
cols_to_use = ['Number_of_Casualties', 'Number_of_Vehicles', 'Longitude','Weather_Conditions_Raining with high winds',
       'Weather_Conditions_Snowing with high winds']

def preprocess(df):
    """Return ``df`` without the rows whose Longitude or Latitude is missing.

    The rest of the preparation is expected to have happened in earlier cells.
    """
    coordinate_columns = ['Longitude', 'Latitude']
    return df.dropna(subset=coordinate_columns)

def get_X_y(df):
    """Build the feature matrix and one-vs-rest targets for the severity model.

    Parameters
    ----------
    df : pandas.DataFrame
        Accident records containing at least 'Accident_Severity',
        'Local_Authority_(Highway)' and 'LSOA_of_Accident_Location', plus the
        columns ``preprocess`` relies on.

    Returns
    -------
    X : pandas.DataFrame
        One-hot encoded features, without the target and without the columns
        listed in ``to_drop``.
    y : pandas.DataFrame
        One indicator column per severity value, named 'Severity_<value>'.
    """
    X = preprocess(df)

    #To use classification for one vs rest
    y = pd.get_dummies(X['Accident_Severity'].apply(str), prefix='Severity')

    to_drop = ['Local_Authority_(Highway)', 'Accident_Severity', 'LSOA_of_Accident_Location']
    # drop(columns=...) instead of a positional axis argument, which current
    # pandas rejects.
    X = pd.get_dummies(X.drop(columns=to_drop))

    return X, y
    
# Feature matrix and one-vs-rest severity targets built from the accident frame.
X, y = get_X_y(df)


In [None]:
# Scaling lives inside the pipeline so that every CV fold fits the scaler on
# its own training split only. Scaling the full matrix up front would let the
# validation rows influence the training features, and would repeat the same
# work for each severity level.
severity_model = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('clf', LogisticRegression()),
])
model_features = X[cols_to_use]

# One binary (one-vs-rest) problem per severity level 1..3, each scored with
# 5-fold cross-validated F1.
scores = []
for i in range(1,4):
    column_to_predict = 'Severity_' + str(i)
    scores.append(cross_validate(
        severity_model,
        model_features,
        y[column_to_predict],
        cv=5, scoring='f1',
        return_train_score =True))

In [None]:
# Mean cross-validated test F1 for each severity level, in level order.
for severity_level, fold_scores in enumerate(scores, start=1):
    mean_test_f1 = fold_scores["test_score"].mean()
    print('F1 score for Accident Severity ' + str(severity_level) + ': ', mean_test_f1)

Looking at the data it looks more likely that accident severity 1 is more severe than 3.

# Bonus Plot of various regions of accident severity

In [None]:
severity1 = df[ df['Accident_Severity'] == 1]
severity2 = df[ df['Accident_Severity'] == 2]
severity3 = df[ df['Accident_Severity'] == 3]

# rc tick settings only apply to axes created afterwards, so set them before
# the subplots exist.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)

fig , (a1,a2,a3) = plt.subplots(1, 3, figsize=(15,8))
# Deliberately not named `x` / `y`: `y` holds the model targets used by the
# cross-validation cell, and overwriting it breaks re-running that cell.
x_col = 'Longitude'
y_col = 'Latitude'
marker_size = .01
marker_alpha = .3

# One scatter panel per severity level, all drawn with identical styling.
for panel, subset, title in (
    (a1, severity1, "Accident_Severity_1"),
    (a2, severity2, "Accident_Severity_2"),
    (a3, severity3, "Accident_Severity_3"),
):
    subset.plot(kind='scatter', x=x_col, y=y_col, color='red',
                s=marker_size, alpha=marker_alpha, ax=panel)
    panel.set_title(title)
    panel.set_facecolor('white')

# plt.show() works with the inline notebook backend; Figure.show() expects a
# GUI backend and only emits a warning there.
plt.show()

#Reference : https://www.kaggle.com/yesterdog/

In [None]:
# Average number of casualties per accident, one line per severity level
# (1, 2, 3 in that order).
for severity_subset in (severity1, severity2, severity3):
    total_casualties = severity_subset['Number_of_Casualties'].sum()
    print(total_casualties / severity_subset.shape[0])