In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Instructions
1. We will be conducting the entire assignment through this notebook. You will be entering your code in the cells provided, and any explanation and details asked in markdown cells. 
2. You are free to add more code and markdown cells for describing your answer, but make sure they are below the question asked and not somewhere else. 
3. The notebook needs to be submitted on LMS. You can find the submission link [here](https://lms.iiitb.ac.in/moodle/mod/assign/view.php?id=13932). 
4. The deadline for submission is **5th October, 2020 11:59PM**.

# Data import
The data required for this assignment can be downloaded from the following [link](https://www.kaggle.com/dataset/e7cff1a2c6e29e18684fe6b077d3e4c42f9a7ae6199e01463378c60fe4b4c0cc), it's hosted on kaggle. Do check directory paths on your local system.  

In [None]:
# Read every assignment CSV in one pass; the paths below map, in order, onto the
# frame names on the left-hand side.
alcdata, fifadata, accidata1, accidata2, accidata3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/iiitb-ai511ml2020-assignment-1/Assignment/alcoholism/student-mat.csv",
        "../input/iiitb-ai511ml2020-assignment-1/Assignment/fifa18/data.csv",
        "../input/iiitb-ai511ml2020-assignment-1/Assignment/accidents/accidents_2005_to_2007.csv",
        "../input/iiitb-ai511ml2020-assignment-1/Assignment/accidents/accidents_2009_to_2011.csv",
        "../input/iiitb-ai511ml2020-assignment-1/Assignment/accidents/accidents_2012_to_2014.csv",
    )
)

# Part - 1
## Alcohol Consumption Data
The following data was obtained in a survey of students' math course in secondary school. It contains a lot of interesting social, gender and study information about students. 


### 1. Try to visualize correlations between various features and grades and see which features have a significant impact on grades. 
Try to engineer the three grade parameters (G1, G2 and G3) as one feature for such comparisons.



In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Show the student frame exactly as it was read from student-mat.csv.
alcdata

In [None]:
# g_avg: the three period grades folded into one feature, G1/3 + G2/3 + G3/3.
alcdata['g_avg'] = sum(alcdata[grade] / 3 for grade in ('G1', 'G2', 'G3'))

In [None]:
# Show the frame again to check the newly added g_avg column.
alcdata

In [None]:
# The individual grades are now summarised by g_avg, so remove them in place.
for grade_column in ('G1', 'G2', 'G3'):
    del alcdata[grade_column]

In [None]:
# Build the model inputs straight from `alcdata`. The original read `alcdata_new`,
# which is only created further down the notebook, so this cell failed on a
# fresh kernel (Restart & Run All).
grade_features = alcdata.drop(columns=['g_avg'])
# Tree models need numeric inputs: replace every non-numeric column by integer
# category codes (categories are sorted, matching what LabelEncoder produces).
for text_column in grade_features.select_dtypes(exclude='number').columns:
    grade_features[text_column] = grade_features[text_column].astype('category').cat.codes
x = grade_features ## independent features
y = alcdata['g_avg'] ## dependent feature

In [None]:
# Pairwise Pearson correlations of the numeric columns. Text columns are
# excluded explicitly because DataFrame.corr() raises on them in pandas >= 2.0.
alcdata.select_dtypes(include='number').corr()

In [None]:
# Only numeric columns are considered: text columns are dropped explicitly,
# since DataFrame.corr() raises on them in pandas >= 2.0.
corrmat = alcdata.select_dtypes(include='number').corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
# corrmat already is the correlation of exactly these columns, so it is plotted
# directly instead of being recomputed.
g = sns.heatmap(corrmat, annot = True, cmap = "RdYlGn")

**feature importance**

Feature importance assigns each feature a score: the higher the score, the more relevant that feature is to the output variable. Tree-based models expose it as a built-in attribute (`feature_importances_`); here we use an Extra Trees regressor to extract the top features of the dataset.

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# Extra Trees draws random splits; a fixed seed keeps the importances (and the
# ranking plotted from them) identical between runs.
model  = ExtraTreesRegressor(random_state=42)
model.fit(x,y)

In [None]:
# Raw importance scores of the fitted model, one per column of x.
model.feature_importances_

In [None]:
# Horizontal bar chart of the seven highest importance scores, labelled by feature.
feat_importances = pd.Series(data=model.feature_importances_, index=x.columns)
top_seven = feat_importances.nlargest(7)
top_seven.plot.barh()
plt.show()

### 2. If there is a need for encoding some of the features,  how would you go  about it? 
Would you consider combining certain encodings together ?


Ans: Non-numerical features need to be converted to numerical values.
    
    First convert all object-type columns to the categorical type, then encode each of them as integer labels with a label encoder (label encoding, not one-hot encoding).

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 

In [None]:
# Column dtypes and non-null counts; object columns are the ones to encode.
alcdata.info()

In [None]:
# Work on an independent copy so the encoding below leaves `alcdata` untouched.
alcdata_new = alcdata.copy(deep=True)

In [None]:
# Re-type every object column as pandas 'category'.
is_object = (alcdata_new.dtypes == 'object').values
for column_name in alcdata_new.columns[is_object]:
    alcdata_new[column_name] = alcdata_new[column_name].astype('category')

In [None]:
# Drop 'job_type' only if it exists: nothing in this notebook creates that
# column, and a bare `del` raises KeyError when it is absent.
alcdata_new = alcdata_new.drop(columns=['job_type'], errors='ignore')
alcdata_new.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical column by integer labels, one fresh encoder per column.
is_categorical = (alcdata_new.dtypes == 'category').values
for column_name in alcdata_new.columns[is_categorical]:
    le = LabelEncoder()
    encoded_values = le.fit_transform(alcdata_new[column_name])
    alcdata_new[column_name] = encoded_values

In [None]:
# Re-check dtypes after label encoding.
alcdata_new.info()


### 3. Try to find out how family relation(famrel) and parents cohabitation(Pstatus) affect grades of students. 


In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 

In [None]:
# Look at the two family-related columns (encoded) side by side.
alcdata_new.loc[:, ['famrel', 'Pstatus']]

In [None]:
# Family columns together with the combined grade, for plotting and correlation.
x_new = alcdata_new.loc[:, ['famrel', 'Pstatus', 'g_avg']]

In [None]:
# Pairwise scatter plots (distributions on the diagonal) of famrel, Pstatus and g_avg.
sns.pairplot(data=x_new)

In [None]:
# Pearson correlation between famrel, Pstatus and g_avg.
x_new.corr(method='pearson')


### 4. Figure out which features in the data are skewed, and propose a way to remove skew from all such columns. 

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 

In [None]:
# Pair plot of the student data; the diagonal shows each feature's distribution.
sns.pairplot(data=alcdata)

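## Many features are skewed, as the diagonal plots above show. Standard normalization (applied below) rescales each feature to zero mean and unit variance but does not change the shape of its distribution; actually removing skew needs a log or power transform on the skewed columns.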

In [None]:
# Separate copy of the student data for the scaling experiment.
alc2 = alcdata.copy(deep=True)

In [None]:
# Remove every object-typed column so only numeric features remain.
text_columns = alc2.columns[(alc2.dtypes == 'object').values]
for column_name in text_columns:
    del alc2[column_name]

In [None]:
# Cast int64 columns to float so they can later hold scaled values.
integer_columns = alc2.columns[(alc2.dtypes == 'int64').values]
for column_name in integer_columns:
    alc2[column_name] = alc2[column_name].astype(float)

In [None]:
# Confirm that the remaining columns are all float.
alc2.info()

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on alc2, then produce the standardized array in a second step.
scale = StandardScaler().fit(alc2)
alc = scale.transform(alc2)

In [None]:
# Put the standardized values back into a frame with the same labels.
# This replaces an element-by-element loop that (a) hard-coded 395 rows,
# (b) used chained indexing (alc2[col][i] = ...), which pandas warns about and
# which does not write through under copy-on-write, and (c) ran one Python
# iteration per cell. `alc` was produced from all of alc2, so the column order
# already matches.
alc2 = pd.DataFrame(alc, columns=alc2.columns, index=alc2.index)

In [None]:
# Inspect the standardized frame.
alc2

In [None]:
# Pair plot of the standardized features, for comparison with the raw one.
sns.pairplot(data=alc2)

In [None]:
# One histogram per standardized feature.
hist_options = dict(bins=50, figsize=(20, 15))
alc2.hist(**hist_options)
plt.show()

# Part - 2
## FIFA 2019  Data


### 1. Which clubs are the most economical? How did you decide that?

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 

In [None]:
# Show the FIFA player frame as loaded.
fifadata

In [None]:
# Remove the 'Unnamed: 0' column in place.
fifadata.drop(columns='Unnamed: 0', inplace=True)

In [None]:
# Top 10 most economical clubs, taken here as the clubs with the lowest mean
# player release clause.
# The original sorted the raw 'Release Clause' column, which (a) is compared as
# text and (b) ranks individual players rather than clubs.
# NOTE(review): assumes amounts are text such as "€1.2M" / "€365K" and that the
# club column is named 'Club' - confirm against data.csv.
release_text = fifadata['Release Clause'].astype(str)
unit_scale = release_text.str[-1].map({'K': 1e3, 'M': 1e6}).fillna(1.0)
release_eur = pd.to_numeric(release_text.str.strip('€KM'), errors='coerce') * unit_scale
(
    release_eur.groupby(fifadata['Club'])
    .mean()
    .dropna()
    .sort_values()
    .head(10)
    .rename('mean_release_clause_eur')
)

### 2. What is the relationship between age and individual potential of the player? How does age influence the players' value? At what age does the player exhibit peak pace ?

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 

In [None]:
# Age and Potential only, for the correlation and pair plot that follow.
x = fifadata.loc[:, ['Age', 'Potential']]

In [None]:
# Pearson correlation between Age and Potential.
x.corr(method='pearson')

In [None]:
# Author's reading of this plot: no strong correlation.
sns.pairplot(data=x)

### 3. What skill sets are helpful in deciding a player's potential? How do the traits contribute to the players' potential? 

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Column dtypes and non-null counts of the FIFA frame.
fifadata.info()

In [None]:
# New frame without the columns the author judged unnecessary for modelling;
# `fifadata` itself is left unchanged.
fif = fifadata.drop(
    columns=['ID', 'Name', 'Nationality', 'Photo', 'Flag', 'Club Logo', 'Loaned From']
)

In [None]:
# Keep only the rows that have no missing value in any column.
fif = fif.dropna(axis=0, how='any')

In [None]:
# Check the remaining rows and dtypes after dropping incomplete rows.
fif.info()

In [None]:
# Step 1: re-type object columns as 'category'.
object_columns = fif.columns[(fif.dtypes == 'object').values]
for column_name in object_columns:
    fif[column_name] = fif[column_name].astype('category')
from sklearn.preprocessing import LabelEncoder
# Step 2: swap each categorical column for integer labels, one encoder per column.
category_columns = fif.columns[(fif.dtypes == 'category').values]
for column_name in category_columns:
    le = LabelEncoder()
    encoded_values = le.fit_transform(fif[column_name])
    fif[column_name] = encoded_values

In [None]:
# Snapshot of the encoded frame, taken before 'Potential' is removed from `fif`.
fif2 = fif.copy(deep=True)

In [None]:
# Take 'Potential' out of the feature frame and keep it as the target.
x = fif.pop('Potential')

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# `fif` holds the features and `x` the Potential target. A fixed seed keeps the
# randomized trees, and therefore the ranking below, reproducible.
model  = ExtraTreesRegressor(random_state=42)
model.fit(fif,x)
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index = fif.columns)
feat_importances.nlargest(7).plot(kind = 'barh') ## top 7 features
plt.show()

### 4. Which features directly contribute to the wages of the players?

In [None]:
# Which features drive wages? The original fitted against `x`, which still held
# 'Potential' (the Wage lines were commented out), while 'Potential' was also
# among the features - so the plot only showed Potential predicting itself.
from sklearn.ensemble import ExtraTreesRegressor
# `fif2` holds label-encoded columns, whose integer codes carry no monetary
# order, so the wage is re-read as text from `fifadata` (same row labels) and
# parsed into euros.
# NOTE(review): assumes wage text looks like "€565K" / "€1.2M" - confirm against data.csv.
wage_text = fifadata.loc[fif2.index, 'Wage'].astype(str)
wage_scale = wage_text.str[-1].map({'K': 1e3, 'M': 1e6}).fillna(1.0)
wage_eur = pd.to_numeric(wage_text.str.strip('€KM'), errors='coerce') * wage_scale
has_wage = wage_eur.notna()  # rows whose wage could not be parsed are left out
wage_features = fif2.loc[has_wage].drop(columns=['Wage'])
model  = ExtraTreesRegressor()
model.fit(wage_features, wage_eur[has_wage])
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index = wage_features.columns)
feat_importances.nlargest(7).plot(kind = 'barh') ## top 7 features
plt.show()

### 5. What is the age distribution in different clubs? Which club has most players young?

In [None]:
# Number of players per age, bars ordered from most to least frequent age.
age_order = fif['Age'].value_counts().index
sns.countplot(data=fif, y='Age', order=age_order)
plt.show()

# Part - 3
## UK Road Accidents Data


The UK government amassed traffic data between 2000 and 2016, recording over 1.6 million accidents in the process and making this one of the most comprehensive traffic data sets out there. It's a huge picture of a country undergoing change.

### 1. The very first step should be to merge all the 3 subsets of the data.

In [None]:
# Stack the three yearly subsets into one frame.
frames = [accidata1, accidata2, accidata3]
# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it
# every subset keeps its own 0-based labels, so labels repeat and label-based
# alignment becomes ambiguous.
df = pd.concat(frames, ignore_index=True)

In [None]:
# Show the merged accidents frame.
df

### 2. What are the number of casualties in each day of the week? Sort them in descending order. 

In [None]:
# Total casualties per day of the week, largest first. The original only sorted
# individual accident rows by their casualty count and never aggregated by day.
# NOTE(review): assumes the weekday column is named 'Day_of_Week' - confirm via df.columns.
casualties_by_day = df.groupby('Day_of_Week')['Number_of_Casualties'].sum()
casualties_by_day.sort_values(ascending=False)

### 3. On each day of the week, what is the maximum and minimum speed limit on the roads the accidents happened?

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer.
# Column dtypes and non-null counts of the merged accidents frame.
df.info()

In [None]:
# Maximum and minimum speed limit per day of the week. The original printed the
# overall max/min across all days, which does not answer the question.
# NOTE(review): assumes the weekday column is named 'Day_of_Week' - confirm via df.columns.
df.groupby('Day_of_Week')['Speed_limit'].agg(['max', 'min'])

### 4. What is the importance of Light and Weather conditions in predicting accident severity? What does your intuition say and what does the data portray?

In [None]:
# Remove three columns in place, then preview what dropna() would keep
# (the result is only displayed here, not stored).
for unused_column in ('Junction_Control', 'Junction_Detail', 'LSOA_of_Accident_Location'):
    del df[unused_column]
df.dropna()

In [None]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# Re-check row count and dtypes after removing incomplete rows.
df.info()

In [None]:
# Look at the Light_Conditions column before it is encoded.
df.loc[:, 'Light_Conditions']

In [None]:
# Step 1: re-type object columns as 'category'.
object_columns = df.columns[(df.dtypes == 'object').values]
for column_name in object_columns:
    df[column_name] = df[column_name].astype('category')
# Step 2: integer-encode each categorical column. LabelEncoder is not imported
# here; it is expected to be in scope from an earlier cell of this notebook.
category_columns = df.columns[(df.dtypes == 'category').values]
for column_name in category_columns:
    le = LabelEncoder()
    encoded_values = le.fit_transform(df[column_name])
    df[column_name] = encoded_values

In [None]:
# Pair plot of light, weather and severity (all integer-encoded at this point).
x = df.loc[:, ['Light_Conditions', 'Weather_Conditions', 'Accident_Severity']]
sns.pairplot(data=x)

### 5. To predict the severity of the accidents which columns do you think are unnecessary and should be dropped before implementing a regression model. Support your statement using relevant plots and hypotheses derived from them.

In [None]:
# Keep the prediction target in `y`. The original assignment was inverted
# (df['Accident_Severity'] = y): it overwrote the accident severity column with
# whatever `y` held from an earlier, unrelated part of the notebook.
y = df['Accident_Severity']

In [None]:
# Show the encoded accidents frame.
df

In [None]:
# Correlation of every column with Accident_Severity, strongest positive first.
corr_matrix = df.corr()
corr_matrix.loc[:, 'Accident_Severity'].sort_values(ascending=False)

In [None]:
# The same correlations, in original column order.
corr_matrix.loc[:, 'Accident_Severity']

In [None]:
# Independent copy from which weakly correlated columns are removed next.
df2 = df.copy(deep=True)

In [None]:
# Drop from df2 every column whose correlation with Accident_Severity lies
# strictly between -0.03 and 0.03; everything else is kept.
severity_corr = corr_matrix['Accident_Severity']
for column_name in df.columns[(df.dtypes != 'object').values]:
    if abs(severity_corr[column_name]) < 0.03:
        del df2[column_name]

In [None]:
# Show the columns that survived the correlation filter.
df2

### 6. Implement a basic Logistic Regression Model using scikit learn with cross validation = 5, where you predict the severity of the accident (Accident_Severity). Note that here your goal is not to tune appropriate hyperparameters, but to figure out what features will be best to use.

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Select the target by name. The original took the LAST column of df2 as the
# target and all others as features; Accident_Severity is not the last column,
# so the model predicted an unrelated column with the real target as an input.
y = df2['Accident_Severity'] ## dependent feature
x = df2.drop(columns=['Accident_Severity']) ## independent features
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation; the mean of the fold scores is the cell's output.
score = cross_val_score(log_reg, x, y, cv = 5)
score.mean()