In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Instructions
1. We will be conducting the entire assignment through this notebook. You will be entering your code in the cells provided, and any explanation and details asked in markdown cells. 
2. You are free to add more code and markdown cells for describing your answer, but make sure they are below the question asked and not somewhere else. 
3. The notebook needs to be submitted on LMS. You can find the submission link [here](https://lms.iiitb.ac.in/moodle/mod/assign/view.php?id=13932). 
4. The deadline for submission is **5th October, 2020 11:59PM**.

# Data import
The data required for this assignment can be downloaded from the following [link](https://www.kaggle.com/dataset/e7cff1a2c6e29e18684fe6b077d3e4c42f9a7ae6199e01463378c60fe4b4c0cc), it's hosted on kaggle. Do check directory paths on your local system.  

In [None]:
# Root folder of the assignment data. Change this single constant when the
# notebook is run somewhere other than the Kaggle kernel layout.
DATA_DIR = "../input/iiitb-ai511ml2020-assignment-1/Assignment"

alcdata = pd.read_csv(f"{DATA_DIR}/alcoholism/student-mat.csv")
fifadata = pd.read_csv(f"{DATA_DIR}/fifa18/data.csv")
accidata1 = pd.read_csv(f"{DATA_DIR}/accidents/accidents_2005_to_2007.csv")
accidata2 = pd.read_csv(f"{DATA_DIR}/accidents/accidents_2009_to_2011.csv")
accidata3 = pd.read_csv(f"{DATA_DIR}/accidents/accidents_2012_to_2014.csv")

# Part - 1
## Alcohol Consumption Data
The following data was obtained in a survey of students' math course in secondary school. It contains a lot of interesting social, gender and study information about students. 


### 1. Try to visualize correlations between various features and grades and see which features have a significant impact on grades. 
Try to engineer the three grade parameters (G1, G2 and G3) as one feature for such comparisons.



#### Engineering Grades as G_avg which is mean of G1,G2 and G3, and dropping them.

In [None]:
# Engineer one grade feature: the arithmetic mean of the three period grades.
grade_columns = ["G1", "G2", "G3"]
G_avg = (alcdata["G1"] + alcdata["G2"] + alcdata["G3"]) / 3
alcdata["G_avg"] = G_avg
# The individual grades are replaced by G_avg from here on.
alcdata.drop(columns=grade_columns, inplace=True)


#### Plotting a correlation heatmap to study the relationship between the non-categorical features and grades

In [None]:
# Correlation heatmap of the numeric columns.
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# object (string) columns in pandas >= 2.0 instead of silently skipping them.
# The former sns.color_palette("ocean") call was dropped: it only returns a
# palette and had no effect on this figure, whose colours come from cmap.
plt.figure(figsize=(16,10))
numeric_corr = alcdata.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True, cmap="coolwarm", vmin=-1)

#### From the above matrix we see that no two features have a correlation of more than 0.95, so we dont have to drop any columns

#### Studying impact of some features on grades

In [None]:
# Show the column names available in alcdata.
alcdata.columns

### Sex vs Grades

#### Plotting boxplot

In [None]:
# Distribution of the mean grade, one box per sex.
sns.boxplot(data=alcdata, x="G_avg", y="sex")

### Observation:
#### From the above plots we see that the grade distribution for males is towards the right, whereas for females its shifted to a little left. This implies that males tend to score a higher grade as compared to females, but the difference is not much.
#### In the count plot also, on the left side of G_avg, female peaks are more, and on the right, male peaks are more.

### Romantic vs grades

#### Plotting boxplot

In [None]:
# Distribution of the mean grade, one box (and colour) per relationship status.
sns.boxplot(data=alcdata, x="G_avg", y="romantic", hue="romantic")

### Observation:
#### From the above plot, it is evident that students who are not in a relationship perform better: their distribution lies more towards the high side of grades compared to students in a relationship, as seen in the boxplot above.

### Traveltime vs Grades

* ### Plotting barplot as it is a better represention of the whole data 

In [None]:
# Mean grade (bar height) for each travel-time category.
sns.barplot(data=alcdata, x="traveltime", y="G_avg")

### Observation:
#### This clearly indicates that students who have a shorter travel time perform better than students with a longer travel time.

### 2. If there is a need for encoding some of the features,  how would you go  about it? 
Would you consider combining certain encodings together ?


* ### I have used a mix of binary encoding (label encoding) for object columns that have exactly two unique values, and one-hot encoding for the other object columns. I checked that there are no null values in the dataset, so we don't have to handle missing data.

In [None]:
# Names of all object-typed (string) columns; these are the ones to encode.
a = alcdata.select_dtypes(include=["object"]).columns
# Per-column count of missing values.
alcdata.isna().sum()

In [None]:


# Number of distinct values in each object column.
distinct_counts = alcdata[a].nunique()

# p flags the two-valued columns (label-encoded to 0/1);
# k flags every other object column (left for one-hot encoding).
p = distinct_counts == 2
k = distinct_counts != 2

my_label = alcdata[p[p].index].apply(LabelEncoder().fit_transform)
my_onehot = alcdata[k[k].index]
my_onehot

* ### my_onehot contains encodings which had objects having >2 unique values and my_labels contains other object types which are binary encoded.

* ### Dropping all object types and adding their encoded forms.

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 

# One-hot encode the object columns that are not two-valued.
dumm_bin = pd.get_dummies(my_onehot)

# Drop every object column from the original frame and append the encoded
# versions (one-hot dummies and label-encoded binaries) column-wise.
encoded_parts = [alcdata.drop(columns=p.index), dumm_bin, my_label]
new_alcdata = pd.concat(encoded_parts, axis=1)

new_alcdata


* ### Now everything is numerical

In [None]:
# Print the column dtypes and non-null counts of the encoded frame.
new_alcdata.info()


### 3. Try to find out how family relation(famrel) and parents cohabitation(Pstatus) affect grades of students. 


* ### Here, correlation matrix is not a good measure to find dependence as this is ordinal data, so I have used plots to view dependence.

* ### Plotting barplot as it represents mean of data

In [None]:
# Mean grade for each parental cohabitation status.
sns.barplot(data=alcdata, x="Pstatus", y="G_avg")

* ## Explanation:
* ### Students whose parents are apart perform better on an average as compared to students whose parents live together.( Not as expected though :))

In [None]:
# Mean grade for each family-relationship rating.
sns.barplot(data=alcdata, x="famrel", y="G_avg")

* ### Famrel doesn't really affect the grades much, but students' grades first decrease with increasing famrel (which means bad) and then increase again a bit.


### 4. Figure out which features in the data are skewed, and propose a way to remove skew from all such columns. 

In [None]:
# Skewness of each numeric column. Object columns are excluded explicitly
# because DataFrame.skew() raises on non-numeric data in pandas >= 2.0.
alcdata.select_dtypes(include="number").skew()

* ## From the above table, it is evident that features like absences, failures, traveltime and Dalc are highly skewed. I have shown how to remove the skew for absences; the skew of the other features can be removed similarly.
* ## For data with right (positive) skew, we can take the sqrt or log, and for left (negative) skew, we can raise the data to a power greater than 1.

In [None]:
# Histogram (100 bins) with density estimate of the raw absences column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) when upgrading.
sns.distplot(new_alcdata.absences,bins=100)

* ### It is right (positively) skewed, with a long tail towards large values, so we normalise and take the sqrt to reduce the skew.

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 

def min_max_transform(x):
    """Rescale ``x`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    x : array-like supporting ``.min()``, ``.max()`` and arithmetic
        (e.g. a pandas Series or NumPy array).

    Returns
    -------
    Same kind of object as ``x`` with values in ``[0, 1]``. When every value
    of ``x`` is identical the range is zero; all values are then mapped to
    ``0.0`` instead of producing ``0/0 = NaN``.
    """
    lowest = x.min()
    span = x.max() - lowest
    if span == 0:
        # Constant input: multiplying by 0.0 yields float zeros of the same shape.
        return (x - lowest) * 0.0
    return (x - lowest) / span
# Scale absences to [0, 1], take the square root, and plot the distribution.
absences_scaled = min_max_transform(new_alcdata.absences)
sns.distplot(absences_scaled ** 0.5, bins=100)


# Part - 2
## FIFA 2019  Data


### 1. Which clubs are the most economical? How did you decide that?

* ## My criterion for deciding how economical a club is: add the release clause and value of its players, as these are contributing factors, subtract the wage as it is a deducting factor, and then divide by overall, as it is a key indicator of a club's performance.

* ### Preprocessing wage, value and release clause to remove € sign and convert M and K to 10^6 and 10^3 respectively



In [None]:
def _euro_to_float(amounts):
    """Convert strings such as "€110.5M" or "€565K" into floats.

    The euro sign and commas are stripped, and the "M"/"K" suffixes are
    rewritten as the exponents "e06"/"e03", so "€1.5M" becomes "1.5e06",
    which is then parsed as a float. Values that are already numeric pass
    through unchanged apart from the float cast.
    """
    return (
        # Raw string: "\€" is not a valid Python string escape and triggers
        # an invalid-escape warning in a normal string literal.
        amounts.replace(r'[\€,]', '', regex=True)
        .replace('M', 'e06', regex=True)
        .replace('K', 'e03', regex=True)
        .astype(float)
    )


# Apply the same conversion to every monetary column.
for money_column in ("Release Clause", "Value", "Wage"):
    fifadata[money_column] = _euro_to_float(fifadata[money_column])



* ### Calculating club economy, and sorting in decreasing order

In [None]:
# Per-club totals of wage, value, release clause and overall rating.
club_economy = (
    fifadata[["Wage","Value","Club","Release Clause","Overall"]]
    .groupby(["Club"])
    .sum()
)

# Economy score: (release clause + value - wage) per point of total overall rating.
net_assets = club_economy["Release Clause"] + club_economy["Value"] - club_economy["Wage"]
economical = net_assets / club_economy["Overall"]
economical.sort_values(ascending=False)

* ### So, Real Madrid is the most economical club along with FC Barcelona, Juventus and Man City.

### 2. What is the relationship between age and individual potential of the player? How does age influence the players' value? At what age does the player exhibit peak pace ?

* ### Plotting linegraph of Potential vs Age

In [None]:
# Potential as a function of age.
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.lineplot(data=fifadata, x="Age", y="Potential")

* ### This shows that the Potential of a player decreases with age. The spike at the end is due to outliers, as the data density is almost zero there.

* ### Plotting linegraph of Value vs Age

In [None]:
# Value as a function of age.
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.lineplot(data=fifadata, x="Age", y="Value")

* ### This shows that the Value of a player first increases until about age 31, and then starts decreasing, as expected in real life.

* ### Plotting linegraph of Pace(SprintSpeed) vs Age

In [None]:
# Sprint speed (used here as pace) as a function of age.
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.lineplot(data=fifadata, x="Age", y="SprintSpeed")


* ### Pace increases slightly till 26 years of age , then decreases with age as expected

### 3. What skill sets are helpful in deciding a player's potential? How do the traits contribute to the players' potential? 

In [None]:
# Correlation of every numeric feature with Potential.
# Numeric columns are selected first because DataFrame.corr() raises on
# string columns in pandas >= 2.0.
fifadata.select_dtypes(include="number").corr()["Potential"]

* ### Looking at the above table, Potential is not strongly correlated with anything, but when we plot graphs of traits and skills like Penalties, Strength, HeadingAccuracy, etc., the graphs are positively sloped
* ### I'll be plotting some of them
### I have considered Skills- Penalties,HeadingAccuracy and Crossing.
### Traits : Shotpower, Reactions, and Weak Foot.

### There are many others, but I have considered some of them as I feel these are relatively more important in real football. 

* ### Contribution of Penalties to Potential

In [None]:
# Potential vs Penalties (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=fifadata, x="Penalties", y="Potential")

* ### Potential is positively related to Penalties

* ### Contribution of HeadingAccuracy to Potential

In [None]:
# Potential vs HeadingAccuracy (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=fifadata, x="HeadingAccuracy", y="Potential")

* ### Potential is positively related to HeadingAccuracy

* ### Contribution of Crossing to Potential

In [None]:
# Potential vs Crossing (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=fifadata, x="Crossing", y="Potential")

* ### Potential is positively related to Crossing

* ### Contribution of ShotPower to Potential

In [None]:
# Potential vs ShotPower (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=fifadata, x="ShotPower", y="Potential")

* ### Potential is positively related to ShotPower

* ### Reactions vs Potential

In [None]:
# Potential vs Reactions (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=fifadata, x="Reactions", y="Potential")

* ### Potential is positively related to Reactions.

* ### Weak Foot vs Potential

In [None]:
# Potential vs Weak Foot (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=fifadata, x="Weak Foot", y="Potential")

### Linear relationship between weak foot and Potential

### 4. Which features directly contribute to the wages of the players?

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Correlation of every numeric feature with Wage.
# Numeric columns are selected first because DataFrame.corr() raises on
# string columns in pandas >= 2.0.
fifadata.select_dtypes(include="number").corr()["Wage"]

* ### Looking at above table, Wage is mostly related to Value, Overall, International Reputation, Potential and Release Clause

* ### Plotting Line plot between Value and Wage

In [None]:
# Wage vs Value (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=fifadata, x="Value", y="Wage")

* ### Plotting Line plot between Overall and Wage

In [None]:
# Wage vs Overall (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=fifadata, x="Overall", y="Wage")

* ### Plotting Line plot between International Reputation and Wage

In [None]:
# Wage vs International Reputation (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=fifadata, x="International Reputation", y="Wage")

* ### Plotting Line plot between Release Clause and Wage

In [None]:
# Wage vs Release Clause (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=fifadata, x="Release Clause", y="Wage")

* ### Wages  are positively related to all of the above mentioned features

### 5. What is the age distribution in different clubs? Which club has most players young?

* ### I have used mean of age of players in each club as a measure of youngness of a club, then sorted to get club with lowest average age.

In [None]:
# Summary statistics of player age within each club.
club_ages = fifadata[["Age","Club"]]
tp1 = club_ages.groupby(["Club"]).describe()
# Order clubs from youngest to oldest mean age.
tp1.sort_values(by=[('Age',  'mean')])


* ### FC Nordsjælland is the club with the youngest players on average

# Part - 3
## UK Road Accidents Data


The UK government amassed traffic data from 2000 and 2016, recording over 1.6 million accidents in the process and making this one of the most comprehensive traffic data sets out there. It's a huge picture of a country undergoing change.

### 1. The very first step should be to merge all the 3 subsets of the data.

* ### Concatenating datasets to make new dataset accidents

In [None]:
# Stack the three yearly subsets row-wise into one frame.
# ignore_index=True renumbers the rows 0..n-1; without it each subset keeps
# its own 0-based labels and the merged frame has duplicate index values,
# which makes label-based selection and alignment ambiguous.
accidents = pd.concat([accidata1, accidata2, accidata3], axis=0, ignore_index=True)



In [None]:
# Display the merged accidents frame.
accidents

### 2. What are the number of casualties in each day of the week? Sort them in descending order. 

* ### Calculating Casualties on each day of week, using python  dictionaries

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Labels for Day_of_Week codes 1..7, in the order this notebook assumes.
# NOTE(review): the mapping 1 -> "Mon" is an assumption; verify it against the
# dataset's code book (some UK road-safety releases encode 1 as Sunday).
day_labels = ["Mon", "Tue", "Wed", "Thurs", "Fri", "Sat", "Sun"]

# Total number of casualties recorded on each day of the week.
week_casualty = {
    label: accidents.loc[accidents.Day_of_Week == code, "Number_of_Casualties"].sum()
    for code, label in enumerate(day_labels, start=1)
}
week_casualty

* ### Sorting the dictionary in descending order

In [None]:
# Rebuild the dictionary with days ordered from most to fewest casualties.
ranked_days = sorted(week_casualty.items(), key=lambda item: item[1], reverse=True)
dict(ranked_days)


### 3. On each day of the week, what is the maximum and minimum speed limit on the roads the accidents happened?

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 

# Labels for Day_of_Week codes 1..7, in the order this notebook assumes.
# NOTE(review): the mapping 1 -> "Mon" is an assumption; verify it against the
# dataset's code book.
day_labels = ["Mon", "Tue", "Wed", "Thurs", "Fri", "Sat", "Sun"]

# (min, max) speed limit over the accidents recorded on each day.
# Both values are taken from that day's own rows; previously every maximum
# was computed from the Day_of_Week == 1 rows regardless of the day.
day_speed = {}
for code, label in enumerate(day_labels, start=1):
    limits = accidents.loc[accidents.Day_of_Week == code, "Speed_limit"]
    day_speed[label] = (limits.min(), limits.max())
day_speed

### Explanation: Each tuple indicates (min,max) pair

### 4. What is the importance of Light and Weather conditions in predicting accident severity? What does your intuition say and what does the data portray?

* ### Plotting LinePlot and barplot for Light Conditions vs Accident Severity and Weather Conditions vs Accident Severity

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Mean accident severity for each light condition.
plt.figure(figsize=(16, 10))
sns.barplot(data=accidents, x="Light_Conditions", y="Accident_Severity")

In [None]:
# The same relationship drawn as a line plot.
plt.figure(figsize=(16, 10))
sns.lineplot(data=accidents, x="Light_Conditions", y="Accident_Severity")

* ### No street lighting has high severity as expected and daylight has less severity as expected.

In [None]:
# Mean accident severity for each weather condition.
plt.figure(figsize=(16, 10))
sns.barplot(data=accidents, x="Weather_Conditions", y="Accident_Severity")

In [None]:
# The same relationship drawn as a line plot.
plt.figure(figsize=(16, 10))
sns.lineplot(data=accidents, x="Weather_Conditions", y="Accident_Severity")

* ### Fog has high severity, and fine weather without wind has lower severity, as expected. The only discrepancy I found was that rain has lower severity, which I expected to be high.

* ### The above plots show that that Light and Weather Conditions do follow our intuition, and that accident severity is more ( smaller in value)  for Harsh conditions, and less for normal conditions.

### 5. To predict the severity of the accidents which columns do you think are unnecessary and should be dropped before implementing a regression model. Support your statement using relevant plots and hypotheses derived from them.

In [None]:
# Accident severity vs 1st_Road_Class (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=accidents, x="1st_Road_Class", y="Accident_Severity")

####  Keeping this as Accident_Severity depends linearly on 1st_Road_Class 

#### Converting date to lie in range [1,365]

In [None]:
# Build the cumulative number of days that precede each month:
# number_of_days[m - 1] is the day count before month m (February fixed at 28).
days_in_month = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30]
number_of_days = []
running_total = 0
for month_length in days_in_month:
    running_total += month_length
    number_of_days.append(running_total)

print(number_of_days)

# The date strings are sliced as "dd/mm/...": day in characters 0-1, month in 3-4.
month = accidents.Date.apply(lambda text: int(text[3:5]))
date = accidents.Date.apply(lambda text: int(text[:2]))

# Overwrite Date with the day's ordinal position within the year.
accidents.Date = month.apply(lambda m: number_of_days[m - 1]) + date
accidents.Date

In [None]:
# Accident severity vs day of year (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=accidents, x="Date", y="Accident_Severity")

In [None]:
def convert_time(time):
    """Convert an "HH:MM" string to its whole hour.

    Returns ``pd.NA`` when ``time`` is null. Otherwise the hours plus the
    fractional minutes are truncated to an integer, e.g. "12:45" gives 12.
    """
    if pd.isnull(time):
        return pd.NA

    hour_part, minute_part = (int(piece) for piece in time.split(":"))
    return int(hour_part + minute_part / 60)

# Convert each timestamp to its hour of day; missing times become NaN in a
# numeric column so the mean is well-defined.
hour_of_day = pd.to_numeric(accidents["Time"].apply(convert_time), errors="coerce")

# Fill missing hours with the mean hour and assign the result back.
# The earlier accidents.Time.fillna(..., inplace=True) form acts on a column
# obtained through attribute access; recent pandas warns that such chained
# in-place calls may leave the DataFrame itself unchanged.
accidents["Time"] = hour_of_day.fillna(hour_of_day.mean())
accidents["Time"]

In [None]:
# Accident severity vs hour of day (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=accidents, x="Time", y="Accident_Severity")

In [None]:
# Accident severity vs year (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=accidents, x="Year", y="Accident_Severity")

In [None]:
# Accident severity vs day of week (keyword x/y: required by seaborn >= 0.12).
sns.lineplot(data=accidents, x="Day_of_Week", y="Accident_Severity")

* ## Dropping Accident severity as it is our target label and saving it as Y. The new dataset is new_accidents

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
# Separate the target (Accident_Severity) from the candidate features.
new_accidents = accidents.drop(columns="Accident_Severity")
y = accidents["Accident_Severity"]

# Pairwise correlations between the numeric features. Numeric columns are
# selected first because DataFrame.corr() raises on string columns in
# pandas >= 2.0.
new_accidents.select_dtypes(include="number").corr()

### I am dropping the following columns because:
* ### Accident_Index - It is a unique value, and signifies nothing
* ### Location_Easting_OSGR and Location_Northing_OSGR - Their correlation with Longitude and Latitude respectively is very high
* ### Local_Authority_(District) - Correlation with Police force is very high
* ### Junction_Detail - Full of Nan values
* ### LSOA_of_Accident_Location - the LSOA (Lower Layer Super Output Area) code of the accident location does not affect severity, as it is just an area ID
* ### Local_Authority_(Highway) - Too many unique values, doesnt help in classification much as shown below.
* ### Year, Date and Time- They dont have a definite relationship with Accident Severity as shown in above graphs.
* ### 1st_Road_number  and 2nd_Road_Number - it is a ambiguous and fluctuating graph.

In [None]:
# Inspect the LSOA_of_Accident_Location column.
new_accidents["LSOA_of_Accident_Location"]

In [None]:
# Accident severity (y) vs 1st_Road_Number.
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.lineplot(x=new_accidents["1st_Road_Number"], y=y)

In [None]:
# Number of distinct values in 2nd_Road_Number.
new_accidents["2nd_Road_Number"].nunique()

In [None]:
# Number of distinct values in 1st_Road_Number.
new_accidents["1st_Road_Number"].nunique()

In [None]:
# Accident severity (y) for each Local_Authority_(Highway) value.
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.boxplot(x=new_accidents["Local_Authority_(Highway)"], y=y)

#### Dropping this as its not a feasible relationship

* ### Putting all object type in objects, and int and float in non_objects

In [None]:
# Columns excluded from the model input.
columns_to_drop = [
    "Accident_Index",
    "Location_Easting_OSGR",
    "Location_Northing_OSGR",
    "Local_Authority_(District)",
    "Junction_Detail",
    "Year",
    "Date",
    "Time",
    "LSOA_of_Accident_Location",
    "Local_Authority_(Highway)",
    "1st_Road_Number",
    "2nd_Road_Number",
]
new_accidents1 = new_accidents.drop(columns=columns_to_drop)

# Split the remaining features by dtype: strings vs 64-bit numerics.
objects = new_accidents1.select_dtypes(include=["object"])
not_objects = new_accidents1.select_dtypes(include=["float64", "int64"])



* ### Filling NA in null values for objects

In [None]:
# Replace missing values in the string columns with the literal category "NA".
objects=objects.fillna("NA")

* ### Filling null values in int and float by mean

In [None]:
# Impute missing coordinates with the mean of their column.
for coordinate in ("Longitude", "Latitude"):
    column_mean = not_objects[coordinate].mean()
    not_objects[coordinate] = not_objects[coordinate].fillna(column_mean)

# Remaining null count per numeric column.
not_objects.isnull().sum()


* ### One hot encoding object types, and then concatenating with non_objects to get dataset accidata1 for training.

In [None]:
# One-hot encode the string columns.
objects1=pd.get_dummies(objects)
# Join the encoded and numeric columns side by side into the feature matrix.
# NOTE(review): this rebinds `accidata1`, which earlier held the raw
# 2005-2007 CSV; re-running cells above this one afterwards will see the
# feature matrix instead of the raw data.
accidata1=pd.concat([objects1,not_objects],axis=1)
# Count of columns per dtype in the feature matrix.
accidata1.dtypes.value_counts()

* ### No Null values present.

In [None]:

# Null count per column of the feature matrix.
accidata1.isnull().sum()

### 6. Implement a basic Logistic Regression Model using scikit learn with cross validation = 5, where you predict the severity of the accident (Accident_Severity). Note that here your goal is not to tune appropriate hyperparameters, but to figure out what features will be best to use.

* ### The idea that I used is, I created 3 models, one for predicting 1( Meaning output 1 for 1 and 0 for 2 and 3), one for predicting 2 and one for predicting 3. Then, while testing, whichever is one, will be output.

* ### In another method I used LogisticRegressionCV  to make a model to predict . I used the multi_class hyperparameter to use Logistic Regression to predict 3 classes.

In [None]:
#enter code/answer in this cell. You can add more code/markdown cells below for your answer. 
import sklearn
from sklearn.linear_model import LogisticRegressionCV,LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

### 1st Approach

In [None]:
# Build one binary target list per severity class (one-vs-rest):
# y1 marks severity 1, y2 marks severity 2, y3 marks severity 3.
# Labels outside {1, 2, 3} contribute no entry to any list.
recognised_labels = [label for label in y if label in (1, 2, 3)]
y1 = [1 if label == 1 else 0 for label in recognised_labels]
y2 = [1 if label == 2 else 0 for label in recognised_labels]
y3 = [1 if label == 3 else 0 for label in recognised_labels]
        


In [None]:
# One independent binary logistic-regression classifier per severity class.
model1, model2, model3 = (LogisticRegression(max_iter=1000) for _ in range(3))

In [None]:
# 80/20 train/test split for each binary target.
# A fixed random_state makes the split reproducible across runs, and using the
# same seed for all three calls yields the same row partition each time, so
# the three classifiers share one training set and one held-out test set.
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(accidata1, y1, test_size=0.2, random_state=42)
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(accidata1, y2, test_size=0.2, random_state=42)
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(accidata1, y3, test_size=0.2, random_state=42)



In [None]:
# Train the "severity 1 vs rest" classifier on its training split.
model1.fit(X_train1,Y_train1)


In [None]:
# Train the "severity 2 vs rest" classifier on its training split.
model2.fit(X_train2,Y_train2)

In [None]:
# Train the "severity 3 vs rest" classifier on its training split.
model3.fit(X_train3,Y_train3)

In [None]:
# Binary predictions of each classifier for every row of the feature matrix.
# NOTE(review): accidata1 contains the rows the models were trained on, so any
# accuracy computed from these predictions is not a held-out estimate.
y_pred1=model1.predict(accidata1)
y_pred2=model2.predict(accidata1)
y_pred3=model3.predict(accidata1)


In [None]:
# Merge the binary outputs into one class label per row, by priority:
# class 1 if model1 fired, else class 2 if model2 fired, else class 3.
y_pred = [
    1 if is_class1 == 1 else (2 if is_class2 == 1 else 3)
    for is_class1, is_class2 in zip(y_pred1, y_pred2)
]

In [None]:
# Fraction of rows whose combined prediction equals the true severity.
# Arguments follow the documented (y_true, y_pred) order.
accuracy_score(y, y_pred)

### Accuracy of 1st Model= 85%

### 2nd Approach

### n_jobs enables parallelism; multi_class="multinomial" lets the model predict 3 or more classes.

In [None]:
# Multinomial logistic regression with built-in 5-fold cross-validation (cv=5),
# run in parallel (n_jobs=-1), with up to 1000 solver iterations.
model=LogisticRegressionCV(cv=5,multi_class="multinomial",n_jobs=-1,max_iter=1000)


### Normalizing data for faster training

In [None]:
# Scaler used to standardise the feature columns.
scaler=StandardScaler()

In [None]:
# Fit the scaler on the full feature matrix and replace it with the scaled values.
# NOTE(review): the scaler is fitted before the train/test split below, so the
# test rows influence the scaling statistics; consider fitting on the training
# split only.
accidata1=scaler.fit_transform(accidata1)

In [None]:
# Display the scaled feature matrix.
accidata1

In [None]:
# 80/20 train/test split of the scaled features and the 3-class target.
# A fixed random_state makes the split, and the reported accuracy, reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(accidata1, y, test_size=0.2, random_state=42)

In [None]:
# Train the cross-validated multinomial model on the training split.
model.fit(X_train,Y_train)

In [None]:
# Predict severity classes for the held-out test split.
y_pred_2=model.predict(X_test)

In [None]:
# Held-out accuracy of the multinomial model.
# Arguments follow the documented (y_true, y_pred) order.
accuracy_score(Y_test, y_pred_2)

### Accuracy using LogisticRegressionCV = 85%.