In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import datetime
from sklearn import preprocessing 

# Instructions
1. We will be conducting the entire assignment through this notebook. You will be entering your code in the cells provided, and any explanation and details asked in markdown cells. 
2. You are free to add more code and markdown cells for describing your answer, but make sure they are below the question asked and not somewhere else. 
3. The notebook needs to be submitted on LMS. You can find the submission link [here](https://lms.iiitb.ac.in/moodle/mod/assign/view.php?id=13932). 
4. The deadline for submission is **5th October, 2020 11:59PM**.

# Data import
The data required for this assignment can be downloaded from the following [link](https://www.kaggle.com/dataset/e7cff1a2c6e29e18684fe6b077d3e4c42f9a7ae6199e01463378c60fe4b4c0cc), it's hosted on kaggle. Do check directory paths on your local system.  

In [None]:
# Load the assignment datasets from the Kaggle input directory.
# The paths are relative to the notebook's working directory; adjust them when running locally.
alcdata = pd.read_csv("../input/iiitb-ai511ml2020-assignment-1/Assignment/alcoholism/student-mat.csv")
fifadata = pd.read_csv("../input/iiitb-ai511ml2020-assignment-1/Assignment/fifa18/data.csv")
# The accidents data is split over three CSV files (by year range); Part 3 concatenates them.
accidata1 = pd.read_csv("../input/iiitb-ai511ml2020-assignment-1/Assignment/accidents/accidents_2005_to_2007.csv")
accidata2 = pd.read_csv("../input/iiitb-ai511ml2020-assignment-1/Assignment/accidents/accidents_2009_to_2011.csv")
accidata3 = pd.read_csv("../input/iiitb-ai511ml2020-assignment-1/Assignment/accidents/accidents_2012_to_2014.csv")

# Part - 1
## Alcohol Consumption Data
The following data was obtained in a survey of students' math course in secondary school. It contains a lot of interesting social, gender and study information about students. 


### 1. Try to visualize correlations between various features and grades and see which features have a significant impact on grades. 
Try to engineer the three grade parameters (G1, G2 and G3) as one feature for such comparisons.



There are no null values in this dataset, therefore we can move on to finding the features with a significant impact
on grades.

In [None]:
# A cell only renders its last expression, so the per-column null counts are
# printed explicitly; otherwise they would be computed and silently discarded.
print(alcdata.isnull().sum())
alcdata.info()

In [None]:
#Function to get a sorted correlation plot, based on the target column specified ( decreasing)
def CorrPlotLargest(df, target, k=10, figsize=(10, 6)):
    """Heatmap of the ``k`` columns most positively correlated with ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; only numeric and boolean columns take part.
    target : str
        Name of a numeric column of ``df`` to rank the other columns against.
    k : int, default 10
        Number of columns shown (``target`` itself is among them, since its
        self-correlation of 1 is the largest possible value).
    figsize : tuple, default (10, 6)
        Size of the new matplotlib figure the heatmap is drawn on.

    Returns
    -------
    The matplotlib Axes returned by ``sns.heatmap``.

    Raises
    ------
    KeyError
        If ``target`` is not one of the numeric/boolean columns.
    """
    # Public equivalent of the private DataFrame._get_numeric_data().
    numeric = df.select_dtypes(include=['number', 'bool'])
    # Correlations are pairwise, so the sub-matrix of the full correlation
    # matrix equals the correlation matrix of the selected columns; computing
    # it once avoids a second pass over the data.
    corr = numeric.corr()
    cols = corr.nlargest(k, target)[target].index
    cm = corr.loc[cols, cols]
    plt.figure(figsize=figsize)
    return sns.heatmap(cm, annot=True, cmap = 'viridis')

#Function to get a sorted correlation plot, based on the target column specified (increasing)
def CorrPlotSmallest(df, target, k=10, figsize=(10, 6)):
    """Heatmap of ``target`` plus the ``k - 1`` columns least correlated with it.

    "Least" means the smallest signed correlation, i.e. the strongest
    negative correlations come first.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; only numeric and boolean columns take part.
    target : str
        Name of a numeric column of ``df`` to rank the other columns against.
    k : int, default 10
        Total number of columns shown, including ``target``.
    figsize : tuple, default (10, 6)
        Size of the new matplotlib figure the heatmap is drawn on.

    Returns
    -------
    The matplotlib Axes returned by ``sns.heatmap``.

    Raises
    ------
    KeyError
        If ``target`` is not one of the numeric/boolean columns.
    """
    # Public equivalent of the private DataFrame._get_numeric_data().
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr()
    cols = corr.nsmallest(k-1, target)[target].index
    # The target has self-correlation 1, so it is never among the smallest;
    # put it first so the top row / left column of the heatmap is the target.
    cols = cols.insert(0,target)
    # Pairwise correlations: slicing the full matrix equals recomputing it.
    cm = corr.loc[cols, cols]
    plt.figure(figsize=figsize)
    return sns.heatmap(cm, annot=True, cmap = 'viridis')

<b>Since we have to merge the three grade columns, let's take their plain average for now, as we have no basis for
weighting one grade more than another.</b> <br> Making a single average-grade column also simplifies the analysis, because we only need to compare each feature against this one variable instead of against all three tests.

In [None]:

# Single grade target: the unweighted mean of the three period grades.
alcdata['G_avg'] = (alcdata['G1'] + alcdata['G2'] + alcdata['G3'])/3

# Features: every column except the last four.
# NOTE(review): this presumes G1, G2 and G3 are the final three columns of the CSV, so that
# together with the newly appended G_avg exactly the grade columns are excluded — confirm.
alcdata_features = alcdata.iloc[:,:-4]

# Target: the last column, i.e. the G_avg column appended above.
alcdata_target = alcdata.iloc[:,-1]

# Correlation heatmap of the numeric columns with the largest correlation to G_avg
subplot1 = CorrPlotLargest(alcdata,'G_avg')

# Correlation heatmap of the numeric columns with the smallest (most negative) correlation to G_avg
subplot2 = CorrPlotSmallest(alcdata,'G_avg')


<p>After seeing this we dont get much insight, some things is that all the grades G1,G2,G3 are pretty correlated to each other, also the failures are negatively correlated with the grades as it should be. So let's see some of the top things correlated with grades.</p>
<ul>
<li>So if you father's or mother's education is high you can get good grades which seems correct</li>
<li>Your grades decrease as you age(this isn't a very strong relation but I guess as studies get harder it is difficult to score so not unexpected)</li>
    <li>Also if you have more studytime your grade is better as well.</li>
<li>And if you goout or travel more, your grades are expected to be low which is also consistent</li>
    <li>And if you have more failures, your grades would be lesser </li>
 
</ul>
<b>These were some of the highest correlations that could be seen from the given heatmap</b>

<p> Now let's see a distribution of g_avg w.r.t to all the above features to get a more better insight

In [None]:
# The number of students differs per age, so compare the mean G_avg of each
# age group rather than raw totals. groupby(...).mean() replaces the former
# "divide every row by the group size, then sum" construction.
ExpectedMarks_With_Age = alcdata.groupby('age')['G_avg'].mean()
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.jointplot(x=ExpectedMarks_With_Age.index, y=ExpectedMarks_With_Age.values)

As we can see, there is an apparent downward slope except at age 20; let's see why that is.

In [None]:
# Mean G_avg of the 20-year-olds; printed because only the last expression of
# a cell is rendered and this value would otherwise be discarded.
print(ExpectedMarks_With_Age.loc[20])
# The individual students behind that mean.
alcdata[alcdata['age'] == 20]

It seems there were two very good performers in the age-20 group. They act like outliers in this data set and make the mean score of the age-20 group skyrocket.


Next we are checking the relation of grades with father's and mother's education and how it correlates with the 
grade

In [None]:
# Mean G_avg at each level of mother's education, father's education and study time.
rel = ['Medu','Fedu','studytime']
for col in rel:
    ex = alcdata.groupby(col)['G_avg'].mean()
    # Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
    sns.jointplot(x=ex.index, y=ex.values)
    
 

For all of the above factors, apart from some outliers where the mother or father has an education level of 0, we
see an apparent increase in grades as the father's or mother's education increases. We can see the same relation with study time as well.

### 2. If there is a need for encoding some of the features,  how would you go  about it? 
Would you consider combining certain encodings together ?


Now that we have looked at the *numerical categories* lets take a look at the *categorical data* as well with the help of one hot encoding.

In [None]:
# One-hot encode the categorical (object-dtype) features: one 0/1 column per category level.
# dtype=int keeps the indicators numeric on pandas versions where get_dummies defaults to bool.
obj_alcdata = alcdata_features.select_dtypes(include=['object']).copy()
onehot_objects = pd.get_dummies(obj_alcdata, dtype=int)

# Mean G_avg of the students that belong to each category level.
# Boolean-mask selection with .loc replaces the former iloc[... .index] lookup,
# which mixed positional indexing with index labels.
means = [alcdata.loc[onehot_objects[col] == 1, 'G_avg'].mean()
         for col in onehot_objects.columns]

# Share of students in each level; the mean of a 0/1 indicator is exactly the
# bar height that sns.barplot draws below.
shares = onehot_objects.mean()

plt.figure(figsize = (20,8))
g = sns.barplot(data=onehot_objects,ci=None)
plt.xticks(rotation = 'vertical')
plt.ylabel("Probability of occurence of a particular categorical gp")

# Write the mean grade on top of every bar. The loop runs over the one-hot
# columns (one bar each); it previously ran over the rows of alcdata and
# indexed past the end of `means`.
for position, (share, mean_grade) in enumerate(zip(shares, means)):
    g.text(position, share, round(mean_grade,1), color='black', ha="center")
#A plot of probabilities of the entity being in one of the following categories

Above is a plot of the average marks for each categorical value. The Y axis shows the probability of occurrence of a particular category, the X axis shows the categories, and the value on top of each bar is the mean score within that category.

#### So we can make some statements after saying this like
1. School doesnt affect the grades by a lot
2. Even though there are more females in school , men score higher as compared to females
3. Rural or Urban living conditions affect the grade by quite a lot.
4. Apparently if the parents are apart the student are scoring more, but it could be due to some specially high cases in it as the set with parents apart is pretty low as compared to the parents together
5. Mother's and Father's job plays a major role in upgrading the grades, with the best combination being mother in *healthcare* and father in *teaching* profession.
6. Students whose guardian is their father or mother are likely to score higher than those with another guardian.
7. If a student has extra educational support, the student scores lower in the exams, which is interesting; the same holds for family educational support.
8. Extra paid classes don't help the student much.
9. The data shows that students who want to pursue higher education score much better grades than those who don't. This makes sense: students who don't plan on higher education have less reason to aim for good grades here in the first place; maybe they are taking over a family business or similar.
10. Internet helps a student to learn more and hence score more in the exams.

Now, to improve the features: since some features have only two possible values, we can simply label-encode them.
For the others I'm going with one-hot encoding, as it doesn't increase the number of features by much.

In [None]:
# Columns with exactly two distinct values are label-encoded in place (one 0/1 column);
# the remaining object-dtype columns are one-hot encoded.
cols = alcdata.nunique()  # computed once instead of once per column
new_cols = [col for col in alcdata.columns if cols[col] == 2]
other_cols = [col for col in alcdata.columns
              if cols[col] != 2 and alcdata[col].dtype == 'object']

le = preprocessing.LabelEncoder()
for col in new_cols:
    alcdata[col] = le.fit_transform(alcdata[col].astype(str))

new_obj_data = pd.get_dummies(alcdata[other_cols])

# Replace the multi-level categorical columns by their one-hot columns.
# drop(columns=...) is required: a bare drop(other_cols) targets row labels
# and raises KeyError. concat on axis=1 aligns on the shared row index and,
# unlike merge(on=index), adds no extra key column.
alcdata = pd.concat([alcdata.drop(columns=other_cols), new_obj_data], axis=1)

alcdata.head()


### 3. Try to find out how family relation(famrel) and parents cohabitation(Pstatus) affect grades of students. 


In [None]:
# Indicator: 1 when the parents live apart ('A'), 0 when together ('T').
# It is built as a separate Series instead of overwriting alcdata['Pstatus'],
# so the cell can be re-run and works whether or not the encoding cell above
# has already turned Pstatus into integers.
pstatus = alcdata['Pstatus']
if pstatus.dtype == object:
    parents_apart = pstatus.map({'A': 1, 'T': 0})
else:
    # Already label-encoded. LabelEncoder numbers its classes in sorted order,
    # so 'A' became 0 and 'T' became 1; flip it so that "apart" is 1.
    parents_apart = 1 - pstatus

rel_alcdata = alcdata[['famrel','G3']].assign(Pstatus=parents_apart).groupby('famrel')

# Share of students with parents living apart, per family-relationship score.
final_val = rel_alcdata['Pstatus'].mean()
# Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
sns.barplot(x=final_val.index, y=final_val.values)
plt.ylabel('PStatus')

Here we can see that for a student with poor family relations, the probability that the parents live apart is quite high, so there is a clear correlation between these two columns. There is an increase towards the end; we will look into the reason for that next.


### 4. Figure out which features in the data are skewed, and propose a way to remove skew from all such columns. 

In [None]:
# Candidate columns for skew: everything that is not object-dtype.
possible_skews = alcdata.loc[:, alcdata.dtypes != 'object']
# NOTE(review): this is not the cell's last expression, so its result is never rendered.
# Kurtosis measures tail weight rather than asymmetry; `.skew()` is the direct measure.
abs(possible_skews.kurtosis()) > 0.5
sns.pairplot(data = possible_skews)

#Features flagged by the kurtosis check:
#   traveltime,failures,famrel,Dalc,Walc,absences

plt.show()


# Histogram of absences after a power transform with exponent 0.45.
plt.hist(possible_skews['absences'].apply(lambda x: np.power(x,0.45)),bins=40)

# Categorical data does not come from a normal distribution, so skew is not meaningful for it.
# The only variable treated as skewed here is absences; all the others are either
# categorical, have minimal kurtosis, or show little skew in the plots.



In [None]:
# Kurtosis of absences after the 0.45 power transform.
possible_skews['absences'].apply(lambda x: np.power(x,0.45)).kurtosis()


In [None]:
# One 50-bin histogram per non-object column, to inspect each distribution's shape.
possible_skews.hist(bins = 50,figsize = (15,15))
plt.show()

In [None]:
# Pairwise scatter/histogram grid over alcdata.
# NOTE(review): the grid grows quadratically with the number of columns plotted.
sns.pairplot(alcdata)

# Part - 2
## FIFA 2019  Data


### 1. Which clubs are the most economical? How did you decide that?

This is just a test column as some values for bayern munich werent upto the mark

In [None]:
# Money columns look like "€110.5M" / "€565K". Dropping the euro sign and
# turning the M/K suffix into an exponent makes them parseable as floats.
# Series.replace with regex=True substitutes inside each string and leaves
# NaN untouched, unlike .apply(lambda x: x.replace(...)), which fails on NaN.
value_eur = fifadata['Value'].replace({'€': '', 'M': 'e06', 'K': 'e03'}, regex=True).astype(float)

# Sanity check: Bayern München players outside the ten most valuable players overall.
# Rows are selected by label and filtered on the selected frame itself, so the
# boolean mask always lines up with the rows it filters.
by_value = fifadata.loc[value_eur.sort_values(ascending = False).index[10:],
                        ['Name','Value','Release Clause','Club']]
print(by_value[by_value['Club'] == "FC Bayern München"].head(100))

# Split on presence of a release clause. .copy() makes both frames independent
# of fifadata, so assigning columns to them is well defined.
null_data = fifadata[fifadata['Release Clause'].isnull()].copy()
non_null_data = fifadata[fifadata['Release Clause'].notnull()].copy()
non_null_data['Release Clause'] = non_null_data['Release Clause'].replace({'€': '', 'M': 'e06', 'K': 'e03'}, regex=True).astype(float)

# Mean release clause (in euros) for each Overall rating; used in the next
# cell to impute missing release clauses.
gped_data = non_null_data.groupby('Overall')['Release Clause'].mean()


The above cell is used in the next cell to replace zero or null release-clause values with the mean release clause of players who have the same Overall rating, because those two columns are highly correlated.

My intuition is that the most economical club is the one that, even if it goes bankrupt and has to sell all its players,
ends up with the maximum amount of money. When a player is sold, the minimum amount received by the club is the release
clause, and Value is assumed here to be the money the club spent to get that player.
Therefore for me economical:
    
                                             Release Clause - (Value + Wages)


In [None]:
# Economy of a club = sum over its players of
#     Release Clause - (Value + Wage)
# i.e. what the club would receive by selling everybody, minus what the squad costs.

MONEY_PATTERN = {'€': '', 'M': 'e06', 'K': 'e03'}

# Work on a copy so fifadata itself is left untouched and the cell can be re-run.
testdata = fifadata.copy()
for col in ['Release Clause', 'Value', 'Wage']:
    # "€110.5M" -> "110.5e06" -> 110500000.0 ; NaN stays NaN.
    testdata[col] = testdata[col].replace(MONEY_PATTERN, regex=True).astype(float)

# Impute missing (or zero) release clauses with the mean release clause of
# players that share the same Overall rating (gped_data, from the cell above).
# .loc with a row mask writes into testdata; the former
# testdata[mask]['Release Clause'] = ... assigned to a temporary copy and
# left testdata unchanged.
missing_clause = testdata['Release Clause'].isna() | testdata['Release Clause'].eq(0)
testdata.loc[missing_clause, 'Release Clause'] = testdata.loc[missing_clause, 'Overall'].map(gped_data)

club_data = testdata[['Wage','Value','Release Clause','Club']].groupby('Club')
club_totals = club_data.sum()
AmountPerClub = club_totals['Release Clause'] - club_totals['Value'] - club_totals['Wage']
# The plots use the raw amounts, sorted; an earlier normalisation step was
# overwritten by this line before ever being used and has been removed.
final_values = AmountPerClub.sort_values(ascending = False)

# Top 300 clubs, 100 per panel.
fig, axs = plt.subplots(3,1,figsize=(17,15))
fig.tight_layout()
plt.subplots_adjust(hspace=1.5)
for ax, start in zip(axs, range(0, 300, 100)):
    chunk = final_values.iloc[start:start + 100]
    ax.tick_params(axis='x', rotation=90)
    # Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
    sns.barplot(x=chunk.index, y=chunk.values, ax=ax)


In [None]:
# Number of players that have no club recorded.
fifadata['Club'].isnull().sum()

Some of the players don't have a club recorded. Let's keep them for later, when relating the club to other features, although this value is potentially very difficult to predict.

### 2. What is the relationship between age and individual potential of the player? How does age influence the players' value? At what age does the player exhibit peak pace ?

In [None]:
# Pairwise correlations between age, potential and value.
# Relies on testdata['Value'] having been converted to a numeric column earlier.
corrMatrix = testdata[['Age','Potential','Value']].corr()
sns.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
# Highest Potential observed at each age.
gp_data = fifadata[['Age','Potential']].groupby('Age')
fv = gp_data['Potential'].max()
# Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
sns.barplot(x=fv.index, y=fv.values)

The plot shows the decrease with age. The maximum is taken because we are looking at the peak performance within each age group.

In [None]:
# Joint distribution of potential (x) and age (y).
# Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
sns.jointplot(x=fifadata['Potential'], y=fifadata['Age'])

In [None]:
# Joint distribution of age (x) and the numeric Value column (y).
# Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
sns.jointplot(x=testdata['Age'], y=testdata['Value'])

This graph shows that a player's value is max at some point between 25-30 and follows a gaussian pattern

In [None]:
# Fill missing pace-related attributes with the column median.
# The result is assigned back to the column: calling
# fifadata[col].fillna(..., inplace=True) acts on a column selection and does
# not update fifadata when pandas copy-on-write is active.
pace_cols = ['Acceleration','SprintSpeed', 'Agility' ,  'Stamina']
for col in pace_cols:
    fifadata[col] = fifadata[col].fillna(fifadata[col].median())
# Remaining nulls per column (expected to be zero).
fifadata[pace_cols].isnull().sum()

### 3. What skill sets are helpful in deciding a player's potential? How do the traits contribute to the players' potential? 

In [None]:
testdata = fifadata.copy()
# "€565K" -> "565e03" -> 565000.0 ; regex replacement leaves NaN untouched.
testdata['Wage'] = testdata['Wage'].replace({'€': '', 'M': 'e06', 'K': 'e03'}, regex=True).astype(float)

# CorrPlotLargest opens its own figure, so no plt.figure() call is made here;
# one placed before it only leaves an extra empty figure in the output.
CorrPlotLargest(testdata,'Potential')


So here we can see there is quite a high correlation:

Potential V traits: +ve correlation with Overall , Reactions, Composure, International Reputation, Special and so on

In [None]:
# Mean Potential at each level of the traits most correlated with it.
# ('Special' is deliberately left out, as in the original analysis.)
potential_traits = ['Overall', 'Reactions', 'Composure', 'International Reputation']
# One axis per plotted trait, so no empty panel is left at the bottom.
fig,ax = plt.subplots(len(potential_traits),1,figsize = (20,20))
for axis, trait in zip(ax, potential_traits):
    # Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
    sns.barplot(x=testdata[trait], y=testdata['Potential'], ax = axis)


In all of them we can see an apparent increase in Potential along with the respective feature.

In [None]:
# Mean Potential at each level of four further skill attributes.
skill_traits = ['BallControl', 'Skill Moves', 'LongPassing', 'Dribbling']
# One axis per plotted trait, so no empty panel is left at the bottom.
fig,ax = plt.subplots(len(skill_traits),1,figsize = (20,20))
for axis, trait in zip(ax, skill_traits):
    # Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
    sns.barplot(x=testdata[trait], y=testdata['Potential'], ax = axis)

### 4. Which features directly contribute to the wages of the players?

In [None]:
# Heatmap of the numeric columns most positively correlated with Wage.
# Relies on testdata['Wage'] having been converted to a numeric column earlier.
CorrPlotLargest(testdata,'Wage')

### 5. What is the age distribution in different clubs? Which club has most players young?

Here I am assuming "young" means aged 21 or below, and on that basis I count the players of that age
in each club.

In [None]:
club_data = fifadata[['Age','Club']].groupby('Club')
young_age = 21
# Number of players aged young_age or below in each club.
ageVclub = club_data['Age'].apply((lambda x: len(x[x<=young_age])))
final_values = ageVclub.sort_values(ascending = False)

# Top 300 clubs, 100 per panel.
fig, axs = plt.subplots(3,1,figsize=(17,18))
fig.tight_layout()
plt.subplots_adjust(hspace=2)
for ax, start in zip(axs, range(0, 300, 100)):
    chunk = final_values.iloc[start:start + 100]
    ax.tick_params(axis='x', rotation=90)
    # Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
    sns.barplot(x=chunk.index, y=chunk.values, ax=ax)
plt.show()

Thus we can see that clubs like FC Nordsjælland, Ajax, etc. have invested more in building teams made up of younger players.

# Part - 3
## UK Road Accidents Data


The UK government amassed traffic data from 2000 and 2016, recording over 1.6 million accidents in the process and making this one of the most comprehensive traffic data sets out there. It's a huge picture of a country undergoing change.

### 1. The very first step should be to merge all the 3 subsets of the data.

Since all the dataframes have exactly the same columns which I saw individually, we can simply concat the three sets 
and work on these sets individually according to our will.

In [None]:
# Index.equals gives one True/False per pair and also copes with differing
# column counts, where an elementwise == would raise. Equality is symmetric,
# so each pair is checked once; 1==2 and 2==3 together imply 1==3.
print(accidata1.columns.equals(accidata2.columns))
print(accidata2.columns.equals(accidata3.columns))

In [None]:
# Stack the three period files. ignore_index=True gives every row a unique
# label; without it each file's own 0..n-1 labels are repeated, which breaks
# any later alignment or merge on the index.
accidata = pd.concat([accidata1,accidata2,accidata3], ignore_index=True)
#Since on initial analysis it displayed itself as a entirely null column
accidata = accidata.drop('Junction_Detail',axis = 1)

#Again the number of null values in Longitude and Latitude seemed to be pretty low so I just straight away removed them
#so that it doesn't cause trouble in later stages (rows with fewer than 28 non-null values are dropped)
accidata = accidata.dropna(axis=0,thresh = 28)

#Removing the null values of those variable that are very hard to predict and are pretty low
# .copy() makes the filtered frame independent, so the column assignments below are well defined.
accidata = accidata[accidata['Special_Conditions_at_Site'].notna()].copy()

#Creating a new category unknown for categorical variable that have a high amount of null values
accidata['LSOA_of_Accident_Location'] = accidata['LSOA_of_Accident_Location'].fillna("Unknown_location")
accidata['Junction_Control'] = accidata['Junction_Control'].fillna("Unknown_Junction")

#Filling with the most frequent value
accidata['Pedestrian_Crossing-Human_Control'] = accidata['Pedestrian_Crossing-Human_Control'].fillna('None within 50 metres')
accidata['Pedestrian_Crossing-Physical_Facilities'] = accidata['Pedestrian_Crossing-Physical_Facilities'].fillna('No physical crossing within 50 meters')

#Removing all the rest of the null values
accidata = accidata.dropna()

#Setting date to be a numeric timestamp: whole seconds since 1970-01-01.
# Parsed in one vectorised call; the value does not depend on the machine's
# local timezone (time.mktime interprets the date in local time).
parsed_dates = pd.to_datetime(accidata['Date'], format="%d/%m/%Y")
accidata['Date'] = (parsed_dates - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

#Very high correlation with date so can be removed
accidata = accidata.drop('Year',axis = 1)

test = accidata['Accident_Severity']
# Non-null count per column after cleaning, as the cell's rendered output.
accidata.notna().sum()

The above cell merges the three datasets and also preprocesses the data. More details are given in the code comments.

In [None]:
# Frequency of each LSOA code, including the "Unknown_location" filler category.
accidata['LSOA_of_Accident_Location'].value_counts()

In [None]:
# Frequency of each junction-control type, including the "Unknown_Junction" filler category.
accidata['Junction_Control'].value_counts()

Plotting a Correlation Mapping between the different features

In [None]:
plt.figure(figsize = (15,15))
# Correlation is only defined for numeric columns. Selecting them explicitly
# works on every pandas version; a bare DataFrame.corr() raises on frames that
# still contain string columns in pandas >= 2.0.
corrMatrix = accidata.select_dtypes(include='number').corr()
sns.heatmap(corrMatrix, annot=True)

Since there is a 1:1 correlation between Longitude and Location_Easting_OSGR, and the same between Latitude and
Location_Northing_OSGR, we can remove one of the two columns in each pair.

In [None]:
#Checking for skew
# Printed explicitly: only a cell's last expression is rendered, and this cell ends in an assignment.
print(accidata[['Location_Easting_OSGR', 'Location_Northing_OSGR']].kurtosis())

#Location Northing Shows a bit of skew so updating it to not have skew
# np.log is applied to the whole column at once instead of row by row.
# NOTE: this overwrites the column, so run the cell only once per kernel session.
accidata['Location_Northing_OSGR'] = np.log(accidata['Location_Northing_OSGR'])

In [None]:
# Longitude/Latitude carry the same information as the OSGR easting/northing
# columns, so they are removed here. errors='ignore' keeps the cell re-runnable
# once the columns are already gone.
accidata = accidata.drop(columns=['Longitude', 'Latitude'], errors='ignore')
accidata

Just some columns to see the plot of eastern and checking if there is no skew there

In [None]:
# Distribution of the easting coordinate. The former per-row np.power(x, 1)
# was an identity transform and has been removed.
accidata['Location_Easting_OSGR'].hist(bins = 1000,figsize = (5,5))

In [None]:
# Let pandas render up to 50 columns so the wide accident frame is not truncated.
pd.set_option('display.max_columns', 50)
# Preview of the first 150 cleaned rows.
accidata.head(150)

### 2. What are the number of casualties in each day of the week? Sort them in descending order. 

In [None]:
#Pretty much a simple group by, sum and then sort
Weekday_Deaths = (accidata.groupby('Day_of_Week')['Number_of_Casualties']
                  .sum()
                  .sort_values(ascending = False))
# Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
# `order` keeps the bars in descending-casualty order rather than day order.
g = sns.barplot(x=Weekday_Deaths.index, y=Weekday_Deaths.values, order = Weekday_Deaths.index)
# Also render the sorted totals themselves.
Weekday_Deaths


### 3. On each day of the week, what is the maximum and minimum speed limit on the roads the accidents happened?

In [None]:
#Here you can change the 'Road_Type' column to see how exactly that feature changes speed limit.
MaxSpeedLimit_Week = accidata.groupby(['Day_of_Week','Road_Type'])['Speed_limit'].max()
MinSpeedLimit_Week = accidata.groupby(['Day_of_Week','Road_Type'])['Speed_limit'].min()

# Two stacked panels (max, then min) per day, in ascending day order.
days = MaxSpeedLimit_Week.index.get_level_values('Day_of_Week').unique()
fig, axs = plt.subplots(2 * len(days),1,figsize=(15,50))

for position, day in enumerate(days):
    max_ax = axs[2 * position]
    min_ax = axs[2 * position + 1]
    day_max = MaxSpeedLimit_Week.loc[day]   # indexed by Road_Type
    day_min = MinSpeedLimit_Week.loc[day]
    # Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
    sns.barplot(x=day_max.index, y=day_max.values, ax = max_ax)
    max_ax.set_ylabel('Max Speed on Week Day' + str(day))
    sns.barplot(x=day_min.index, y=day_min.values, ax = min_ax)
    min_ax.set_ylabel('Min Speed on Week Day' + str(day))


The data is pretty consistent across the week, except that the min and max speed limits for single carriageways change
on different days of the week, and the same happens for one-way streets, but only on the even week days and on day 7.

### 4. What is the importance of Light and Weather conditions in predicting accident severity? What does your intuition say and what does the data portray?

In [None]:
# Number of accidents recorded under each light condition.
val = accidata['Light_Conditions'].value_counts()
# Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
sns.barplot(x=val.index, y=val.values)
plt.xticks(rotation = 'vertical')

In [None]:
# Number of accidents recorded under each weather condition.
val = accidata['Weather_Conditions'].value_counts()
# Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
sns.barplot(x=val.index, y=val.values)
plt.xticks(rotation = 'vertical')

In [None]:
# Accident counts for every (severity, light condition) pair.
Severity_Light = accidata.groupby('Accident_Severity')['Light_Conditions'].value_counts()
# Long-form table with columns Accident_Severity, Light_Conditions, count.
# The Series is renamed first so reset_index cannot collide with the
# 'Light_Conditions' index level on pandas versions that name it after the column.
light_counts = Severity_Light.rename('count').reset_index()

fig,axs = plt.subplots(2,1,figsize = (10,10))
# Top panel: everything except the last 10 (severity, light) rows, which
# zooms in on the rarer severities; bottom panel: all rows.
# x / y / hue are given by name: positional data arguments are rejected by seaborn >= 0.12.
sns.barplot(data=light_counts.iloc[:-10], x='Accident_Severity', y='count', hue='Light_Conditions', ax = axs[0])
sns.barplot(data=light_counts, x='Accident_Severity', y='count', hue='Light_Conditions', ax = axs[1])

plt.xticks(rotation = 90)

<p>From the first plot we can see that most of the accidents happened in daylight, followed by darkness with street
lights present and lit. I did not expect this, as it would seem to make sense for more accidents to happen at night, when
people can't see.</p>
<p>Seeing the second plot tells us that as accidents generally tend to be more towards a severity of 3 than compared to 2 and 1 and all of them follow the same trend as above.</p>


### 5. To predict the severity of the accidents which columns do you think are unnecessary and should be dropped before implementing a regression model. Support your statement using relevant plots and hypotheses derived from them.

In [None]:
# CorrPlotLargest opens its own figure, so no plt.figure() call is made here;
# one placed before it only leaves an extra empty figure in the output.
CorrPlotLargest(accidata,'Accident_Severity')

Unfortunately we don't see any strong correlation here. Correlation doesn't tell us about causality anyway, so let's look at the categorical data;
it might provide some more insight.

In [None]:
accidata_categorical = accidata[accidata.columns[accidata.dtypes == object]]
target = accidata['Accident_Severity']

# Only columns with a small number of distinct values give a readable bar
# chart; identifier-like columns (one level per row) would produce an
# unusable plot with as many bars as rows.
MAX_LEVELS = 20
plot_cols = [col for col in accidata_categorical.columns
             if accidata_categorical[col].nunique() <= MAX_LEVELS]

if plot_cols:
    # One panel per plotted column; squeeze=False keeps axs 2-D even for a single panel.
    fig,axs = plt.subplots(len(plot_cols),1,figsize = (10,5 * len(plot_cols)),squeeze=False)
    fig.tight_layout()
    plt.subplots_adjust(hspace=2)
    for ax, col in zip(axs.ravel(), plot_cols):
        # Counts per (severity, level), taken straight from accidata: both
        # columns already live in the same frame, so no merge is needed
        # (merging on a non-unique index would multiply rows).
        counts = (accidata.groupby('Accident_Severity')[col]
                  .value_counts()
                  .rename('count')
                  .reset_index())
        sns.barplot(data=counts, x=col, y='count', hue='Accident_Severity', ax = ax)
        ax.tick_params(axis='x', rotation=90)
    plt.show()
else:
    print("No low-cardinality categorical columns left to plot.")

In [None]:
# Date is converted to a numeric timestamp only while it still holds
# "dd/mm/YYYY" strings. The merge/clean cell above already performs this
# conversion, and parsing a number with strptime raises ValueError.
if accidata['Date'].dtype == object:
    accidata['Date'] = accidata['Date'].apply(lambda x: time.mktime(datetime.datetime.strptime(str(x),"%d/%m/%Y").timetuple()))

# Year: too much correlation with Date.
# Accident_Index / Time: very low correlation with the target.
# errors='ignore' tolerates columns that were already removed (Year is dropped
# in the merge/clean cell) and keeps this cell re-runnable.
accidata = accidata.drop(columns=['Year', 'Accident_Index', 'Time'], errors='ignore')

#Im using label encoder for all of the features
le = preprocessing.LabelEncoder()
cols = accidata.select_dtypes(include=['object']).columns
for col in cols:
    accidata[col] = le.fit_transform(accidata[col].astype(str))

accidata.info()

### 6. Implement a basic Logistic Regression Model using scikit learn with cross validation = 5, where you predict the severity of the accident (Accident_Severity). Note that here your goal is not to tune appropriate hyperparameters, but to figure out what features will be best to use.

In [None]:
# Three one-vs-rest versions of the data, one per severity class.
# Only the Accident_Severity column is rewritten. Assigning through a bare row
# mask (frame[mask] = 0) overwrites every column of the matching rows and
# destroys the features.
severity = accidata['Accident_Severity']

testdata = accidata.copy()
testdata['Accident_Severity'] = (severity == 3).astype(int)    # 1 for severity 3, else 0

testdata1 = accidata.copy()
testdata1['Accident_Severity'] = (severity == 2).astype(int)   # 1 for severity 2, else 0

testdata2 = accidata.copy()
testdata2['Accident_Severity'] = (severity == 1).astype(int)   # 1 for severity 1, else 0

# Class balance of the last two targets; the first is the cell's rendered output.
print(testdata1['Accident_Severity'].value_counts())
print(testdata2['Accident_Severity'].value_counts())
testdata['Accident_Severity'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Multiclass target and feature matrix (every column except the target).
y = accidata['Accident_Severity']
X = accidata.loc[:, accidata.columns != 'Accident_Severity']
# Fixed random_state so the 70/30 split, and therefore the reported score, is reproducible.
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.3,random_state=0)

# Standardise the training features to zero mean and unit variance.
X_scaled = preprocessing.scale(X_train)
# Size of the held-out set (the former `Y_test3` was an undefined name).
len(Y_test)

In [None]:
# LogisticRegressionCV is the estimator used below, so it must be imported;
# importing only LogisticRegression left the name undefined (NameError).
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Multinomial logistic regression with built-in 5-fold cross-validation, fitted on the scaled training features.
logreg1 = LogisticRegressionCV(cv=5, n_jobs = -1,random_state=0,multi_class = "multinomial").fit(X_scaled,Y_train)



In [None]:
# Scale the held-out features with the mean/std of the TRAINING set, i.e. the
# same transformation the model was fitted on. Scaling X_test with its own
# statistics puts it on a slightly different scale and leaks test information.
# StandardScaler fitted on X_train reproduces preprocessing.scale(X_train).
# The isinstance guard keeps the cell re-runnable: after the first run X_test
# is a NumPy array and is left as is.
if isinstance(X_test, pd.DataFrame):
    X_test = preprocessing.StandardScaler().fit(X_train).transform(X_test)
X_test

In [None]:
# Mean accuracy of the fitted model on the held-out test set.
logreg1.score(X_test,Y_test)