# Breaking down EDA for a set of 4 datasets

In [None]:
import numpy as np
import pandas as pd

from IPython.display import display, display_html , HTML

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import learning_curve, cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the four raw 2019 tables. Each raw frame is tagged with a `name`
# attribute so that the summary loop at the end of the cell can label it.
characteristics = pd.read_csv('../input/2019-database-of-road-traffic-injuries/caracteristiques-2019.csv')
characteristics.name = 'characteristics'
places = pd.read_csv('../input/2019-database-of-road-traffic-injuries/lieux-2019.csv')
places.name = 'places'
drivers = pd.read_csv('../input/2019-database-of-road-traffic-injuries/usagers-2019.csv')
drivers.name = 'drivers'
vehicles = pd.read_csv('../input/2019-database-of-road-traffic-injuries/vehicules-2019.csv')
vehicles.name = 'vehicles'

datasets = [characteristics,places,vehicles,drivers]

# set_index returns new frames, so `datasets` keeps pointing at the raw,
# un-indexed tables loaded above.
characteristics = characteristics.set_index('Num_Acc')
places = places.set_index('Num_Acc')
vehicles = vehicles.set_index('id_vehicule')
drivers = drivers.set_index('id_vehicule')

# Let displayed frames show as many rows/columns as the largest table has.
# The full option names are spelled out: pandas resolves abbreviated names by
# pattern matching, which can break if a similarly named option is added.
pd.set_option('display.max_rows',max(characteristics.shape[0],places.shape[0],drivers.shape[0],vehicles.shape[0]))
pd.set_option('display.max_columns',max(characteristics.shape[1],places.shape[1],drivers.shape[1],vehicles.shape[1]))

# Summarise the size of each raw table.
for df in datasets:
    print ("The dataset",df.name,"has",df.shape[0],"rows and",df.shape[1],"columns")

In [None]:
# Show the first rows of every table under an HTML heading.
for heading, table in (('characteristics', characteristics),
                       ('vehicles', vehicles),
                       ('drivers', drivers),
                       ('places', places)):
    display(HTML(f'<h1>{heading}</h1>'))
    display(table.head())

<h1><center><mark>Expand the following markdown to read the features description</mark></center></h1>

## Num_Acc
Identification number of the accident.

## jour
Day of the accident.

## mois
Month of the accident.

## an
Year of accident.

## hrmn
Hour and minutes of the accident.
This one is tricky: it corresponds to a percentage of 24 h (it is up to you to convert it into a time of day).

## lum
	Light: lighting conditions in which the accident occurred:
1. Full day
2. Twilight or dawn
3. Night without public lighting
4. Night with public lighting not on
5. Night with public lighting on

## dep
Department: Code INSEE (National Institute of Statistics and Economic Studies) of department (2A Corse-du-Sud. 2B Haute-Corse).

## com
Municipality: The municipality number is a code given by INSEE. The code is made up of the code INSEE of the department followed by 3 digits.

## agg
	Location :
1. Outside agglomeration
2. In built-up areas

## int
	Intersection:
1. Excluding intersection
2. Intersection in X
3. T-intersection
4. Y intersection
5. Intersection with more than 4 branches
6. Roundabout
7. Place
8. Level crossing
9. Other intersection 

## atm
	Atmospheric conditions:
-1. Not specified
1. Normal
2. Light rain
3. Heavy rain
4. Snow. hail
5. Fog. smoke
6. Strong wind. storm
7. Dazzling weather
8. Cloudy weather
9. Other

## col
	Collision type:
-1. Not specified
1. Two vehicles. frontal
2. Two vehicles. from the rear
3. Two vehicles. from the side
4. Three vehicles and more. in a chain
5. Three or more vehicles. multiple collisions
6. Other collision
7. No collision

## adr
Postal address: variable entered for accidents occurring in built-up areas.

## lat
Latitude

## Long
Longitude 

## catr
	Road category:
1. Highway
2. National road
3. Departmental road
4. Communal roads
5. Outside the public network
6. Parking lot open to public traffic
7. Urban metropolis roads
9. other

## voie
Route number.

## V1
Numerical index of the road number (example: 2 bis, 3 ter etc.). 

## V2
Alphanumeric road index letter.

## circ
	Traffic regime:
-1. Not specified
1. One way
2. Bidirectional
3. A separate carriageway
4. With variable assignment channels

## nbv
Total number of traffic lanes.

## vosp
	Indicates the existence of a reserved lane, regardless of whether or 
	not the accident took place on this way.
-1. Not specified
0. Not applicable
1. Cycle path
2. Cycle lane
3. Reserved lane

## prof
	Longitudinal profile describes the gradient of the road at the location of the accident:
-1. Not specified
1. Flat
2. Slope
3. hilltop
4. Bottom of coast

## pr
Number of the associated PR (number of the upstream terminal). 
The value -1 means that the PR is not informed.

## pr1
Distance in meters from the PR (in relation to the upstream terminal). 
The value -1 means that the PR is not informed.

## plan
	Plan layout:
-1. Not specified
1. rectilinear part
2. In a curve to the left
3. In a curve to the right
4. In "S"

## lartpc
Width of the central reservation (TPC) if it exists (in m).

## larrout
Width of the roadway used for vehicle traffic, not including emergency stopping strips, 
the central reservation (TPC) and parking spaces (in m).

## surf
	Surface condition: 
-1. Not specified
1. Normal
2. Wet
3. Puddles
4. Flooded
5. Snowy
6. Mud
7. Icy
8. Fat. oil
9. Other

## infra
	Development. Infrastructure:
-1. Not specified
0. None
1. Underground. tunnel
2. Bridge. flyover
3. Exchanger or connection sling
4. Railroad
5. Crossroads
6. Pedestrian zone
7. Toll zone
8. Site
9. Others

## situ
	Situation of the accident:
-1. Not specified
0. None
1. On the road
2. On emergency lane
3. On the shoulder
4. On the sidewalk
5. On a cycle path
6. On other special track
8. Others

## vma
Maximum authorized speed at the scene and at the time of the accident. 

## vehicle_id
Unique identifier of the vehicle used for each user occupying this 
vehicle (including pedestrians who are attached to the vehicles 
which collided with them). Numerical code.

## Num_Veh
Identifier of the vehicle taken back for each of the users 
occupying this vehicle (including pedestrians who are attached to the 
vehicles which collided with them). Alphanumeric code. 

## senc
	Flow direction :
-1. Not specified
0. Unknown
1. PK or PR or increasing postal address number
2. PK or PR or decreasing postal address number
3. Lack of reference

## catv
	Vehicle category:
00. Not determinable
01. Bicycle
02. Moped &lt;50cm3
03. Cart (Quadricycle with bodywork motor) (formerly "cart or motor tricycle")
04. Reference not used since 2006 (registered scooter)
05. Reference unused since 2006 (motorcycle)
06. Reference unused since 2006 (sidecar)
07. VL only
08. Reference unused since 2006 (VL + caravan)
09. Reference not used since 2006 (light vehicles + trailer)
10. VU only 1.5T &lt;= PTAC &lt;= 3.5T with or without trailer (formerly VU only 1.5T &lt;= PTAC &lt;= 3.5T)
11. Reference not used since 2006 (VU (10) + caravan)
12. Reference not used since 2006 (VU (10) + trailer)
13. PL only 3.5T &lt; PTAC &lt;= 7.5T 
14. PL only > 7.5T
15. PL> 3,5T + trailer
16. Road tractor only
17. Road tractor + semi-trailer
18. Reference not used since 2006 (public transport)
19. Reference not used since 2006 (tram)
20. Special gear
21. Farm tractor
30. Scooter <50 cm3
31. Motorcycle> 50 cm3 and <= 125 cm3
32. Scooter> 50 cm3 and <= 125 cm3
33. Motorcycle> 125 cm3
34. Scooter> 125 cm3
35. Light quad <= 50 cm3 (Quadricycle without bodywork engine)
36. Heavy quad> 50 cm3 (Quadricycle without bodywork engine)
37. Bus
38. Coach
39. Train
40. Tram
41. 3WD <= 50 cm3
42. 3WD> 50 cm3 <= 125 cm3
43. 3WD> 125 cm3
50. EDP with motor
60. EDP without motor
80. VAE
99. Other vehicle 

## obs
	Fixed obstacle struck:
-1. Not specified
0. Not applicable
1. Parked vehicle
2. Tree
3. Metal slide
4. Concrete slide
5. Other slide
6. Building, wall, bridge pier
7. Vertical signage support or emergency call station
8. Post
9. Street furniture
10. Parapet
11. Island, refuge, upper terminal
12. Sidewalk edge
13. Ditch, embankment, rock face
14. Other fixed obstacle on the road
15. Other fixed obstacle on sidewalk or shoulder
16. Clearance of the roadway without obstacle
17. Nozzle. aqueduct head

## obsm
	Movable obstacle struck:
-1. Not specified
0. None
1. Pedestrian
2. Vehicle
4. Rail vehicle
5. Domestic animal
6. Wild animal
9. Other 

## choc
Initial shock point:
-1. Not specified
0. None
1. Front
2. Right front
3. Front left
4. Rear
5. Right back
6. Left rear
7. Right side
8. Left side
9. Multiple shocks (rolls) 

## manv
	Main maneuver before the accident:
-1. Not specified
0. Unknown
1. Without change of direction
2. Same direction, same row
3. Between 2 lines
4. In reverse
5. In the wrong way
6. Crossing the central reservation
7. In the bus lane, in the same direction
8. In the bus lane, in the opposite direction
9. By inserting
10. By making a U-turn on the road
	Changing lane
11. Left
12. Right
	Deported
13. Left
14. Right
	Turning
15. Left
16. Right
	Exceeding
17. Left
18. Right
	Various
19. Crossing the road
20. Parking maneuver
21. Avoidance maneuver
22. Door opening
23. Stopped (except parking)
24. Parked (with occupants)
25. Traveling on sidewalk
26. Other maneuvers 

## motor
	Vehicle engine type:
-1. Not specified
0. Unknown
1. Hydrocarbons
2. Electric hybrid
3. Electric
4. Hydrogen
5. Human
6. Other 

## occutc
Number of occupants in public transport. 

## id_vehicule
Unique identifier of the vehicle used for each user occupying this vehicle (including
pedestrians who are attached to the vehicles which collided with them). Numerical code. 

## Num_Veh
Identifier of the vehicle taken back for each of the users occupying this vehicle 
(including pedestrians who are attached to the vehicles which collided with 
them). Alphanumeric code. 

## place
Used to locate the space occupied in the vehicle by the user at the time of the accident
Check on this link for the pattern : https://ibb.co/NsTxbXP

## catu
	User category:
1. Driver
2. Passenger
3. Pedestrian 

## grav
	Severity of user injury, injured users are classified into three categories of
	victims plus unharmed:
1. Unharmed
2. Killed
3. Injured hospitalized
4. Slightly injured 

## sexe
	Driver gender:
1. Male
2. Female

## An_nais
Year of birth of the driver

## trajet
	Reason for travel at the time of the accident:
-1. Not specified
0. Not specified
1. Home. work
2. Home. school
3. Shopping. shopping
4. Professional use
5. Walk. leisure
9. Other  



Security equipment until 2018 was divided into 2 variables: existence and use.
From 2019, this concerns use with up to 3 possible devices for the same user
(especially for motorcyclists whose helmets and gloves are compulsory). 


## secu1
	The value entered indicates the presence and use of safety equipment:
-1. Not specified
0. No equipment
1. Belt
2. Helmet
3. Children's device
4. reflective vest
5. Airbag (2WD / 3WD)
6. Gloves (2WD / 3WD)
7. Gloves + Airbag (2WD / 3WD)
8. Not determinable
9. Other

## secu2
	The value entered indicates the presence and use of safety equipment:
-1. Not specified
0. No equipment
1. Belt
2. Helmet
3. Children's device
4. reflective vest
5. Airbag (2WD / 3WD)
6. Gloves (2WD / 3WD)
7. Gloves + Airbag (2WD / 3WD)
8. Not determinable
9. Other

## secu3
	The value entered indicates the presence and use of safety equipment:
-1. Not specified
0. No equipment
1. Belt
2. Helmet
3. Children's device
4. reflective vest
5. Airbag (2WD / 3WD)
6. Gloves (2WD / 3WD)
7. Gloves + Airbag (2WD / 3WD)
8. Not determinable
9. Other

## locp
	Pedestrian location:
-1. Not specified
0. Not applicable
	On the roadway:
1. More than 50 m from a pedestrian crossing
2. Less than 50 m from a pedestrian crossing
	On a pedestrian crossing:
3. Without traffic lights
4. With traffic lights
	Various:
5. On the sidewalk
6. On the shoulder
7. On a refuge or emergency lane
8. On a service road
9. Unknown 

## actp
	Pedestrian action:
-1. Not specified
	Moving
0. Not specified or not applicable
1. Direction of colliding vehicle
2. Opposite direction of the vehicle
	Various
3. Crossing
4. Masked
5. Playing. running
6. With animal
9. Other
A. Get on / off the vehicle
B. Unknown 

## etatp
	This variable is used to specify whether the injured pedestrian was alone or not:
-1. Not specified
1. Alone
2. Accompanied
3. In a group 

In [None]:
display(HTML('<h1><center>Missing values of the different tables (%)</center></h1>'))


def _missing_rate_table(frame):
    """Return a (features, missing_rate) table giving the NaN percentage of each column of `frame`."""
    rates = round(frame.isna().sum()/frame.shape[0]*100,2)
    stacked = np.array((frame.columns, rates), dtype=object)
    return pd.DataFrame(np.transpose(stacked), columns=['features','missing_rate'])


# One missing-rate table per dataset.
a = _missing_rate_table(characteristics)
b = _missing_rate_table(vehicles)
c = _missing_rate_table(places)
d = _missing_rate_table(drivers)

def highlight_greaterthan(x):
    """Return the background colour for one row of a missing-rate table.

    Parameters
    ----------
    x : row object (e.g. the pandas Series handed over by
        ``Styler.apply(..., axis=1)``) exposing a numeric ``missing_rate``.

    Returns
    -------
    list of str
        One identical CSS declaration per cell of the row. The colour depends
        on the missing rate: above 80, above 40, above 5, or anything else.
    """
    if x.missing_rate > 80:
        colour = '#FFCECE'
    elif x.missing_rate > 40:
        colour = '#FFE9CE'
    elif x.missing_rate > 5:
        colour = '#FFFECE'
    else:
        colour = '#CEFFFC'
    # Size the result from the row itself instead of assuming two columns.
    return ['background-color: ' + colour] * len(x)
    
def _caption_styles():
    """Build the CSS rule used for table captions (a fresh list on every call)."""
    return [{
        'selector': 'caption',
        'props': [
            ('color', '#585858'),
            ('font-size', '30px')
        ]
    }]


# Colour each row by its missing rate and style the caption of every table.
a = a.style.apply(highlight_greaterthan, axis=1).set_table_styles(_caption_styles())
b = b.style.apply(highlight_greaterthan, axis=1).set_table_styles(_caption_styles())
c = c.style.apply(highlight_greaterthan, axis=1).set_table_styles(_caption_styles())
d = d.style.apply(highlight_greaterthan, axis=1).set_table_styles(_caption_styles())

# Inline display lets the four tables sit side by side.
inline_attribute = "style='display:inline'"
a_styler = a.set_table_attributes(inline_attribute).set_caption('characteristics')
b_styler = b.set_table_attributes(inline_attribute).set_caption('vehicles')
c_styler = c.set_table_attributes(inline_attribute).set_caption('places')
d_styler = d.set_table_attributes(inline_attribute).set_caption('drivers')

# Non-breaking spaces act as a horizontal gap between consecutive tables.
space = "\xa0" * 50
display_html(space.join(styler._repr_html_()
                        for styler in (a_styler, b_styler, c_styler, d_styler)),
             raw=True)

for note in ('<h3><i>The values highlighted are the ones above a certain threshold of missing values</i></h3>',
             '<h3><i>We will get rid of those for the rest of the notebook</i></h3>'):
    display(HTML(note))

<h1><center>Visualizing datasets dtypes 1 by 1</center></h1>

In [None]:
def _dtype_table(frame):
    """Return a (features, dtype) table listing the dtype of each column of `frame`."""
    stacked = np.array((frame.columns, frame.dtypes), dtype=object)
    return pd.DataFrame(np.transpose(stacked), columns=['features','dtype'])


def _dtype_caption_styles():
    """Build the CSS rule used for table captions (a fresh list on every call)."""
    return [{
        'selector': 'caption',
        'props': [
            ('color', '#585858'),
            ('font-size', '30px')
        ]
    }]


# One styled dtype table per dataset.
characteristics_dtypes = _dtype_table(characteristics).style.set_table_styles(_dtype_caption_styles())
vehicles_dtypes = _dtype_table(vehicles).style.set_table_styles(_dtype_caption_styles())
places_dtypes = _dtype_table(places).style.set_table_styles(_dtype_caption_styles())
drivers_dtypes = _dtype_table(drivers).style.set_table_styles(_dtype_caption_styles())

# Inline display lets the four tables sit side by side.
inline_attribute = "style='display:inline'"
characteristics_dtypes_styler = characteristics_dtypes.set_table_attributes(inline_attribute).set_caption('characteristics')
vehicles_dtypes_styler = vehicles_dtypes.set_table_attributes(inline_attribute).set_caption('vehicles')
places_dtypes_styler = places_dtypes.set_table_attributes(inline_attribute).set_caption('places')
drivers_dtypes_styler = drivers_dtypes.set_table_attributes(inline_attribute).set_caption('drivers')

# Non-breaking spaces act as a horizontal gap between consecutive tables.
space = "\xa0" * 50
display_html(space.join(styler._repr_html_()
                        for styler in (characteristics_dtypes_styler,
                                       vehicles_dtypes_styler,
                                       places_dtypes_styler,
                                       drivers_dtypes_styler)),
             raw=True)

<h1><center>characteristics</center></h1>

In [None]:
# List the distinct values of every text (object-dtype) column of characteristics.
for col in characteristics.select_dtypes("object"):
    print('\n')
    # f-string so the count prints as a plain number; bare braces around the
    # expression would build a one-element set and print e.g. {5}.
    print(f'Number of values in "{col}": {characteristics[col].nunique()}')
    print(characteristics[col].unique())
    print('\n')
    print('------------------------------------------------')

In [None]:
# Horizontal bar chart of the number of rows recorded for each department.
sns.set(font_scale = 1.5)
plt.figure(figsize=(10, 30))
department_axes = sns.countplot(y=characteristics['dep'])
department_axes.set_title('Number of accidents in 2019 per Department')
department_axes.set_xlabel("Number of accidents")
department_axes.set_ylabel("Department")
plt.show()

In [None]:
# Distribution of every numeric column of characteristics on a 3x4 grid,
# filled row by row.
sns.set(font_scale = 1.5)
fig, ax = plt.subplots(3,4, figsize=(30, 15))
for slot, col in enumerate(characteristics.select_dtypes(include=['float64','int64'])):
    grid_row, grid_col = divmod(slot, 4)
    sns.distplot(characteristics[col],label=col,ax=ax[grid_row][grid_col])
fig.show()

<h2>Comments</h2>
<ul>
    <li>We can notice on the first graph (accidents by department) that the number 75 skyrockets compared to the others</li>
    <li>We can notice on the second graph that the feature "an" that corresponds to "year" has no variance (in fact the whole dataset is based only on the year 2019 so thats pretty much obvious). We will get rid of this feature in the future.</li>
    <li>The two features "lat" and "long" corresponding to the Latitude and the Longitude of the accident are not scaled. There is a factor of 1e7 (to be changed)</li>
</ul>

<h1><center>vehicles</center></h1>

In [None]:
# List the distinct values of every text (object-dtype) column of vehicles.
for col in vehicles.select_dtypes("object"):
    print('\n')
    # f-string so the count prints as a plain number; bare braces around the
    # expression would build a one-element set and print e.g. {5}.
    print(f'Number of values in "{col}": {vehicles[col].nunique()}')
    print(vehicles[col].unique())
    print('\n')
    print('------------------------------------------------')

In [None]:
# Horizontal bar chart of the number of rows per value of `num_veh`.
sns.set(font_scale = 1.5)
plt.figure(figsize=(10, 15))
num_veh_axes = sns.countplot(y=vehicles['num_veh'])
num_veh_axes.set_title('Number of accidents in 2019 per number of occupants (by category)')
num_veh_axes.set_xlabel("Number of accidents")
num_veh_axes.set_ylabel("Category")
plt.show()

In [None]:
# Distribution of every numeric column of vehicles on a 3x3 grid,
# filled row by row.
sns.set(font_scale = 1.5)
fig, ax = plt.subplots(3,3, figsize=(30, 10))
for slot, col in enumerate(vehicles.select_dtypes(include=['float64','int64'])):
    grid_row, grid_col = divmod(slot, 3)
    sns.distplot(vehicles[col],label=col,ax=ax[grid_row][grid_col])
fig.show()

<h2>Comments</h2>
<ul>
    <li>We can notice that the feature "obs" has a low variance</li>
</ul>

<h1><center>places</center></h1>

In [None]:
# List the distinct values of every text (object-dtype) column of places.
for col in places.select_dtypes("object"):
    print('\n')
    # f-string so the count prints as a plain number; bare braces around the
    # expression would build a one-element set and print e.g. {5}.
    print(f'Number of values in "{col}": {places[col].nunique()}')
    print(places[col].unique())
    print('\n')
    print('------------------------------------------------')

In [None]:
# Distribution of every numeric column of places on a 3x5 grid,
# filled row by row.
sns.set(font_scale = 1.5)
fig, ax = plt.subplots(3,5, figsize=(30, 15))
for slot, col in enumerate(places.select_dtypes(include=['float64','int64'])):
    grid_row, grid_col = divmod(slot, 5)
    sns.distplot(places[col],label=col,ax=ax[grid_row][grid_col])
fig.show()

<h2>Comments</h2>
<ul>
    <li>We can notice that the feature "V1","vosp","pr" have a low variance</li>
</ul>

<h1><center>drivers</center></h1>

In [None]:
# List the distinct values of every text (object-dtype) column of drivers.
for col in drivers.select_dtypes("object"):
    print('\n')
    # f-string so the count prints as a plain number; bare braces around the
    # expression would build a one-element set and print e.g. {5}.
    print(f'Number of values in "{col}": {drivers[col].nunique()}')
    print(drivers[col].unique())
    print('\n')
    print('------------------------------------------------')

In [None]:
# Horizontal bar chart of the number of rows per pedestrian-action code (`actp`).
sns.set(font_scale = 1.5)
plt.figure(figsize=(10, 5))
actp_axes = sns.countplot(y=drivers['actp'])
actp_axes.set_title('Number of accidents in 2019 sorted by pedestrian accions')
actp_axes.set_xlabel("Number of accidents")
actp_axes.set_ylabel("Category")
plt.show()

In [None]:
# Distribution of every numeric column of drivers on a 3x4 grid,
# filled row by row.
sns.set(font_scale = 1.5)
fig, ax = plt.subplots(3,4, figsize=(30, 15))
for slot, col in enumerate(drivers.select_dtypes(include=['float64','int64'])):
    grid_row, grid_col = divmod(slot, 4)
    sns.distplot(drivers[col],label=col,ax=ax[grid_row][grid_col])
fig.show()

<h2>Comments</h2>
<ul>
    <li>We can notice that the feature "secu3" has a low variance</li>
</ul>

# Summary
    
## characteristics
- "dep" 75 skyrockets compared to the others
- "an" that corresponds to "year" has no variance (in fact the whole dataset is based only on the year 2019 so thats pretty much obvious). We will get rid of this feature in the future.
- The two features "lat" and "long" corresponding to the Latitude and the Longitude of the accident are not scaled. There is a factor of 1e7 (to be changed)

## vehicles
- "obs" has a low variance

## places
- "V1","vosp","pr" have a low variance

## drivers
- "secu3" has a low variance

# Cleaning data

In [None]:
# Reload the four raw tables from disk and tag each raw frame with a name.
characteristics = pd.read_csv('../input/2019-database-of-road-traffic-injuries/caracteristiques-2019.csv')
characteristics.name = 'characteristics'
places = pd.read_csv('../input/2019-database-of-road-traffic-injuries/lieux-2019.csv')
places.name = 'places'
drivers = pd.read_csv('../input/2019-database-of-road-traffic-injuries/usagers-2019.csv')
drivers.name = 'drivers'
vehicles = pd.read_csv('../input/2019-database-of-road-traffic-injuries/vehicules-2019.csv')
vehicles.name = 'vehicles'

# Raw (un-indexed, un-cleaned) frames, kept for reference.
datasets = [characteristics,places,vehicles,drivers]

# Indexing the tables
characteristics = characteristics.set_index('Num_Acc')
places = places.set_index('Num_Acc')
vehicles = vehicles.set_index('id_vehicule')
drivers = drivers.set_index('id_vehicule')

# Dealing with features with too many NaNs 
vehicles = vehicles.drop('occutc',axis=1)
places = places.drop(['v2','lartpc','larrout'],axis=1)

# Dealing with features according to the EDA
characteristics = characteristics.drop(['an','adr','com'],axis=1)
# lat/long are stored multiplied by 1e7; bring them back to degrees.
characteristics['lat']=characteristics['lat']/10000000
characteristics['long']=characteristics['long']/10000000
characteristics = characteristics.drop('201900033874',axis=0)
#characteristics = characteristics[characteristics['dep']!='2B'] # comment / uncomment
#characteristics = characteristics[characteristics['dep']!='2A'] # comment / uncomment
#characteristics = characteristics[(characteristics['dep'].astype(float)<100)] # comment / uncomment
#places = places.loc[characteristics.index.values] # comment / uncomment
places = places.drop('201900033874',axis=0) # comment / uncomment
places = places.drop(['v1','vosp','pr','voie'],axis=1)
vehicles = vehicles.drop('obs',axis=1)
drivers = drivers.drop(['secu3'],axis=1)

# Full option names: pandas resolves abbreviated names by pattern matching,
# which can break if a similarly named option is added.
pd.set_option('display.max_rows',max(characteristics.shape[0],places.shape[0],drivers.shape[0],vehicles.shape[0]))
pd.set_option('display.max_columns',max(characteristics.shape[1],places.shape[1],drivers.shape[1],vehicles.shape[1]))

# Report the shapes of the cleaned tables. `datasets` still refers to the raw
# frames captured before indexing and dropping, so the cleaned frames are
# listed explicitly here.
for label, cleaned in (('characteristics', characteristics),
                       ('places', places),
                       ('vehicles', vehicles),
                       ('drivers', drivers)):
    print ("The dataset",label,"has",cleaned.shape[0],"rows and",cleaned.shape[1],"columns")

In [None]:
from sklearn.preprocessing import LabelEncoder
def encoding(df):
    """Replace every object-dtype column of `df` with integer category codes.

    Each value is compared by its text form (``str(value)``); the distinct
    texts are sorted and numbered from 0, which is the labelling a label
    encoder fitted on the column would produce. Missing values are turned
    into text as well (NaN becomes 'nan') and therefore get their own code.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with its object columns replaced by codes.
    """
    for c in df.select_dtypes("object"):
        # Convert to str rather than ASCII bytes ("|S"), which raises
        # UnicodeEncodeError as soon as a value contains a non-ASCII character.
        texts = np.array([str(value) for value in df[c]], dtype=object)
        # np.unique sorts the distinct texts; the inverse gives each row's code.
        _, codes = np.unique(texts, return_inverse=True)
        df[c] = codes
    return df

def imputation(df):
    """Fill missing numeric values with the column median, then drop incomplete rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new table in which NaNs of numeric columns are replaced by that
        column's median and any row still holding a NaN is removed.
    """
    # numeric_only=True keeps the median computation from failing when the
    # table still contains non-numeric columns.
    df = df.fillna(df.median(numeric_only=True))
    df = df.dropna()
    return df

def preprocessing(df):
    """Encode the object columns of `df`, then impute its missing values.

    Returns the table produced by `imputation` applied to the result of
    `encoding`.
    """
    return imputation(encoding(df))

In [None]:
# Encode and impute each cleaned table, in the same order as before.
characteristics, vehicles, places, drivers = map(
    preprocessing, (characteristics, vehicles, places, drivers))

In [None]:
# Show the first rows of every preprocessed table under an HTML heading.
for heading, table in (('characteristics', characteristics),
                       ('vehicles', vehicles),
                       ('drivers', drivers),
                       ('places', places)):
    display(HTML(f'<h1>{heading}</h1>'))
    display(table.head())

# Display accidents on a map
#### (Zoom out to see the French overseas territories)

In [None]:
# Coordinates and department of every accident.
lat = characteristics['lat']
lon = characteristics['long']
dep = characteristics['dep']

# Translate the numeric codes into readable labels for the map tooltips.
# Codes that are absent from a mapping become NaN.
catr = places['catr'].map({1 : 'Highway',
2 : 'National road',
3 : 'Departmental road',
4 : 'Communal roads',
5 : 'Outside the public network',
6 : 'Parking lot open to public traffic',
7 : 'Urban metropolis roads',
9 : 'other'})
lum = characteristics['lum'].map({1 : 'Full day',
2 : 'Twilight or dawn',
3 : 'Night without public lighting',
4 : 'Night with public lighting not on',
5 : 'Night with public lighting on'})
atm = characteristics['atm'].map({1 : 'Normal',
2 : 'Light rain',
3 : 'Heavy rain',
4 : 'Snow. hail',
5 : 'Fog. smoke',
6 : 'Strong wind. storm',
7 : 'Dazzling weather',
8 : 'Cloudy weather',
9 : 'Other'})
# Codes 6 and 7 are two distinct collision types and need one entry each;
# merged into a single label, code 7 would be left unmapped.
col = characteristics['col'].map({1 : 'Two vehicles. frontal',
2 : 'Two vehicles. from the rear',
3 : 'Two vehicles. from the side',
4 : 'Three vehicles and more. in a chain',
5 : 'Three or more vehicles. multiple collisions',
6 : 'Other collision',
7 : 'No collision'})
circ = places['circ'].map({1 : 'One way',
2 : 'Bidirectional',
3 : 'A separate carriageway',
4 : 'With variable assignment channels'})
prof = places['prof'].map({1 : 'Flat',
2 : 'Slope',
3 : 'hilltop',
4 : 'Bottom of coast'})
plan = places['plan'].map({1 : 'rectilinear part',
2 : 'In a curve to the left',
3 : 'In a curve to the right',
4 : 'In "S"'})
surf = places['surf'].map({1 : 'Normal',
2 : 'Wet',
3 : 'Puddles',
4 : 'Flooded',
5 : 'Snowy',
6 : 'Mud',
7 : 'Icy',
8 : 'Fat. oil',
9 : 'Other'})
vma = places['vma']

In [None]:
import plotly.express as px

# Fields shown in the tooltip of each point; the raw coordinates are hidden.
hover_fields = {'Light':lum,
                'Atmosphere':atm,
                'Collision':col,
                'Regime':circ,
                'Profile':prof,
                'Layout':plan,
                'Surface':surf,
                'Speed':vma,
                'long':False,
                'lat':False}

# One point per row of characteristics, titled with its road category.
fig = px.scatter_mapbox(characteristics,
                        lat="lat",
                        lon="long",
                        hover_name=catr,
                        hover_data=hover_fields,
                        zoom=4.9,
                        height=800,
                        width=800)
fig.data[0]['marker'].update(color='red', size=3) #green
fig.update_layout(mapbox_style="open-street-map",
                  margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

# Concatenating datasets

In [None]:
# Accident-level table: places aligned on the index of characteristics.
df_acc = pd.concat([characteristics, places.reindex(characteristics.index)], axis=1)
# User-level table: vehicles aligned on the index of drivers.
df_veh = pd.concat([drivers, vehicles.reindex(drivers.index)], axis=1)
# Limit displayed frames to 40 rows/columns. Full option names are used
# because pandas resolves abbreviated names by pattern matching, which can
# break if a similarly named option is added.
pd.set_option('display.max_rows',40)
pd.set_option('display.max_columns',40)
display(HTML('<h1>df_acc</h1>'))
display(df_acc.head())
display(HTML('<h1>df_veh</h1>'))
display(df_veh.head())

In [None]:
# Keep only the first occurrence of any column name that appears more than
# once in the concatenated tables.
df_acc = df_acc.loc[:,~df_acc.columns.duplicated()] # Get rid of duplicates
df_veh = df_veh.loc[:,~df_veh.columns.duplicated()]
# Discard the current index of df_veh and index it by accident number (as
# text) instead; the column is then redundant and is dropped.
df_veh.reset_index(drop=True, inplace=True)
df_veh.index = df_veh['Num_Acc'].astype('str')
df_veh = df_veh.drop(['Num_Acc'],axis=1)
# Attach the accident-level columns to every row of df_veh by accident number.
# NOTE(review): the reindex only finds matches if df_acc's index holds the
# same string labels — confirm the dtype of Num_Acc in df_acc.
df = pd.concat([df_acc.reindex(df_veh.index),df_veh],axis=1)
# Encode remaining text columns and impute/drop missing values on the merged table.
df = preprocessing(df)

In [None]:
# Limit displayed frames to 40 rows/columns. Full option names are used
# because pandas resolves abbreviated names by pattern matching, which can
# break if a similarly named option is added.
pd.set_option('display.max_rows',40)
pd.set_option('display.max_columns',40)
display(HTML('<h1>df_acc</h1>'))
display(df_acc.head())
display(HTML('<h1>df_veh</h1>'))
display(df_veh.head())
display(HTML('<h1>Fully concatenated dataset : df</h1>'))
display(df.head())
print('\n')
print("----------------------------------")
display(HTML('<h3>Complete dataset shape :</h3>'))
display(df.shape)
print("----------------------------------")

# Target visualization

In [None]:
# Bar chart of the number of rows per injury severity (`grav`).
plt.figure(figsize=(15,6))
# The values are passed through the `x` keyword, as in the resampling plots,
# so that seaborn treats them as the variable to count.
sns.countplot(x=df['grav'].map({1:'Unharmed',
                                  2:'Killed',
                                  3:'Injured hospitalized',
                                  4:'Slightly injured'
                                }))
# plt.plot() with no arguments draws nothing; show the figure instead.
plt.show()

# Resampling (if needed)

A widely adopted technique for dealing with highly unbalanced datasets is called resampling. It consists of removing samples from the majority class (under-sampling) and / or adding more examples from the minority class (over-sampling).

![](https://raw.githubusercontent.com/rafjaa/machine_learning_fecib/master/src/static/img/resampling.png)

Despite the advantage of balancing classes, these techniques also have their weaknesses (there is no free lunch). The simplest implementation of over-sampling is to duplicate random records from the minority class, which can cause overfitting. In under-sampling, the simplest technique involves removing random records from the majority class, which can cause loss of information.

Let's implement a basic example, which uses the <code>DataFrame.sample</code> method to get random samples from each class.

In [None]:
# Class count
count_class_4, count_class_1, count_class_3, count_class_2 = df['grav'].value_counts()

# Divide by class
df_class_1 = df[df['grav'] == 1]
df_class_2 = df[df['grav'] == 2]
df_class_3 = df[df['grav'] == 3]
df_class_4 = df[df['grav'] == 4]

df_class_1_under = df_class_1.sample(count_class_2,random_state=42)
df_class_4_under = df_class_4.sample(count_class_2,random_state=42)
df_class_3_under = df_class_3.sample(count_class_2,random_state=42)
df_under = pd.concat([df_class_1_under, df_class_2, df_class_3_under, df_class_4_under], axis=0)

df_class_2_over = df_class_2.sample(count_class_1, replace=True, random_state=42)
df_class_3_over = df_class_3.sample(count_class_1, replace=True, random_state=42)
df_class_4_over = df_class_4.sample(count_class_1, replace=True, random_state=42)
df_over = pd.concat([df_class_1, df_class_2_over, df_class_3_over, df_class_4_over], axis=0)

fig,axes = plt.subplots(1,2,figsize=(20,6),sharey=True)
sns.countplot(ax=axes[0],x=df_under['grav'].map({1:'Unharmed',
                                  2:'Killed',
                                  3:'Injured hospitalized',
                                  4:'Slightly injured'
                                }))
axes[0].set_title('Random Downsampling')
sns.countplot(ax=axes[1],x=df_over['grav'].map({1:'Unharmed',
                                  2:'Killed',
                                  3:'Injured hospitalized',
                                  4:'Slightly injured'
                                }))
axes[1].set_title('Random Oversampling')
plt.plot()

# Modelling

In [None]:
# Hold out 15% of the oversampled table as a test set (fixed seed).
# NOTE(review): df_over was built by sampling rows with replacement, so copies
# of the same row can land on both sides of this random split, which would
# inflate test scores — consider splitting before oversampling.
trainset, testset = train_test_split(df_over, test_size=0.15, random_state=42)
# Plot the class counts of both splits side by side.
fig, ax = plt.subplots(1,2, figsize=(18, 5))
sns.countplot(x = 'grav' , data = trainset,ax=ax[0],palette="Accent").set_title('TrainSet')
sns.countplot(x = 'grav' , data = testset,ax=ax[1],palette="Accent").set_title('TestSet')

In [None]:
# Separate the target column from the features in both splits.
target = 'grav'
X_train, y_train = trainset.drop(target,axis=1), trainset[target]
X_test, y_test = testset.drop(target,axis=1), testset[target]

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [None]:
from sklearn.base import clone

# Preprocessing recipe used by every pipeline below: standardise all features.
preprocessor = make_pipeline(StandardScaler())

# Each pipeline wraps its own *clone* of the preprocessor. Passing the same
# `preprocessor` object to all of them makes them share one scaler instance,
# so fitting any single pipeline would silently refit the scaler used by all
# the others. clone() keeps the nested step layout (and therefore the
# parameter names such as 'randomforestclassifier__n_estimators') unchanged.

# Unsupervised projection onto 3 principal components (used for visualisation).
PCAPipeline = make_pipeline(clone(preprocessor), PCA(n_components=3,random_state=0))

# Candidate classifiers, all seeded so repeated runs give the same models.
RandomPipeline = make_pipeline(clone(preprocessor),RandomForestClassifier(random_state=0))
AdaPipeline = make_pipeline(clone(preprocessor),AdaBoostClassifier(random_state=0))
# probability=True enables predict_proba on the SVC.
SVMPipeline = make_pipeline(clone(preprocessor),SVC(random_state=0,probability=True))
KNNPipeline = make_pipeline(clone(preprocessor),KNeighborsClassifier())
# 'sag' is a stochastic solver; seed it like the other estimators for reproducibility.
LRPipeline = make_pipeline(clone(preprocessor),LogisticRegression(solver='sag',random_state=0))

# PCA Analysis

In [None]:
# Fit the PCA pipeline on every feature column of `df` and keep the projected coordinates.
# NOTE(review): `df` is whatever frame earlier cells left bound to that name - confirm it is the modelling frame.
components = PCAPipeline.fit_transform(df.drop(columns='grav'))

# Severity as readable labels, re-indexed positionally so it lines up with the component rows.
severity = df['grav'].map({
    1: 'Unharmed',
    2: 'Killed',
    3: 'Injured hospitalized',
    4: 'Slightly injured',
}).reset_index(drop=True)

# Resulting columns: 0, 1, 2 (principal components) and 'grav' (label).
PCA_df = pd.concat([pd.DataFrame(components), severity], axis=1)
PCA_df.head()

In [None]:
# 3D scatter of the three principal components, coloured by severity label.
figure1 = px.scatter_3d(PCA_df, x=0, y=1, z=2, color='grav', width=600, height=800)

# Small markers with a thin black outline, applied to marker-mode traces only.
marker_style = {'size': 5, 'line': {'width': 0.15, 'color': 'black'}}
figure1.update_traces(marker=marker_style, selector={'mode': 'markers'})

figure1.show()

# Training models
## Models overview

In [None]:
# Candidate pipelines to compare, keyed by the display name printed during evaluation.
# SVMPipeline is deliberately not part of the comparison (its entry was disabled).
dict_of_models = dict(
    KNN=KNNPipeline,
    RandomForest=RandomPipeline,
    AdaBoost=AdaPipeline,
    LR=LRPipeline,
)

In [None]:
def evaluation(model):
    """Fit *model* on the training split and print its test-set metrics.

    Reads the notebook-level X_train, y_train, X_test and y_test. Prints the
    accuracy, the confusion matrix and the classification report, each
    followed by a '-' separator line. Returns nothing.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print('Accuracy = ', accuracy_score(y_test, predictions))
    for report in (confusion_matrix, classification_report):
        print('-')
        print(report(y_test, predictions))
    print('-')

In [None]:
# Fit and score every candidate pipeline in turn; each report is preceded
# by a dashed banner line and the model's display name.
for name, model in dict_of_models.items():
    print('---------------------------------', name, sep='\n')
    evaluation(model)

# Using RandomForest

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# List every tunable parameter name of the random-forest pipeline (nested steps
# included); these names are the valid keys for a hyper-parameter search grid.
RandomPipeline.get_params(deep=True).keys()

In [None]:
# Search space for the random-forest step of RandomPipeline, keyed by
# '<step name>__<parameter>' as expected by scikit-learn's search classes.
hyper_params = {
    'randomforestclassifier__n_estimators': [10, 100, 150, 250, 400, 600],
    'randomforestclassifier__criterion': ['gini', 'entropy'],
    'randomforestclassifier__min_samples_split': [2, 6, 12],
    'randomforestclassifier__min_samples_leaf': [1, 4, 6, 10],
    # Only values the estimator accepts: 'srqt' was a typo for 'sqrt', and the
    # builtin types int/float are not usable values. 'auto' is left out because
    # it was an alias of 'sqrt' for classifiers and no longer exists in recent
    # scikit-learn releases. None means "consider every feature at each split".
    'randomforestclassifier__max_features': ['sqrt', 'log2', None],
    # verbose only controls logging, not the fitted model, so it is pinned
    # rather than sampled (sampling it wastes search iterations on duplicates).
    'randomforestclassifier__verbose': [0],
    'randomforestclassifier__class_weight': ['balanced', 'balanced_subsample'],
    # Use all available cores when growing the forest.
    'randomforestclassifier__n_jobs': [-1],
}

In [None]:
# Randomized search over hyper_params: 40 sampled combinations, scored by
# cross-validated accuracy. random_state pins which combinations are drawn,
# so the search is reproducible like the other seeded steps of the notebook.
RF_grid = RandomizedSearchCV(
    RandomPipeline,
    hyper_params,
    scoring='accuracy',
    n_iter=40,
    random_state=0,
)
RF_grid.fit(X_train, y_train)

In [None]:
# Show the hyper-parameter combination that achieved the best cross-validated score.
best_params = RF_grid.best_params_
print(best_params)

In [None]:
# RF_grid was built with the default refit=True, so best_estimator_ is already
# fitted on the whole of X_train / y_train with the winning parameters; fitting
# it a second time here would only repeat that (potentially expensive) work.
best_forest = RF_grid.best_estimator_

# Test-set predictions of the tuned pipeline.
y_pred = best_forest.predict(X_test)

# Learning curve: 4-fold cross-validated accuracy at 10 training-set sizes,
# from 10% to 100% of the training data. learning_curve clones the estimator,
# so best_forest itself stays fitted.
N, train_score, test_score = learning_curve(
    best_forest, X_train, y_train,
    cv=4, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1, 10),
)

In [None]:
# Test-set metrics of the tuned forest: accuracy, confusion matrix and
# per-class report, each followed by a '-' separator line.
print('Accuracy = ', accuracy_score(y_test, y_pred))
for report_fn in (confusion_matrix, classification_report):
    print('-')
    print(report_fn(y_test, y_pred))
print('-')

# Learning curve: fold-averaged accuracy against the number of training samples.
plt.figure(figsize=(5, 5))
for curve_scores, curve_label in ((train_score, 'train score'),
                                  (test_score, 'validation score')):
    plt.plot(N, curve_scores.mean(axis=1), label=curve_label)
plt.legend()
plt.title('Accuracy')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

# Binarize the target into one indicator column per class. The result goes into
# NEW names: overwriting y_train / y_test would leave the notebook in a state
# where neither this cell nor any earlier cell (which expect the 1-D label
# vectors) can be re-run.
roc_classes = [1, 2, 3, 4]
y_train_bin = label_binarize(y_train, classes=roc_classes)
y_test_bin = label_binarize(y_test, classes=roc_classes)
n_classes = y_train_bin.shape[1]

# Learn to predict each class against the others. Class scores come from
# predict_proba because a random forest has no decision_function.
classifier = OneVsRestClassifier(best_forest)
y_score = classifier.fit(X_train, y_train_bin).predict_proba(X_test)

# ROC curve and area under it for each class (keyed by column position).
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: pool every (sample, class) decision into one binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Plot the micro-average curve and one curve per class.
plt.figure(figsize=(15,10))
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]))
for i, class_label in enumerate(roc_classes):
    # Label with the real class value (1-4), not the 0-based column index,
    # so the legend matches the 'grav' codes used everywhere else.
    plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                   ''.format(class_label, roc_auc[i]))

# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()