# Generate Reasonable fake data for Senegal road accidents
The generated fake data follows the same feature model as the UK accident data:

- Long, Lat: Senegal 
- age_of_driver: number
- engine_capacity_cc: number
- vehicle_type: 1-13 (x-y) for categorical data
- day_of_week: 1-7
- weather_conditions: 1-7
- light_conditions: 1-4
- road_surface_conditions: 1-6
- sex_of_driver: 1-2
- speed_limit: number


# Read Senegal highway data 
The data was generated by converting the Senegal GIS road-network shapefile to CSV.

In [1]:
import os
import pandas as pd
# Load the Senegal highway points and show the table size plus the first rows.
highway_csv = os.path.join('datasets', 'senegal_highway.csv')
df = pd.read_csv(highway_csv)
print(df.shape)
df.head()

(7292, 6)


Unnamed: 0,X,Y,TYPE,NAME,ONEWAY,LANES
0,-17.528406,14.745116,unclassified,,,
1,-17.528145,14.744716,footway,,,
2,-17.516773,14.740623,secondary,route des Almadies,,
3,-17.527764,14.744895,unclassified,,,
4,-17.516326,14.738957,tertiary,,,


In [2]:
# Load the accident records used as the template for the fake data set,
# then show the table size and the first rows.
accidents_csv = os.path.join('datasets', 'Accidents1.csv')
accidents = pd.read_csv(accidents_csv)
print(accidents.shape)
accidents.head()

(57756, 62)


Unnamed: 0,accident_index,accident_year_x,accident_reference_x,location_easting_osgr,location_northing_osgr,longitude,latitude,police_force,accident_severity,number_of_vehicles,...,age_of_driver,age_band_of_driver,engine_capacity_cc,propulsion_code,age_of_vehicle,generic_make_model,driver_imd_decile,driver_home_area_type,lsoa_of_driver,date_time
0,2019521904284,2019,521904284,355372.0,171184.0,-2.643441,51.437882,52,3,2,...,39,7,1298,1,18,-1,7,1,-1,2019-12-06 14:00:00
1,2019521904294,2019,521904294,358292.0,172906.0,-2.601637,51.453588,52,3,2,...,40,7,1968,2,5,-1,1,1,-1,2019-06-22 00:30:00
2,2019521904297,2019,521904297,362297.0,174198.0,-2.544138,51.465486,52,3,2,...,51,8,1595,1,8,-1,2,1,-1,2019-02-07 17:25:00
3,2019521904297,2019,521904297,362297.0,174198.0,-2.544138,51.465486,52,3,2,...,32,6,1997,2,10,-1,6,1,-1,2019-02-07 17:25:00
4,2019521904311,2019,521904311,358925.0,174692.0,-2.592736,51.469693,52,3,2,...,36,7,1956,2,8,-1,5,1,-1,2019-03-25 16:53:00


In [3]:
# Replace the accident coordinates with points sampled from the Senegal road
# network. In the highway table X holds the longitude (about -17.5) and Y the
# latitude (about 14.7), so Y feeds 'latitude' and X feeds 'longitude'.
from random import randint

# Draw one highway row per accident, with replacement, over the whole table
# (randint is inclusive on both ends, hence len(df) - 1 as the upper bound).
last_row = len(df) - 1
picks = [randint(0, last_row) for _ in range(len(accidents))]

# Assign whole columns at once instead of writing cell by cell.
accidents['latitude'] = df['Y'].iloc[picks].to_numpy()
accidents['longitude'] = df['X'].iloc[picks].to_numpy()

#accidents.to_csv(os.path.join('datasets', 'FakeAccidents.csv'))


#  Generate Model for the fake data

In [4]:
# import the ML functions
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

def show_accuracy(model, X_test, y_test, y_pred):
  """Print the model accuracy and classification report; return the confusion table.

  Args:
    model: fitted estimator exposing ``score(X, y)``.
    X_test: features passed to ``model.score``.
    y_test: true labels.
    y_pred: predicted labels to compare against ``y_test``.

  Returns:
    The ``pd.crosstab`` of ``y_test`` against ``y_pred`` (with margins), so the
    caller can display or inspect it.
  """
  # model.score is scaled to a percentage and rounded to two decimals.
  acc = round(model.score(X_test, y_test) * 100, 2)
  report = classification_report(digits=6, y_true=y_test, y_pred=y_pred)
  print(f'Exactitude: {acc}')
  print(report)
  # A bare expression inside a function is discarded, so the table must be
  # returned for it to be visible to the caller.
  return pd.crosstab(y_test, y_pred, rownames=['Actuel'], colnames=['Prédit'], margins=True)

In [5]:
# Columns kept from the accident table for the learning step.
fields = [
  'latitude', 'longitude', 'age_of_driver', 'vehicle_type',
  'age_of_vehicle', 'engine_capacity_cc', 'day_of_week',
  'weather_conditions', 'road_surface_conditions', 'light_conditions',
  'sex_of_driver', 'speed_limit',
]

# Select the training columns.
new_accident = accidents[fields]

# Split the data: 20% is held out for testing, the target is accident_severity.
X_train, X_test, y_train, y_test = train_test_split(
  new_accident,
  accidents['accident_severity'],
  test_size=0.2,
  random_state=33,
)
for part in (X_train, y_train):
  print(part.shape)

(46204, 12)
(46204,)


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns go through StandardScaler; categorical codes are one-hot encoded.
num_attribs = ['speed_limit', 'engine_capacity_cc']
cat_attribs = [
  'vehicle_type', 'sex_of_driver', 'light_conditions',
  'weather_conditions', 'road_surface_conditions', 'day_of_week',
]

transformers = [
  ("num", StandardScaler(), num_attribs),
  ("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs),
]
pipelines = ColumnTransformer(transformers)

# Fit the transformer on the training split and report the prepared matrix size.
accident_prepared = pipelines.fit_transform(X_train)
print(accident_prepared.shape)

(46204, 48)


In [7]:
# Only transform the held-out data: the transformer was already fitted on
# X_train. Re-fitting here would overwrite that fitted state with test-set
# statistics and could produce a different set of one-hot columns.
X_test_prepared = pipelines.transform(X_test)

In [8]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid explored by the cross-validated search.
param_grid = dict(
  n_estimators=[5, 10, 15, 20],
  max_depth=[2, 5, 7, 9],
)
crf = RandomForestClassifier()

# Grid search with cv=10 on the prepared training features; keep the best estimator.
grid_crf = GridSearchCV(crf, param_grid, cv=10)
grid_crf.fit(accident_prepared, y_train)
best = grid_crf.best_estimator_

In [9]:
from sklearn.pipeline import Pipeline
import joblib
# Chain the column preparation with the selected estimator so that raw feature
# frames can be passed straight to predict().
# NOTE: the second step is named "linear" although `best` comes from the
# random-forest grid search; the name is kept because it is part of the saved object.
steps = [
  ("preparation", pipelines),
  ("linear", best),
]
full_pipeline_with_predictor = Pipeline(steps)

# Run a prediction on the raw test frame; the result is not kept.
full_pipeline_with_predictor.predict(X_test)

# Persist the complete pipeline to disk.
joblib.dump(full_pipeline_with_predictor, 'modelfake.pkl')

['modelfake.pkl']