In [1]:
import pandas as pd
from datetime import datetime

# Chargement des dataset

## YouGov - Wearing Mask in public


In [2]:
start = datetime.now()


##Chargement dataset
df = pd.read_csv(
    "./data/wearing_face_mask_public.csv",
    sep=";"
)


## Transformation du dataset = 1 ligne par date/pays
# On ne garde qu'une seule valeur par jour : le max pour chaque pays
format = '%Y-%m-%d %H:%M:%S'
df['DateTime'] = pd.to_datetime(df['DateTime'], format=format)
df['DateTime'] = df['DateTime'].dt.normalize()

df = df.sort_values('DateTime').groupby(df['DateTime']).max()
df = df.set_index(pd.DatetimeIndex(df['DateTime'])).drop(['DateTime'], axis=1)
wearing_mask_in_public_data = df.resample('1D').pad()
wearing_mask_in_public_data = wearing_mask_in_public_data.fillna(0)
wearing_mask_in_public_data = wearing_mask_in_public_data.reset_index().melt(
                                id_vars=['DateTime'], 
                                var_name='country', 
                                value_name='percent_wearing_mask')


print(f"Le dataset contient {len(df)} enregistrements")

print("Sample dataset final:")
print(wearing_mask_in_public_data.sample(5))

stop = datetime.now()

print("Temps de chargement et tranformation petit dataset : ", (stop-start).microseconds/1000, "ms")

Le dataset contient 192 enregistrements
Sample dataset final:
       DateTime    country  percent_wearing_mask
2492 2020-10-30      India                   0.0
2054 2020-05-25  Hong Kong                  87.0
3551 2020-08-30   Malaysia                   0.0
3632 2020-11-19   Malaysia                   0.0
2474 2020-10-12      India                   0.0
Temps de chargement et tranformation petit dataset :  22.862 ms


## Google - Covid 19 Open Data

In [3]:
start = datetime.now()

#Chargement dataset
covid19_opendata = pd.read_csv(
    "./data/latest.csv",
    keep_default_na=False,
    na_values=[""])


# Jointure entre open data covid 19 et yougo
covid19_opendata['date'] = pd.to_datetime(covid19_opendata['date'], format=format)
covid19_merge1 = covid19_opendata.merge(wearing_mask_in_public_data, 
                                      left_on = ['country_name','date'],
                                      right_on = ['country','DateTime'], how = 'left')


remove_cols = ['key', 'country','aggregation_level','locality_code', 'wikidata', 'datacommons', 'country_code', 'subregion1_code', 'subregion1_name', 'subregion2_code', 'subregion2_name', 'locality_name', '3166-1-alpha-2', '3166-1-alpha-3', 'DateTime']

covid19_merge1 = covid19_merge1.drop(remove_cols, axis=1)
covid19_merge1 = covid19_merge1.fillna(0)

prepared_data =  covid19_merge1.copy()

## Encode Pays
from sklearn.preprocessing import LabelEncoder

encoded_countries = LabelEncoder().fit_transform(prepared_data.country_name)
prepared_data['country_name'] = encoded_countries

## Encode Date
dates = prepared_data.date.apply(lambda x: x.strftime('%Y%m%d'))
encoded_dates = LabelEncoder().fit_transform(dates)
prepared_data['date'] = encoded_dates

print(f"Le dataset contient {len(prepared_data)} enregistrements")

print("Sample dataset final:")
print(prepared_data.sample(5))

stop = datetime.now()


print("Temps de chargement et tranformation grand dataset : ", (stop-start).microseconds/1000, "ms")

Le dataset contient 21505 enregistrements
Sample dataset final:
       date  country_name  new_confirmed  new_deceased  new_recovered  \
16562     0           173            1.0           0.0            0.0   
5803      0            29            4.0           0.0            0.0   
21113     0           236           -9.0          -1.0            0.0   
13729     0           141            1.0           0.0            0.0   
2736      0            29            2.0           0.0            0.0   

       new_tested  total_confirmed  total_deceased  total_recovered  \
16562         0.0             22.0             0.0              0.0   
5803          0.0            184.0             4.0              0.0   
21113         5.0            473.0             5.0              0.0   
13729         0.0             62.0             9.0              0.0   
2736          0.0             70.0             3.0              0.0   

       total_tested  ...  noaa_station  noaa_distance  average_tempera

## Entrainement et inférence

In [4]:
start = datetime.now()

# Split Train/Test
from sklearn.model_selection import train_test_split
X = prepared_data.loc[:, prepared_data.columns != 'new_confirmed']
y = prepared_data['new_confirmed'].ravel()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Scale des valeurs
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Entraintement MLP
from sklearn.neural_network import MLPRegressor

regr = MLPRegressor(max_iter=10, hidden_layer_sizes=(100, 50, 25, 10, 5), verbose=True).fit(X_train, y_train)

# Prédiction et Score
score = regr.score(X_test, y_test)

stop = datetime.now()

print("Temps préparation et inférence (ML) : ", (stop-start).microseconds/1000, "ms")
print(f"model score: {score}")

Iteration 1, loss = 1913512.05099825
Iteration 2, loss = 1912826.56734122
Iteration 3, loss = 1908411.19594036
Iteration 4, loss = 1882865.65023970
Iteration 5, loss = 1813530.45950809
Iteration 6, loss = 1718267.55117788
Iteration 7, loss = 1633556.30522023
Iteration 8, loss = 1598085.43872462
Iteration 9, loss = 1549119.57839646
Iteration 10, loss = 1493999.51880322
Temps préparation et inférence (ML) :  204.974 ms
model score: 0.4150852144079944


## Entrainement et inférence avec Pipeline

In [5]:
start = datetime.now()

# Split Train/Test
from sklearn.model_selection import train_test_split
X = prepared_data.loc[:, prepared_data.columns != 'new_confirmed']
y = prepared_data['new_confirmed'].ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

from sklearn.pipeline import Pipeline


# Scale des valeurs
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Entraintement MLP
from sklearn.neural_network import MLPRegressor

regr = MLPRegressor(max_iter=10, hidden_layer_sizes=(100, 50, 25, 10, 5), verbose=True)

pipeline = Pipeline([('scaler', scaler), ('regressor', regr)])

# Exécution du pipeline
pipeline.fit(X_train, y_train)

# Prédiction et Score
score = pipeline.score(X_test, y_test)

stop = datetime.now()

print("Temps préparation et inférence (ML) : ", (stop-start).microseconds/1000, "ms")
print(f"model score: {score}")

Iteration 1, loss = 1861531.72514722
Iteration 2, loss = 1846450.99528227
Iteration 3, loss = 1711786.04932882
Iteration 4, loss = 1632325.13654811
Iteration 5, loss = 1556339.93399228
Iteration 6, loss = 1475497.80209052
Iteration 7, loss = 1409453.21787756
Iteration 8, loss = 1350760.63649599
Iteration 9, loss = 1265274.17505883
Iteration 10, loss = 1151185.94036052
Temps préparation et inférence (ML) :  202.327 ms
model score: 0.6564073262602386
