# Dataset processing

In [None]:
# Install the third-party packages used by this notebook.
%pip install opendatasets pandas boto3 scikit-learn python-dotenv

In [None]:
import opendatasets as od
import pandas as pd
import os
import uuid
import logging
import sys
import boto3
from sklearn.model_selection import StratifiedShuffleSplit
from datetime import datetime, timezone
import joblib
import dotenv
from io import BytesIO

In [None]:
# Path of the .env file to load; can be overridden through the DOTENV_PATH
# environment variable.
DOTENV_PATH = os.environ.get('DOTENV_PATH', './../.env')

# A falsy result from load_dotenv() means nothing was loaded. Only warn in that
# case: every setting read from the environment afterwards has a default value.
if not dotenv.load_dotenv(dotenv_path=DOTENV_PATH):
    print(f'no environment have been loaded from .env path \"{DOTENV_PATH}\"')

In [None]:
# Kaggle dataset that this notebook downloads and processes.
DATASET_URL = 'https://www.kaggle.com/datasets/devansodariya/road-accident-united-kingdom-uk-dataset'
# Unique identifier of this processing run; also used to name the S3 object.
TRAITEMENT_ID = uuid.uuid4()
# Per-run download directory. The f-prefix is required so that the run id is
# interpolated; without it the path contains a literal "{TRAITEMENT_ID}" and
# every run shares the same directory.
DOWNLOAD_DIR = f"/var/tmp/pink-twins-{TRAITEMENT_ID}"
# Name of the logging level used when configuring the logger.
LOG_LEVEL = 'INFO'
# Destination of the optional CSV export; an empty string disables the export.
OUTPUT_CSV_PATH = ''
# Whether the processed dataframe is uploaded to S3.
PUSH_DUMP_TO_S3_ENABLED = True
# Version of this processing, stored in the metadata of the uploaded object.
TRAITEMENT_VERSION = '1.0.0'
# S3 destination and credentials, read from the environment (defaults apply
# when a variable is not set).
S3_BUCKET_NAME = os.environ.get('BUCKET_NAME', 'pink-twins-bucket')
S3_BUCKET_FOLDER = os.environ.get('S3_DATASETS_BUCKET_FOLDER', '')
S3_ACCESS_KEY_ID = os.environ.get('S3_ACCESS_KEY_ID', '')
S3_SECRET_ACCESS_KEY = os.environ.get('S3_SECRET_ACCESS_KEY', '')
# Author name stored in the metadata of the uploaded object.
AUTHOR = os.environ.get('AUTHOR', 'undefined')

In [None]:
# Set logger format: "LEVEL | timestamp | message" lines written to stdout.
import time

# The date format below ends with a literal "Z" (the UTC designator), so the
# timestamps have to be rendered in UTC; by default logging uses local time.
logging.Formatter.converter = time.gmtime

logging.basicConfig(
    format="%(levelname)s | %(asctime)s | %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%SZ",
    encoding='utf-8',
    # Translate the level name held in LOG_LEVEL into its logging level.
    level=logging.getLevelName(LOG_LEVEL),
    stream=sys.stdout,
)

In [None]:
# Download the dataset referenced by DATASET_URL into DOWNLOAD_DIR.
od.download(DATASET_URL, DOWNLOAD_DIR)

In [None]:
"""
Brief  : remove the columns that we won't use because they are irrelevant to our key question
var    : df : pandas dataframe to remove the columns from
return : the pandas dataframe without those columns
"""
def sorting_dataset(df):
    """Return ``df`` without the columns irrelevant to the key question.

    Args:
        df: pandas dataframe holding the accident records.

    Returns:
        A new dataframe without the columns listed below; ``df`` itself is
        not modified.

    Raises:
        KeyError: if one of the columns to remove is missing from ``df``.
    """
    unused_columns = [
        'Accident_Index',
        'Police_Force',
        'Local_Authority_(District)',
        'Local_Authority_(Highway)',
        '2nd_Road_Number',
        'Did_Police_Officer_Attend_Scene_of_Accident',
        'LSOA_of_Accident_Location',
    ]
    # One drop() call copies the frame once instead of once per column.
    return df.drop(columns=unused_columns)

"""
Brief  : change the qualitative values of a column into numerical ones
var    : column_name : name of the column we want to change
         dataframe : pandas dataframe on which we are working
return : dataframe : the updated pandas dataframe
         values_and_keys : dictionary mapping each original value of the column to the number that replaces it
"""
def qualititative_to_numerical(column_name, dataframe):
    """Encode a qualitative column of ``dataframe`` as integer codes, in place.

    Codes are assigned in order of first appearance, so the same input always
    produces the same mapping. Missing values (NaN/None) all share one extra
    code, placed after the codes of the real values.

    Args:
        column_name: name of the column to encode.
        dataframe: pandas dataframe to update.

    Returns:
        A ``(dataframe, values_and_keys)`` tuple: the same dataframe object,
        and a dict mapping each original value to its integer code (missing
        values, if any, are keyed by NaN).
    """
    # factorize() numbers the distinct values by first appearance and marks
    # missing values with the sentinel -1.
    codes, uniques = pd.factorize(dataframe[column_name])
    values_and_keys = {value: code for code, value in enumerate(uniques)}

    missing = codes == -1
    if missing.any():
        nan_code = len(uniques)
        codes = codes.copy()
        codes[missing] = nan_code
        values_and_keys[float('nan')] = nan_code

    # Assign through the dataframe itself: calling replace(inplace=True) on
    # dataframe[column_name] is chained assignment and may only update a
    # temporary copy, leaving the dataframe unchanged.
    dataframe[column_name] = codes
    return dataframe, values_and_keys

"""
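Brief  : retrieve all the columns which have qualitative values; along the way, add a 'Month' column extracted from 'Date' and reduce 'Time' to its hour
var    : dataframe : pandas dataframe on which we want to work
return : the list of all the columns we need to change ('Date' and 'Time' excluded)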
"""
def get_object_column(dataframe):
    """List the object-typed columns of ``dataframe`` that still need encoding.

    'Date' and 'Time' are handled here instead of being returned: when they
    are object-typed, a 'Month' column is added from 'Date' and 'Time' is
    overwritten with its leading field. ``dataframe`` is modified in place.

    Args:
        dataframe: pandas dataframe to inspect and update.

    Returns:
        The names of the object-typed columns, without 'Date' and 'Time'.
    """
    object_columns = list(dataframe.select_dtypes(include='object').columns)

    if "Date" in object_columns:
        # The second '/'-separated field of the date is stored as the month.
        date_fields = dataframe['Date'].str.split('/')
        dataframe['Month'] = date_fields.str[1]

    if "Time" in object_columns:
        # Only the part before the first ':' is kept, dropping the minutes.
        time_fields = dataframe['Time'].str.split(':')
        dataframe['Time'] = time_fields.str[0]

    return [name for name in object_columns if name not in ("Date", "Time")]

"""
Brief  : change the qualitative values (NaN included) into numbers
var    : dataframe : pandas dataframe on which we want to work
return : a dictionary with the column name as key and, as value, the map from the original values of that column to their numerical values
"""
def create_map_and_remove_nan(dataframe):
    """Encode every qualitative column of ``dataframe`` and collect the mappings.

    The columns to encode are those returned by ``get_object_column``; each
    one is passed to ``qualititative_to_numerical``.

    Args:
        dataframe: pandas dataframe on which we want to work.

    Returns:
        A dict with the column name as key and, as value, the mapping
        returned by ``qualititative_to_numerical`` for that column.
    """
    return {
        column: qualititative_to_numerical(column, dataframe)[1]
        for column in get_object_column(dataframe)
    }


In [None]:
def stratified(dataframe, column_target, test_size=0.33, random_state=None):
    """Split ``dataframe`` into stratified train and test sets.

    Args:
        dataframe: pandas dataframe holding the features and the target.
        column_target: name of the target column, used for stratification.
        test_size: value forwarded to ``StratifiedShuffleSplit`` as
            ``test_size`` (defaults to the previously hard-coded 0.33).
        random_state: value forwarded to ``StratifiedShuffleSplit`` as
            ``random_state``; pass an int to get the same split on every call.

    Returns:
        A ``(X_train, X_test, y_train, y_test)`` tuple.
    """
    y = dataframe[column_target]
    x = dataframe.drop(column_target, axis=1)
    splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state,
    )
    # With n_splits=1 only one (train, test) pair is requested: take it
    # directly instead of looping.
    train_index, test_index = next(splitter.split(x, y))
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    return X_train, X_test, y_train, y_test

In [None]:
# Read the downloaded CSV into a pandas dataframe, using its first column as index
df = pd.read_csv(
    f'{DOWNLOAD_DIR}/road-accident-united-kingdom-uk-dataset/UK_Accident.csv',
    index_col=0,
)

# Keep only the columns that matter for our key question
df = sorting_dataset(df)

# Add the Month column, reduce Time to the hour and encode the qualitative
# columns; map_dic keeps, for each encoded column, the value -> number mapping
map_dic = create_map_and_remove_nan(df)

# Remove the rows that still contain a NaN, cast the Time and Month object
# columns to int, then drop Date, which is useless once Month exists
df = (
    df.dropna(axis=0)
    .astype({'Time': int, 'Month': int})
    .drop('Date', axis=1)
)

In [None]:
# Upload the processed dataframe to S3 as a joblib dump named after the run id.
if PUSH_DUMP_TO_S3_ENABLED:
    # Join folder and file name while skipping an empty folder, so the default
    # S3_BUCKET_FOLDER ('') no longer yields a key starting with "/".
    key = '/'.join(
        part for part in (S3_BUCKET_FOLDER.strip('/'), f'{TRAITEMENT_ID}.joblib') if part
    )

    try:
        # Blank credentials are passed as None so that boto3 falls back to its
        # default credential lookup instead of signing with empty keys.
        s3 = boto3.client(
            's3',
            aws_access_key_id=S3_ACCESS_KEY_ID or None,
            aws_secret_access_key=S3_SECRET_ACCESS_KEY or None,
        )

        # Serialize in memory: the dump is sent as the request body.
        df_bytes = BytesIO()
        joblib.dump(df, df_bytes)
        s3.put_object(
            Bucket=S3_BUCKET_NAME,
            Key=key,
            Body=df_bytes.getvalue(),
            Metadata={
                'author': AUTHOR,
                'traitement-version': TRAITEMENT_VERSION,
                # Timezone-aware timestamp, expressed in the local time zone.
                'date': datetime.now(timezone.utc).astimezone().isoformat(),
            },
        )
    except Exception as err:
        # Best effort: report the failure and let the notebook carry on.
        logging.critical('failed to push dataset %s: %s', key, err)

# Optional local export: nothing is written while OUTPUT_CSV_PATH is empty.
if OUTPUT_CSV_PATH != "":
    try:
        df.to_csv(OUTPUT_CSV_PATH)
    except Exception as save_error:
        # Report the failure without interrupting the notebook.
        logging.critical(f'failed to save dataset as {OUTPUT_CSV_PATH}: {save_error}')
