# K-Means Model

In [None]:
%pip install numpy scikit-learn pandas boto3 matplotlib seaborn python-dotenv

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from io import BytesIO
from pathlib import Path
import pandas as pd
import seaborn as sn
import joblib
import dotenv
import boto3
import logging
import os
import uuid
import sys
import traitement.normalisation as norm
from datetime import datetime, timezone

In [None]:
# Path of the .env file to load; can be overridden through the DOTENV_PATH
# environment variable.
DOTENV_PATH = os.getenv('DOTENV_PATH', './../.env')

# Warn (without failing) when load_dotenv reports that nothing was loaded.
if not dotenv.load_dotenv(dotenv_path=DOTENV_PATH):
    print(f'no environment have been loaded from .env path \"{DOTENV_PATH}\"')

In [None]:
# --- Notebook configuration -------------------------------------------------
# Every setting except LOG_LEVEL is read from the environment, with a
# fallback default when the variable is not set.

# Logging verbosity (fixed, not read from the environment).
LOG_LEVEL = 'INFO'

# Dataset sources: a local CSV path and/or the S3 key of a dataset dump.
LOCAL_DATASET_PATH = os.getenv('LOCAL_DATASET_PATH', '')
IMPORTED_DATASET_S3_KEY = os.getenv('IMPORTED_DATASET_S3_KEY', '')

# S3 key of an existing K-means dump to load instead of training a new model.
IMPORTED_K_MEANS_S3_KEY = os.getenv('IMPORTED_K_MEANS_S3_KEY', '')

# Only the (case-insensitive) value "true" enables pushing the model to S3.
PUSH_MODEL_DUMP_TO_S3_ENABLED = (
    os.getenv('PUSH_MODEL_DUMP_TO_S3_ENABLED', 'true').lower() == 'true'
)

# Local scratch directory for model dump files.
TMP_DIR = os.getenv('TMP_DIR', '/tmp/pink-twins')

# S3 bucket, destination folder and credentials.
S3_BUCKET_NAME = os.getenv('BUCKET_NAME', 'pink-twins-bucket')
S3_BUCKET_FOLDER = os.getenv('S3_MODELS_BUCKET_FOLDER', '')
S3_ACCESS_KEY_ID = os.getenv('S3_ACCESS_KEY_ID', '')
S3_SECRET_ACCESS_KEY = os.getenv('S3_SECRET_ACCESS_KEY', '')

# Author recorded in the metadata of pushed models.
AUTHOR = os.getenv('AUTHOR', 'undefined')

In [None]:
# Make sure the temporary working directory exists, creating it (together
# with any missing parent directories) when it does not.
Path(TMP_DIR).mkdir(exist_ok=True, parents=True)

In [None]:
# Configure the root logger to write "<LEVEL> | <timestamp> | <message>"
# records to stdout.
import time

# The date format below ends with a literal "Z" (the UTC designator), but
# logging renders timestamps in local time by default. Switch the converter to
# UTC so the suffix is truthful.
logging.Formatter.converter = time.gmtime

logging.basicConfig(
    format="%(levelname)s | %(asctime)s | %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%SZ",
    # Only relevant to file handlers; has no effect together with `stream`.
    encoding='utf-8',
    # basicConfig accepts the level name directly, no lookup needed.
    level=LOG_LEVEL,
    stream=sys.stdout,
)

In [None]:
# Load the source dataset into `df`.
# A local CSV file (LOCAL_DATASET_PATH) takes precedence over a joblib dump
# stored on S3 (IMPORTED_DATASET_S3_KEY).
# Every failure is logged and then raised: the following cells cannot work
# without `df`, and continuing could silently reuse a stale `df` left over
# from a previous run of the notebook.
if LOCAL_DATASET_PATH:
    try:
        df = pd.read_csv(LOCAL_DATASET_PATH)
    except Exception as err:
        logging.fatal(f'failed to load dataset at path {LOCAL_DATASET_PATH}: {err}')
        raise
elif IMPORTED_DATASET_S3_KEY:
    try:
        # Create an S3 client
        s3 = boto3.client('s3', aws_access_key_id=S3_ACCESS_KEY_ID, aws_secret_access_key=S3_SECRET_ACCESS_KEY)

        # Download the dump file from S3 into memory
        response = s3.get_object(Bucket=S3_BUCKET_NAME, Key=IMPORTED_DATASET_S3_KEY)
        df_bytes = BytesIO(response['Body'].read())

        # Load the variable back from the dump data.
        # NOTE(security): joblib.load relies on pickle and can execute
        # arbitrary code; only load dumps coming from a trusted bucket.
        df = joblib.load(df_bytes)

    except Exception as err:
        logging.fatal(f'failed to load dataset {IMPORTED_DATASET_S3_KEY} from S3 bucket: {err}')
        raise
else:
    logging.fatal('no source dataset have been defined')
    raise RuntimeError('no source dataset have been defined')

In [None]:
# Normalize dataset
# TODO: apply normalization (nothing is applied to `df` yet)

# Columns that are useless for the K-means algorithm
columns_to_drop = [
    'Location_Easting_OSGR',
    'Location_Northing_OSGR',
    'Longitude',
    'Latitude',
    'Number_of_Vehicles',
    'Number_of_Casualties',
    '1st_Road_Class',
    '1st_Road_Number',
    'Junction_Control',
    '2nd_Road_Class',
    'Pedestrian_Crossing-Physical_Facilities',
    'Pedestrian_Crossing-Human_Control',
    'Special_Conditions_at_Site',
    'Carriageway_Hazards',
]

# Drop them in a single call instead of building one intermediate DataFrame
# copy per column. A missing column still raises KeyError.
df = df.drop(columns=columns_to_drop)

# Standardize the remaining columns so that each variable has a mean of 0 and
# a standard deviation of 1. fit_transform returns a NumPy array, not a
# DataFrame.
# NOTE(review): the visible model-fitting code is given `df`, not
# `scaled_df` - confirm whether the scaled data was meant to be used.
scaled_df = StandardScaler().fit_transform(df)


In [None]:
"""
Test K means algorithm for dataset (suspicions of unbalanced dataset)
"""

# NOTE: apart from the two string literals, this cell only contains disabled
# (commented-out) experiments kept for reference; none of it is executed.
# NOTE(review): `new_df` and `new_gr` are not defined anywhere in the visible
# code, so these snippets need adapting before being re-enabled.

# Test K-means algorithm with only one sample of the attribute Accident_Severity - works
# groupes = new_df.groupby('Accident_Severity')
# class1 = groupes.get_group(1)
# new_df = new_gr.apply(lambda x: x.sample(n=1))

# Test K-means algorithm with only two samples of the attribute Accident_Severity - does not work
# groupes = new_df.groupby('Accident_Severity')
# class1 = groupes.get_group(1)
# new_df = new_gr.apply(lambda x: x.sample(n=2))
#print("idk : ", new_df)


"""
Research for optimal number of cluster
"""

# Elbow method: fit one K-means model per k in 1..9 and plot the inertia of
# each model against k.
# NOTE(review): `X` is not defined anywhere in the visible code.
# kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(X)
#                 for k in range(1, 10)]
# inertias = [model.inertia_ for model in kmeans_per_k]

# plt.figure(figsize=(8, 3.5))
# plt.plot(range(1, 10), inertias, "bo-")
# plt.xlabel("$k$", fontsize=14)
# plt.ylabel("Inertia", fontsize=14)
# plt.annotate('Elbow',
#              xy=(4, inertias[3]),
#              xytext=(0.55, 0.55),
#              textcoords='figure fraction',
#              fontsize=16,
#              arrowprops=dict(facecolor='black', shrink=0.1)
#             )
# plt.axis([1, 8.5, 0, 1300])
# plt.show()


In [None]:
# Obtain the K-means model in `model`: either download and deserialize an
# existing dump from S3 (IMPORTED_K_MEANS_S3_KEY), or fit a new model on `df`.
if IMPORTED_K_MEANS_S3_KEY:
    try:
        # Create an S3 client
        s3 = boto3.client('s3', aws_access_key_id=S3_ACCESS_KEY_ID, aws_secret_access_key=S3_SECRET_ACCESS_KEY)

        # Use the last path segment of the key as the local file name
        imported_model_id = IMPORTED_K_MEANS_S3_KEY.split('/')[-1]
        imported_model_file = f'{TMP_DIR}/{imported_model_id}'

        # Download the dump file from S3 into the temporary directory
        s3.download_file(
            Bucket=S3_BUCKET_NAME,
            Key=IMPORTED_K_MEANS_S3_KEY,
            Filename=imported_model_file,
        )

        # Load the model back from the dump file.
        # NOTE(security): joblib.load relies on pickle and can execute
        # arbitrary code; only load dumps coming from a trusted bucket.
        model = joblib.load(imported_model_file)

    except Exception as err:
        logging.fatal(f'failed to load model {IMPORTED_K_MEANS_S3_KEY} from S3 bucket: {err}')
        # Without a model the following cells cannot run (or would silently
        # reuse a stale `model` from a previous run), so stop here.
        raise
else:
    # Instantiate the k-means class with 3 clusters and a fixed random seed
    model = KMeans(n_clusters=3, init="random", n_init=10, max_iter=10000, random_state=1)

    # Fit k-means algorithm to data
    # NOTE(review): the model is fitted on `df`, not on the standardized
    # `scaled_df` - confirm this is intended.
    model.fit(df)

In [None]:
# Attach the cluster label of each row, project the features onto two
# principal components and plot the projection twice: colored by cluster and
# colored by accident severity.

# NOTE(review): labels_ holds the labels of the data the model was fitted on.
# For a model imported from S3 this only lines up with `df` if that model was
# trained on this same dataset - confirm, or switch to model.predict(...).
df['Cluster'] = model.labels_

# Extract features excluding "Accident_Severity" and "Cluster". The PCA1/PCA2
# columns added further down only exist when this cell has already been run;
# they are dropped as well so that a re-run does not feed the previous
# projection back into PCA as features.
features_for_pca = (
    df.drop(['Accident_Severity', 'Cluster'], axis=1)
      .drop(columns=['PCA1', 'PCA2'], errors='ignore')
)

# Perform PCA (two components, one per plot axis)
pca = PCA(n_components=2)
pca_result = pca.fit_transform(features_for_pca)

# Add PCA results to the DataFrame
df['PCA1'] = pca_result[:, 0]
df['PCA2'] = pca_result[:, 1]

# Visualize the PCA results
# Create two subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Plot for Clusters
scatter_cluster = axes[0].scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis', alpha=0.5)
axes[0].set_title('K-Means Clusters')
axes[0].set_xlabel('Principal Component 1')
axes[0].set_ylabel('Principal Component 2')
fig.colorbar(scatter_cluster, ax=axes[0], label='Cluster')

# Plot for Accident_Severity
scatter_severity = axes[1].scatter(df['PCA1'], df['PCA2'], c=df['Accident_Severity'], cmap='viridis', alpha=0.5)
axes[1].set_title('Accident Severity')
axes[1].set_xlabel('Principal Component 1')
axes[1].set_ylabel('Principal Component 2')
fig.colorbar(scatter_severity, ax=axes[1], label='Severity')

plt.tight_layout()
plt.show()

In [None]:
# Push the model to S3. This is skipped when the model was imported from S3
# (IMPORTED_K_MEANS_S3_KEY is set) or when pushing is disabled.
if IMPORTED_K_MEANS_S3_KEY == '' and PUSH_MODEL_DUMP_TO_S3_ENABLED:
    # A single identifier shared by the S3 key and the local dump file, so the
    # two can be matched with each other.
    model_id = uuid.uuid4()

    # NOTE(review): with the default empty S3_BUCKET_FOLDER this key starts
    # with '/' - confirm that is the intended bucket layout.
    key = f'{S3_BUCKET_FOLDER}/k_means/{model_id}.joblib'
    tmp_file = f'{TMP_DIR}/{model_id}.joblib'

    try:
        # Serialize the model to a local file before uploading it
        joblib.dump(model, tmp_file)

        s3 = boto3.client('s3', aws_access_key_id=S3_ACCESS_KEY_ID, aws_secret_access_key=S3_SECRET_ACCESS_KEY)
        s3.upload_file(
            Bucket=S3_BUCKET_NAME,
            Key=key,
            Filename=tmp_file,
            ExtraArgs={
                # Metadata stored alongside the uploaded object
                'Metadata': {
                    'author': AUTHOR,
                    'date': datetime.now(timezone.utc).astimezone().isoformat(),
                    'training_dataset_key': IMPORTED_DATASET_S3_KEY,
                },
            },
        )

        logging.info(f'successfully pushed model as: {key}')
    except Exception as err:
        # A failed push is logged only; the model remains available in memory
        logging.fatal(f'failed to push model {key}: {err}')
