In [None]:
# --- Tarek Djaker notebook profile ---
# Adds a local helper library directory to the import path, then imports and
# calls its `nb_init` and `profile_banner` helpers.
# NOTE(review): the appended path is absolute and specific to one Windows
# machine; on any other environment the `tarek_profile` import below will
# fail. What `nb_init` / `profile_banner` do is not visible from this notebook.
import sys, os
sys.path.append(r'C:\Users\pigio\OneDrive\Documents\OneDrive\Desktop\projets\data_science_practice_2025\Tarek Djaker\lib')
from tarek_profile import nb_init, profile_banner
nb_init()
profile_banner(title=None)
# -------------------------------------

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import requests
import zipfile
import os
import io
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

# The training archive is only downloaded by a later cell, so on a fresh
# kernel / empty working directory it may not exist yet when this cell runs.
# Fetch it on demand so the notebook survives "Restart & Run All".
_train_zip_path = 'module4_exercise_train.zip'
_train_zip_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip'

if not os.path.exists(_train_zip_path):
    _zip_response = requests.get(_train_zip_url, timeout=60)
    _zip_response.raise_for_status()  # fail loudly instead of writing an error page to disk
    with open(_train_zip_path, 'wb') as _zip_file:
        _zip_file.write(_zip_response.content)

# Unpack every member of the archive into the current working directory
with zipfile.ZipFile(_train_zip_path, 'r') as zip_ref:
    zip_ref.extractall('.')

print("Fichiers extraits du zip : CityMart_data.csv, Greenfield_Grocers_data.csv, SuperSaver_Outlet_data.xlsx, HighStreet_Bazaar_data.json")

## Data Collection

### File sources

In [None]:
# URLs of the files
# `train_datas_url` points at the zipped training files and `test_data_url`
# at the test CSV; both are fetched by the download calls further down.
train_datas_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Parameters
    ----------
    url : str
        Address of the resource to fetch.
    file_name : str
        Local path the raw response bytes are written to (overwritten if present).
    timeout : float, default 30
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on an unresponsive host.

    Raises
    ------
    requests.HTTPError
        Propagated from ``raise_for_status`` when the server answers with an
        error status; nothing is written to disk in that case.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch the training archive first, then the test CSV
for source_url, target_name in (
    (train_datas_url, 'module4_exercise_train.zip'),
    (test_data_url, 'Neighborhood_Market_data.csv'),
):
    download_file(source_url, target_name)

#### CityMart

In [None]:

# Read the CityMart CSV and key the rows by item code
df_citymart = pd.read_csv("CityMart_data.csv").set_index('item_code')

print("CityMart Data:", df_citymart.head(), sep="\n")

#### Greenfield_Grocers

In [None]:
# Greenfield Grocers: pipe-separated file whose header sits on row index 3.
# Two stray columns are removed when present, then the column labels are
# normalised (lower-case, trimmed) before indexing by item code.
df_greenfield = (
    pd.read_csv("Greenfield_Grocers_data.csv", header=3, sep='|')
    .drop(columns=['Unnamed: 12', '1'], errors='ignore')
    .pipe(lambda frame: frame.set_axis(frame.columns.str.lower().str.strip(), axis=1))
    .set_index('item_code')
)

print("\nGreenfield Grocers Data (Corrigé):", df_greenfield.head(), sep="\n")

#### Outlet_data

In [None]:
# SuperSaver Outlet: Excel workbook; normalise the column labels
# (lower-case, trimmed) and index the rows by item code.
df_outlet = (
    pd.read_excel("SuperSaver_Outlet_data.xlsx")
    .pipe(lambda frame: frame.set_axis(frame.columns.str.lower().str.strip(), axis=1))
    .set_index('item_code')
)

print("Outlet Data:", df_outlet.head(), sep="\n")

#### HighStreet_Bazaar

In [None]:
# HighStreet Bazaar: JSON file. `last_modified` is parsed as epoch
# milliseconds and reduced to plain dates, then the column labels are
# normalised (lower-case, trimmed) and the rows indexed by item code.
df_highstreet = (
    pd.read_json('HighStreet_Bazaar_data.json')
    .assign(
        last_modified=lambda frame: pd.to_datetime(frame['last_modified'], unit='ms').dt.date
    )
    .pipe(lambda frame: frame.set_axis(frame.columns.str.lower().str.strip(), axis=1))
    .set_index('item_code')
)

print("HighStreet Bazaar Data (dates converties):", df_highstreet.head(), sep="\n")

In [None]:
# Stack the four store tables row-wise into a single frame
dfs = [df_citymart, df_greenfield, df_highstreet, df_outlet]
data = pd.concat(dfs)

# Quick overview of the combined frame: first rows, dimensions, dtypes/nulls
print("--- Aggregated Data Head ---", data.head(), sep="\n")
print(
    "\n--- Aggregated Data Shape ---",
    f"Shape: {data.shape}",
    f"Number of rows (len): {len(data)}",
    sep="\n",
)
print("\n--- Aggregated Data Info ---")
data.info()

#### Aggregate

#### Simple baseline

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):
    """Cross-validate a simple scikit-learn model and optionally predict on a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Training table holding the feature columns and ``target_col``.
        The caller's frame is not modified.
    fillna_value : scalar, dict or pandas.Series, default -1
        Forwarded to ``DataFrame.fillna`` for both ``data`` and ``X_data_test``.
    drop_cols : list of str, optional
        Columns removed from ``data`` before modelling.
    k_fold : int, default 5
        Number of shuffled ``KFold`` splits (``random_state=42``).
    scaler : {'standard', 'minmax'} or any other value, default 'standard'
        Feature scaler; any other value (e.g. ``None``) disables scaling.
    model : {'linear', 'logistic', 'random_forest'}, default 'linear'
        Estimator to train.
    metric : {'mae', 'accuracy'}, default 'mae'
        Score computed on each validation fold. For 'accuracy' the
        predictions are rounded first.
    target_col : str
        Name of the column to predict (required).
    X_data_test : pandas.DataFrame, optional
        Unlabelled rows to predict. Its columns are re-ordered to match the
        training features; columns not used for training are ignored.

    Returns
    -------
    float
        Mean fold score, when ``X_data_test`` is None.
    tuple(float, numpy.ndarray)
        Mean fold score and the predictions of a model refitted on all of
        ``data``, when ``X_data_test`` is given.

    Raises
    ------
    ValueError
        If ``target_col`` is missing, ``model`` or ``metric`` is unsupported,
        or ``X_data_test`` lacks one of the training feature columns.
    """
    if target_col is None:
        raise ValueError("target_col must be provided")
    if metric not in ('mae', 'accuracy'):
        raise ValueError("Unsupported metric")

    # fillna/drop return new frames, so the caller's objects stay untouched
    train = data.fillna(fillna_value)
    if drop_cols:
        train = train.drop(drop_cols, axis=1)

    # Split data into features (X) and target (y)
    y = train[target_col]
    X = train.drop(target_col, axis=1)
    feature_names = list(X.columns)

    if X_data_test is not None:
        X_data_test = X_data_test.fillna(fillna_value)
        missing = [name for name in feature_names if name not in X_data_test.columns]
        if missing:
            raise ValueError(f"X_data_test is missing feature columns: {missing}")
        # Same columns in the same order as the training features; this also
        # discards drop_cols and any other column the model was not trained on.
        X_data_test = X_data_test[feature_names]

    # Feature scaling
    if scaler == 'standard':
        scaler_obj = StandardScaler()
    elif scaler == 'minmax':
        scaler_obj = MinMaxScaler()
    else:
        scaler_obj = None

    if scaler_obj is not None:
        # NOTE: as in the original implementation, the scaler is fitted on the
        # whole training set before the folds are split.
        X = scaler_obj.fit_transform(X)
        if X_data_test is not None:
            X_data_test = scaler_obj.transform(X_data_test)
    else:
        # The fold loop indexes X positionally, which requires an array:
        # indexing a DataFrame with X[train_index] would look up column labels.
        X = X.to_numpy()
        if X_data_test is not None:
            X_data_test = X_data_test.to_numpy()

    # Initialize the model
    if model == 'linear':
        estimator = LinearRegression()
    elif model == 'logistic':
        estimator = LogisticRegression()
    elif model == 'random_forest':
        estimator = RandomForestClassifier()
    else:
        raise ValueError("Unsupported model type")

    # Initialize cross-validation
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    # Train and evaluate using k-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)

        if metric == 'mae':
            score = mean_absolute_error(y_test, y_pred)
        else:  # 'accuracy' (validated above)
            score = accuracy_score(y_test, np.round(y_pred))

        scores.append(score)

    if X_data_test is not None:
        # Refit on every training row before predicting the unlabelled set
        estimator.fit(X, y)
        return np.mean(scores), estimator.predict(X_data_test)

    # Return the average score
    return np.mean(scores)

In [None]:
# Baseline score using only the file-based store data
get_simple_baseline(
    data,
    target_col='quantity_sold',
    drop_cols=['store_name', 'last_modified'],
    fillna_value=-1,
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
)

### API sources

In [None]:
import requests
import pandas as pd
import numpy as np

def get_api(endpoint_url, timeout=30):
    """Call a JSON endpoint and return the value stored under its ``data`` key.

    Parameters
    ----------
    endpoint_url : str
        URL to request with HTTP GET.
    timeout : float, default 30
        Seconds to wait for the server; without it the request can hang forever.

    Returns
    -------
    object or None
        The ``data`` entry of the JSON body when the status code is 200
        (the body's ``message`` entry is printed first); ``None`` for any
        other status code.
    """
    response = requests.get(endpoint_url, timeout=timeout)

    # Any non-200 status deliberately yields None (kept from the base code)
    if response.status_code != 200:
        return None

    data_json = response.json()
    print(data_json["message"])
    return data_json['data']

# get_api returns None on any non-200 answer; check explicitly so a failed
# request surfaces as a clear error instead of "'NoneType' is not subscriptable".
auth_payload = get_api("https://www.raphaelcousin.com/api/exercise/auth")
if auth_payload is None:
    raise RuntimeError("Authentication request failed: no API key was returned")
cle_api = auth_payload["password"]
print(cle_api)

# The key obtained above is part of the prices endpoint path
donnees_prix = get_api(f"https://www.raphaelcousin.com/api/exercise/{cle_api}/prices")
if donnees_prix is None:
    raise RuntimeError("Prices request failed: no data was returned")
print(len(donnees_prix))
print(type(donnees_prix))

In [None]:
# Build a one-column price table indexed by the keys of the API payload.
# The payload is stored in `donnees_prix` by the previous cell; the former
# name `prices` is never defined and raised NameError on a fresh kernel.
df_prices = pd.DataFrame.from_dict(donnees_prix, orient="index", columns=["price"])
df_prices.head()

#### Aggregate

In [None]:
# Left-join the API prices onto the store rows via the shared index
data = data.merge(df_prices, how='left', left_index=True, right_index=True)
data

In [None]:
# Baseline score after adding the API price feature
get_simple_baseline(
    data,
    target_col='quantity_sold',
    drop_cols=['store_name', 'last_modified'],
    fillna_value=-1,
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
)

### Scraping sources

In [None]:
# Full setup in one cell
!pip install selenium

# Install Chrome
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def create_driver():
    """Return a Chrome WebDriver started with headless/container-friendly flags."""
    launch_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
    )

    options = Options()
    for flag in launch_flags:
        options.add_argument(flag)

    return webdriver.Chrome(options=options)

# Usage example: check that a browser session can be started, then release it.
# The scraping cell below creates its own driver, so leaving this one open
# would leak a Chrome process once the name `driver` is rebound.
driver = create_driver()
driver.quit()

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
import re # Import pour le nettoyage potentiel des données

# --- Driver factory definition (run once) ---
def create_driver():
    """Build a Chrome WebDriver with headless/container-friendly flags.

    The chromedriver binary is expected to be discoverable on PATH.
    """
    browser_options = Options()
    for switch in ("--headless", "--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"):
        browser_options.add_argument(switch)
    return webdriver.Chrome(options=browser_options)

# Start the headless browser
driver = create_driver()

# Page to scrape
url = 'https://www.raphaelcousin.com/module4/scrapable-data'

try:
    driver.get(url)

    # Fixed pause to give the page time to finish loading before reading it
    time.sleep(5)

    # Page source as currently held by the browser
    html = driver.page_source
finally:
    # Always release the browser, even when loading the page fails;
    # otherwise the Chrome process is left running.
    driver.quit()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Accumulator for the scraped rows
exercise_data = []

# All tables present in the page
tables = soup.find_all('table')

# Scrape the second table (Exercise Data)
exercise_columns = ['item_code', 'quantity_limit', 'last_checked']

if len(tables) > 1:
    course_table = tables[1]

    # Some markup has no explicit <tbody>; fall back to the table itself
    # instead of failing on None.
    table_body = course_table.find('tbody')
    if table_body is None:
        table_body = course_table

    for row in table_body.find_all('tr'):
        cols = row.find_all('td')

        # Only rows with at least three data cells are kept
        if len(cols) >= 3:
            exercise_data.append({
                'item_code': cols[0].text.strip(),
                'quantity_limit': cols[1].text.strip(),
                'last_checked': cols[2].text.strip()
            })

    # Passing the columns explicitly keeps the schema stable even when no row
    # was scraped (otherwise the column access below raises KeyError).
    df_exercise = pd.DataFrame(exercise_data, columns=exercise_columns)

    # Non-numeric limits become NaN rather than raising
    df_exercise['quantity_limit'] = pd.to_numeric(df_exercise['quantity_limit'], errors='coerce')

    # Index by item code so the table can be merged on the index
    df_exercise.set_index('item_code', inplace=True)

else:
    print("Erreur : Impossible de trouver la seconde table pour le scraping.")
    df_exercise = pd.DataFrame()  # empty frame when the table is not found

print("DataFrame de l'exercice (df_exercise) prêt :")
print(df_exercise.head())

#### Aggregate

In [None]:
# Left-join the scraped table onto the store rows via the shared index
data = data.merge(df_exercise, how='left', left_index=True, right_index=True)

In [None]:
# Baseline score after adding the scraped columns
get_simple_baseline(
    data,
    target_col='quantity_sold',
    drop_cols=['store_name', 'last_modified'],
    fillna_value=-1,
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
)

In [None]:

# Cell-level mask of missing values
null = data.isna().to_numpy()
if null.any():
    # Number and percentage of rows containing at least one missing value
    # (computed but not displayed)
    count = int(data.isna().any(axis=1).sum())
    proportion = count / len(data) * 100

# Replace missing numeric values with the median of their column
data = data.fillna(value=data.median(numeric_only=True))


### Generating Submission File

In [None]:
# Raw test table with its default integer index
X_test = pd.read_csv('Neighborhood_Market_data.csv')

# Same table keyed by item code, ready to be merged on the index
df_StoreN = X_test.set_index('item_code')
df_StoreN

In [None]:
# Attach the API prices, then the scraped columns, to the test rows (index join)
df_StoreN = (
    df_StoreN
    .merge(df_prices, how='left', left_index=True, right_index=True)
    .merge(df_exercise, how='left', left_index=True, right_index=True)
)

In [None]:
# Remove every price column from both tables before modelling.
# 'price_x' / 'price_y' are presumably suffixed duplicates left behind when a
# merge cell was run more than once. The previous version re-merged df_prices
# into df_StoreN and immediately dropped 'price' again, which had no effect.
price_cols = ['price', 'price_x', 'price_y']
data = data.drop(columns=price_cols, errors='ignore')
df_StoreN = df_StoreN.drop(columns=price_cols, errors='ignore')

# Cross-validated score (m) plus predictions for the test store (x_pred)
m, x_pred = get_simple_baseline(
    data,
    fillna_value=data.mean(numeric_only=True),
    drop_cols=['store_name', 'last_modified'],
    k_fold=3,
    scaler='minmax',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
    X_data_test=df_StoreN
)

# Integer, non-negative version of the predictions
x_pred_int = np.round(x_pred).astype(int)
x_pred_int[x_pred_int < 0] = 0

print(f"Prédictions générées avec succès (MAE moyenne): {m:.2f}")
print("Aperçu des prédictions (x_pred) :", x_pred[:5])

In [None]:
# Two-column submission table: one predicted quantity per test item.
# NOTE(review): the raw predictions `x_pred` are written here, while the
# rounded, non-negative `x_pred_int` computed in the previous cell is unused;
# confirm which one is meant to be submitted.
submission = pd.DataFrame({'item_code': df_StoreN.index, 'quantity_sold': x_pred})

submission.to_csv('submission.csv', sep=',', index=False)
submission.head()