In [234]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import requests
import zipfile
import os
import io
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

with zipfile.ZipFile('module4_exercise_train.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

print("Fichiers extraits du zip : CityMart_data.csv, Greenfield_Grocers_data.csv, SuperSaver_Outlet_data.xlsx, HighStreet_Bazaar_data.json")

Fichiers extraits du zip : CityMart_data.csv, Greenfield_Grocers_data.csv, SuperSaver_Outlet_data.xlsx, HighStreet_Bazaar_data.json


## Data Collection

### Files sources

In [235]:
# URLs of the files
train_datas_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_datas_url, 'module4_exercise_train.zip')
download_file(test_data_url, 'Neighborhood_Market_data.csv')

Downloaded module4_exercise_train.zip from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip
Downloaded Neighborhood_Market_data.csv from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv


#### CityMart

In [236]:

df_citymart = pd.read_csv("CityMart_data.csv")
df_citymart.set_index('item_code', inplace=True)
print("CityMart Data:")
print(df_citymart.head())

CityMart Data:
          store_name  mass  dimension_length  dimension_width  \
item_code                                                       
P0019       CityMart  2.81             26.83            38.75   
P0024       CityMart  3.30             59.23            34.99   
P0025       CityMart  2.34             22.60            16.90   
P0034       CityMart  6.54             18.59            68.72   
P0039       CityMart  9.94             57.89            88.33   

           dimension_height  days_since_last_purchase  package_volume  \
item_code                                                               
P0019                 24.89                       323    25877.199625   
P0024                 21.78                       321    45138.128706   
P0025                 60.12                       291    22962.232800   
P0034                 21.99                       126    28092.330552   
P0039                 35.45                       312   181270.870165   

           stock_

#### Greenfield_Grocers

In [237]:
#### Greenfield_Grocers
df_greenfield = pd.read_csv("Greenfield_Grocers_data.csv", sep='|', header=3)
df_greenfield = df_greenfield.drop(columns=['Unnamed: 12', '1'], errors='ignore')
df_greenfield.columns = df_greenfield.columns.str.lower().str.strip()
df_greenfield.set_index('item_code', inplace=True)

print("\nGreenfield Grocers Data (Corrigé):")
print(df_greenfield.head())


Greenfield Grocers Data (Corrigé):
                   store_name  mass  dimension_length  dimension_width  \
item_code                                                                
P0006      Greenfield_Grocers  5.02             86.68            71.64   
P0014      Greenfield_Grocers  9.91             21.67            54.91   
P0016      Greenfield_Grocers  1.13             60.03            97.39   
P0021      Greenfield_Grocers  0.95             40.36            67.91   
P0028      Greenfield_Grocers  5.24             22.37            61.78   

           dimension_height  days_since_last_purchase  package_volume  \
item_code                                                               
P0006                 16.42                        66   101964.180384   
P0014                 84.80                        99   100903.494560   
P0016                 90.00                       348   526168.953000   
P0021                 99.76                       301   273426.956576   
P0028  

#### Outlet_data

In [238]:
#### Outlet_data
df_outlet = pd.read_excel("SuperSaver_Outlet_data.xlsx")
df_outlet.columns = df_outlet.columns.str.lower().str.strip()
df_outlet.set_index('item_code', inplace=True)
print("Outlet Data:")
print(df_outlet.head())

Outlet Data:
           quantity_sold
item_code               
P0003                198
P0007                211
P0008                200
P0009                209
P0012                186


#### HighStreet_Bazaar

In [239]:
# read 'HighStreet_Bazaar_data.json'
df_highstreet = pd.read_json('HighStreet_Bazaar_data.json')
df_highstreet['last_modified'] = pd.to_datetime(
    df_highstreet['last_modified'],
    unit='ms'
).dt.date
df_highstreet.columns = df_highstreet.columns.str.lower().str.strip()
df_highstreet.set_index('item_code', inplace=True)

print("HighStreet Bazaar Data (dates converties):")
print(df_highstreet.head())

HighStreet Bazaar Data (dates converties):
                  store_name  mass  dimension_length  dimension_width  \
item_code                                                               
P0001      HighStreet_Bazaar  6.11             75.46            91.62   
P0011      HighStreet_Bazaar  4.34             16.97            93.21   
P0015      HighStreet_Bazaar  1.37             58.93            11.28   
P0017      HighStreet_Bazaar  7.27             51.51            39.21   
P0020      HighStreet_Bazaar  0.89             57.50            69.84   

           dimension_height  days_since_last_purchase  package_volume  \
item_code                                                               
P0001                 92.08                      78.0   636608.450016   
P0011                 54.58                     344.0    86333.208546   
P0015                 73.87                     344.0    49103.634648   
P0017                 16.17                     138.0    32658.663807   
P0020  

In [240]:
# cct
dfs = [df_citymart, df_greenfield, df_highstreet, df_outlet]
data = pd.concat(dfs, axis=0)
print("--- Aggregated Data Head ---")
print(data.head())
print("\n--- Aggregated Data Shape ---")
print(f"Shape: {data.shape}")
print(f"Number of rows (len): {len(data)}")
print("\n--- Aggregated Data Info ---")
data.info()

--- Aggregated Data Head ---
          store_name  mass  dimension_length  dimension_width  \
item_code                                                       
P0019       CityMart  2.81             26.83            38.75   
P0024       CityMart  3.30             59.23            34.99   
P0025       CityMart  2.34             22.60            16.90   
P0034       CityMart  6.54             18.59            68.72   
P0039       CityMart  9.94             57.89            88.33   

           dimension_height  days_since_last_purchase  package_volume  \
item_code                                                               
P0019                 24.89                     323.0    25877.199625   
P0024                 21.78                     321.0    45138.128706   
P0025                 60.12                     291.0    22962.232800   
P0034                 21.99                     126.0    28092.330552   
P0039                 35.45                     312.0   181270.870165   

   

#### Aggregate

#### Simple baseline

In [285]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):

    data = data.copy()
    # Handle missing values
    data.fillna(fillna_value, inplace=True)
    if X_data_test is not None:
        X_data_test = X_data_test.copy()
        X_data_test.fillna(fillna_value, inplace=True)

    # Drop unwanted columns
    if drop_cols:
        data.drop(drop_cols, axis=1, inplace=True)
        if X_data_test is not None:
            X_data_test.drop(drop_cols, axis=1, inplace=True)

    # Split data into features (X) and target (y)
    y = data[target_col]
    X = data.drop(target_col, axis=1)

    # Feature scaling
    if scaler == 'standard':
        scaler = StandardScaler()
    elif scaler == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = None

    if scaler:
        X = scaler.fit_transform(X)
        if X_data_test is not None:
            X_data_test = scaler.transform(X_data_test)

    # Initialize the model
    if model == 'linear':
        model = LinearRegression()
    elif model == 'logistic':
        model = LogisticRegression()
    elif model == 'random_forest':
        model = RandomForestClassifier()
    else:
        raise ValueError("Unsupported model type")

    # Initialize cross-validation
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    # Train and evaluate using k-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Evaluate using the specified metric
        if metric == 'mae':
            score = mean_absolute_error(y_test, y_pred)
        elif metric == 'accuracy':
            score = accuracy_score(y_test, np.round(y_pred))
        else:
            raise ValueError("Unsupported metric")

        scores.append(score)

    if X_data_test is not None:
        model.fit(X, y)
        return np.mean(scores), model.predict(X_data_test)

    # Return the average score
    return np.mean(scores)

In [286]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

np.float64(41.99040780244333)

### API sources

In [243]:
import requests
import pandas as pd
import numpy as np

def get_api(endpoint_url):
    response = requests.get(endpoint_url)

    if response.status_code == 200:
        data_json = response.json()
        print(data_json["message"])
        return data_json['data']

    # Ne rien retourner si le statut n'est pas 200 (conformément au code de base)
    return None

cle_api = get_api("https://www.raphaelcousin.com/api/exercise/auth")["password"]
print(cle_api)

donnees_prix = get_api(f"https://www.raphaelcousin.com/api/exercise/{cle_api}/prices")
print(len(donnees_prix))
print(type(donnees_prix))

Authentication successful
RcUZjhdsYLRzwi4
Volume data retrieved successfully
2000
<class 'dict'>


In [244]:
df_prices = pd.DataFrame.from_dict(prices, orient="index", columns=["price"])
df_prices.head()

Unnamed: 0,price
P0001,22.14
P0002,26.91
P0003,16.9
P0004,7.04
P0005,20.84


#### Aggregate

In [245]:
data = pd.merge(data, df_prices, left_index=True, right_index=True, how='left')
data

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified,price
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
P0019,CityMart,2.81,26.83,38.75,24.89,323.0,25877.199625,253.0,202,2023-01-19,17.86
P0024,CityMart,3.30,59.23,34.99,21.78,321.0,45138.128706,21.0,225,2023-01-24,14.04
P0025,CityMart,2.34,22.60,16.90,60.12,291.0,22962.232800,316.0,278,2023-01-25,3.38
P0034,CityMart,6.54,18.59,68.72,21.99,126.0,28092.330552,612.0,233,2023-02-03,15.11
P0039,CityMart,9.94,57.89,88.33,35.45,312.0,181270.870165,968.0,203,2023-02-08,14.20
...,...,...,...,...,...,...,...,...,...,...,...
P1976,,,,,,,,,236,,18.69
P1980,,,,,,,,,192,,17.20
P1986,,,,,,,,,193,,20.91
P1993,,,,,,,,,185,,17.35


In [246]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

np.float64(44.454488170296564)

### Scrapping sources

In [247]:
# Full setup in one cell
!pip install selenium

# Install Chrome
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://cli.github.com/packages stable InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading packag

In [248]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def create_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")

    return webdriver.Chrome(options=chrome_options)

# Usage example
driver = create_driver()

In [258]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
import re # Import pour le nettoyage potentiel des données

# --- Définition de la fonction de création de Driver (à exécuter une fois) ---
def create_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    # Pour un environnement de notebook, on suppose que le driver est déjà accessible via PATH
    # Si non, vous devriez spécifier l'emplacement: webdriver.Chrome(executable_path='/usr/bin/chromedriver', options=chrome_options)
    return webdriver.Chrome(options=chrome_options)

# Initialisation du driver
driver = create_driver()

# Open the URL
url = 'https://www.raphaelcousin.com/module4/scrapable-data'
driver.get(url)

# Wait for the page to fully load
time.sleep(5)

# Get the fully rendered page source
html = driver.page_source

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Initialize lists to store scraped data
exercise_data = []

# Find both tables
tables = soup.find_all('table')

# Close the Selenium WebDriver
driver.quit()

# Scrape the second table (Exercise Data)
if len(tables) > 1:
    course_table = tables[1]

    for row in course_table.find('tbody').find_all('tr'):
        cols = row.find_all('td')

        # Vérification qu'il y a au moins 3 colonnes de données
        if len(cols) >= 3:
            # --- LOGIQUE D'EXTRACTION COMPLÉTÉE ---
            exercise_data.append({
                'item_code': cols[0].text.strip(),
                'quantity_limit': cols[1].text.strip(),
                'last_checked': cols[2].text.strip()
            })

    # Convert the lists to pandas DataFrames
    df_exercise = pd.DataFrame(exercise_data)

    df_exercise['quantity_limit'] = pd.to_numeric(df_exercise['quantity_limit'], errors='coerce')

    # Mettre 'item_code' en index pour la fusion
    df_exercise.set_index('item_code', inplace=True)

else:
    print("Erreur : Impossible de trouver la seconde table pour le scraping.")
    df_exercise = pd.DataFrame() # Retourne un DataFrame vide en cas d'échec

print("DataFrame de l'exercice (df_exercise) prêt :")
print(df_exercise.head())

DataFrame de l'exercice (df_exercise) prêt :
           quantity_limit last_checked
item_code                             
P0001                   2          972
P0002                   3          260
P0003                   2          285
P0004                   5          512
P0005                   3           85


#### Aggregate

In [259]:
data = pd.merge(data, df_exercise, left_index=True, right_index=True, how='left')

In [260]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

np.float64(41.99040780244333)

In [261]:

null = data.isnull().values
if null.any() >0 :
    count = len(data[data.isnull().any(axis=1)])
    proportion = count / len(data) * 100

data=data.fillna(data.median(numeric_only=True))


### Generating Submission File

In [262]:
X_test =  pd.read_csv('Neighborhood_Market_data.csv')

# read
df_StoreN =  pd.read_csv("Neighborhood_Market_data.csv", sep=",", index_col='item_code')
df_StoreN

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02
P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04
P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05
P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,2023-01-10
P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13
...,...,...,...,...,...,...,...,...,...
P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,2028-05-25
P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,2028-05-30
P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,2028-06-13
P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,2028-06-19


In [263]:
df_StoreN = pd.merge(df_StoreN, df_prices, left_index=True, right_index=True, how='left')
df_StoreN = pd.merge(df_StoreN, df_exercise, left_index=True, right_index=True, how='left')

In [291]:
data.drop(columns=['price', 'price_x', 'price_y'], errors='ignore', inplace=True)
df_StoreN.drop(columns=['price', 'price_x', 'price_y'], errors='ignore', inplace=True)
df_StoreN = pd.merge(df_StoreN, df_prices, left_index=True, right_index=True, how='left')

df_StoreN.drop(columns=['price'], errors='ignore', inplace=True)
m, x_pred = get_simple_baseline(
    data,
    fillna_value=data.mean(numeric_only=True),
    drop_cols=['store_name', 'last_modified'],
    k_fold=3,
    scaler='minmax',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
    X_data_test=df_StoreN
)
x_pred_int = np.round(x_pred).astype(int)
x_pred_int[x_pred_int < 0] = 0

print(f"Prédictions générées avec succès (MAE moyenne): {m:.2f}")
print("Aperçu des prédictions (x_pred) :", x_pred[:5])

Prédictions générées avec succès (MAE moyenne): 41.92
Aperçu des prédictions (x_pred) : [185.41216628 220.86223297 168.81180472 211.02940148 225.40193643]


In [292]:
submission = pd.DataFrame({
    'item_code': df_StoreN.index,
    'quantity_sold': x_pred # your_prediction
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,item_code,quantity_sold
0,P0002,185.412166
1,P0004,220.862233
2,P0005,168.811805
3,P0010,211.029401
4,P0013,225.401936
