In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import requests

## Data Collection

### Files sources

In [2]:
# URLs of the files
train_datas_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_datas_url, 'module4_exercise_train.zip')
download_file(test_data_url, 'Neighborhood_Market_data.csv')

Downloaded module4_exercise_train.zip from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip
Downloaded Neighborhood_Market_data.csv from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv


In [3]:
import zipfile
import os
# Unzipping the training data
with zipfile.ZipFile('module4_exercise_train.zip', 'r') as zip_ref:
    zip_ref.extractall('module4_exercise_train')
print('Unzipped training data to module4_exercise_train/')

Unzipped training data to module4_exercise_train/


#### CityMart

In [4]:
# read "CityMart_data.csv"
citymart_data = pd.read_csv("module4_exercise_train/CityMart_data.csv", index_col=0, parse_dates=["last_modified"], dtype={"store_name": "category"})
#pd.read_csv('Neighborhood_Market_data.csv',index_col="item_code")

display(citymart_data)

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P0019,CityMart,2.81,26.83,38.75,24.89,323,25877.199625,253,202,2023-01-19
P0024,CityMart,3.30,59.23,34.99,21.78,321,45138.128706,21,225,2023-01-24
P0025,CityMart,2.34,22.60,16.90,60.12,291,22962.232800,316,278,2023-01-25
P0034,CityMart,6.54,18.59,68.72,21.99,126,28092.330552,612,233,2023-02-03
P0039,CityMart,9.94,57.89,88.33,35.45,312,181270.870165,968,203,2023-02-08
...,...,...,...,...,...,...,...,...,...,...
P1984,CityMart,8.24,99.78,68.06,11.87,117,80609.488116,333,263,2028-06-06
P1989,CityMart,5.79,71.19,92.39,63.98,250,420812.077518,417,238,2028-06-11
P1992,CityMart,8.64,97.57,8.58,83.35,348,69776.502510,489,219,2028-06-14
P1996,CityMart,9.90,66.40,75.90,9.88,101,49792.828800,501,232,2028-06-18


In [5]:
citymart_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 415 entries, P0019 to P1998
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   store_name                415 non-null    category      
 1   mass                      415 non-null    float64       
 2   dimension_length          415 non-null    float64       
 3   dimension_width           415 non-null    float64       
 4   dimension_height          415 non-null    float64       
 5   days_since_last_purchase  415 non-null    int64         
 6   package_volume            412 non-null    float64       
 7   stock_age                 415 non-null    int64         
 8   quantity_sold             415 non-null    int64         
 9   last_modified             415 non-null    datetime64[ns]
dtypes: category(1), datetime64[ns](1), float64(5), int64(3)
memory usage: 32.8+ KB


In [6]:
citymart_data.describe()

Unnamed: 0,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
count,415.0,415.0,415.0,415.0,415.0,412.0,415.0,415.0,415
mean,5.157928,53.145349,52.52906,51.932145,174.390361,149468.294206,512.893976,217.66506,2025-09-08 21:09:58.554216704
min,0.1,5.4,5.37,5.04,0.0,1356.796496,1.0,123.0,2023-01-19 00:00:00
25%,2.76,30.635,28.635,30.29,78.5,36069.65112,279.0,195.0,2024-05-06 12:00:00
50%,5.38,53.87,51.73,51.27,177.0,100763.124275,524.0,217.0,2025-08-24 00:00:00
75%,7.6,76.405,75.905,72.75,261.0,207696.302871,724.0,240.5,2027-01-03 00:00:00
max,9.99,99.78,99.11,99.85,364.0,754043.219631,996.0,325.0,2028-06-20 00:00:00
std,2.848829,27.195911,26.78394,26.807918,106.790878,147972.258996,275.327654,33.531888,


In [7]:
citymart_data.dtypes


store_name                        category
mass                               float64
dimension_length                   float64
dimension_width                    float64
dimension_height                   float64
days_since_last_purchase             int64
package_volume                     float64
stock_age                            int64
quantity_sold                        int64
last_modified               datetime64[ns]
dtype: object

#### Greenfield_Grocers

In [8]:
greenfield_data = pd.read_csv('module4_exercise_train/Greenfield_Grocers_data.csv',sep="|")


In [9]:
greenfield_data = pd.read_csv('module4_exercise_train/Greenfield_Grocers_data.csv',sep="|")
greenfield_data.columns = greenfield_data.iloc[2]  
greenfield_data.columns=list(greenfield_data.columns.str.lower())
greenfield_data=greenfield_data.drop([0,1,2])
greenfield_data=greenfield_data.set_index("item_code")
greenfield_data=greenfield_data[[              'store_name',                     'mass',
               'dimension_length',          'dimension_width',
               'dimension_height', 'days_since_last_purchase',
                 'package_volume',                'stock_age',
                  'quantity_sold',            'last_modified']]
display(greenfield_data)

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P0006,Greenfield_Grocers,5.02,86.68,71.64,16.42,66,101964.18038400002,450,130,2023-01-06
P0014,Greenfield_Grocers,9.91,21.67,54.91,84.8,99,100903.49455999999,827,118,2023-01-14
P0016,Greenfield_Grocers,1.13,60.03,97.39,90.0,348,526168.953,981,136,2023-01-16
P0021,Greenfield_Grocers,0.95,40.36,67.91,99.76,301,273426.95657599997,289,155,2023-01-21
P0028,Greenfield_Grocers,5.24,22.37,61.78,68.67,15,94903.217262,423,177,2023-01-28
...,...,...,...,...,...,...,...,...,...,...
P1983,Greenfield_Grocers,3.13,37.39,74.87,84.11,168,235456.63402300002,31,154,2028-06-05
P1985,Greenfield_Grocers,0.66,30.04,15.77,70.62,5,33454.869096,76,73,2028-06-07
P1988,Greenfield_Grocers,7.28,7.34,89.94,37.33,92,24643.757867999997,770,162,2028-06-10
P1990,Greenfield_Grocers,6.71,73.91,10.24,89.64,9,67842.994176,968,115,2028-06-12


In [10]:
greenfield_data.info()
greenfield_data.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 401 entries, P0006 to P1995
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   store_name                401 non-null    object
 1   mass                      401 non-null    object
 2   dimension_length          398 non-null    object
 3   dimension_width           401 non-null    object
 4   dimension_height          401 non-null    object
 5   days_since_last_purchase  401 non-null    object
 6   package_volume            401 non-null    object
 7   stock_age                 401 non-null    object
 8   quantity_sold             401 non-null    object
 9   last_modified             401 non-null    object
dtypes: object(10)
memory usage: 34.5+ KB


Unnamed: 0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
count,401,401.0,398.0,401.0,401.0,401,401.0,401,401,401
unique,1,331.0,386.0,391.0,393.0,249,401.0,325,117,401
top,Greenfield_Grocers,3.75,56.33,44.04,55.33,94,101964.18038400002,870,127,2023-01-06
freq,401,4.0,3.0,2.0,2.0,4,1.0,3,14,1


In [11]:

greenfield_data.dtypes
print(greenfield_data.columns)
# conversion auto de toutes les colonnes numériques possibles
for col in [ 'mass', 'dimension_length', 'dimension_width',
       'dimension_height', 'days_since_last_purchase', 'package_volume',
       'stock_age', 'quantity_sold']:
    greenfield_data[col] = pd.to_numeric(greenfield_data[col], errors="ignore")
greenfield_data.dtypes


Index(['store_name', 'mass', 'dimension_length', 'dimension_width',
       'dimension_height', 'days_since_last_purchase', 'package_volume',
       'stock_age', 'quantity_sold', 'last_modified'],
      dtype='object')


  greenfield_data[col] = pd.to_numeric(greenfield_data[col], errors="ignore")


store_name                   object
mass                        float64
dimension_length            float64
dimension_width             float64
dimension_height            float64
days_since_last_purchase      int64
package_volume              float64
stock_age                     int64
quantity_sold                 int64
last_modified                object
dtype: object

#### Outlet_data

In [12]:
# read "SuperSaver_Outlet_data.xlsx"
supersaver_data = pd.read_excel('module4_exercise_train/SuperSaver_Outlet_data.xlsx',sheet_name=None)

supersaver_data_quantity = supersaver_data["Quantity"]
supersaver_data_info = supersaver_data["Info"]
display(supersaver_data_info)
display(supersaver_data_quantity)


Unnamed: 0.1,Unnamed: 0,item code,store name,mass,dimension length,dimension width,dimension height,days_since last_purchase,package volume,stock age
0,P0003,SuperSaver_Outlet,2.40,50.90,8.80,68.82,337,30825.854400,277,
1,P0007,SuperSaver_Outlet,9.88,43.65,48.64,55.78,16,118428.526080,291,
2,P0008,SuperSaver_Outlet,1.45,36.18,10.23,69.14,315,25590.193596,169,
3,P0009,SuperSaver_Outlet,6.98,48.13,45.55,44.66,295,97909.078190,443,
4,P0012,SuperSaver_Outlet,7.20,23.66,65.90,12.05,40,18788.287700,45,
...,...,...,...,...,...,...,...,...,...,...
374,P1976,SuperSaver_Outlet,5.10,62.99,49.16,65.56,293,203012.335504,464,
375,P1980,SuperSaver_Outlet,4.93,74.91,53.25,52.95,285,211215.299625,163,
376,P1986,SuperSaver_Outlet,4.24,31.52,73.46,55.73,330,129040.541216,755,
377,P1993,SuperSaver_Outlet,3.21,97.68,39.88,43.20,293,168284.666880,889,


Unnamed: 0,item_code,quantity_sold
0,P0003,198
1,P0007,211
2,P0008,200
3,P0009,209
4,P0012,186
...,...,...
374,P1976,236
375,P1980,192
376,P1986,193
377,P1993,185


In [13]:
supersaver_data_info.columns=['item code', 'store name', 'mass', 'dimension length', 'dimension width', 'dimension height', 'days_since last_purchase', 'package volume', 'stock age','suppp']
supersaver_data_info=supersaver_data_info[['item code', 'store name', 'mass', 'dimension length', 'dimension width', 'dimension height', 'days_since last_purchase', 'package volume', 'stock age']]

# Normaliser les noms de colonnes (remplacer espaces par underscore)
supersaver_data_info.columns = [c.strip().replace(" ", "_") for c in supersaver_data_info.columns]
supersaver_data_quantity.columns = [c.strip().replace(" ", "_") for c in supersaver_data_quantity.columns]

display(supersaver_data_info)


Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age
0,P0003,SuperSaver_Outlet,2.40,50.90,8.80,68.82,337,30825.854400,277
1,P0007,SuperSaver_Outlet,9.88,43.65,48.64,55.78,16,118428.526080,291
2,P0008,SuperSaver_Outlet,1.45,36.18,10.23,69.14,315,25590.193596,169
3,P0009,SuperSaver_Outlet,6.98,48.13,45.55,44.66,295,97909.078190,443
4,P0012,SuperSaver_Outlet,7.20,23.66,65.90,12.05,40,18788.287700,45
...,...,...,...,...,...,...,...,...,...
374,P1976,SuperSaver_Outlet,5.10,62.99,49.16,65.56,293,203012.335504,464
375,P1980,SuperSaver_Outlet,4.93,74.91,53.25,52.95,285,211215.299625,163
376,P1986,SuperSaver_Outlet,4.24,31.52,73.46,55.73,330,129040.541216,755
377,P1993,SuperSaver_Outlet,3.21,97.68,39.88,43.20,293,168284.666880,889


In [14]:
display(supersaver_data_quantity)

Unnamed: 0,item_code,quantity_sold
0,P0003,198
1,P0007,211
2,P0008,200
3,P0009,209
4,P0012,186
...,...,...
374,P1976,236
375,P1980,192
376,P1986,193
377,P1993,185


In [15]:
supersaver_data_final = supersaver_data_info.merge(
    supersaver_data_quantity,
    on="item_code",   # clé commune
    how="left"        # on garde tous les items du DataFrame "info"
)
supersaver_data_final = supersaver_data_final.set_index("item_code")

display(supersaver_data_final)
print(supersaver_data_final.shape)


Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0003,SuperSaver_Outlet,2.40,50.90,8.80,68.82,337,30825.854400,277,198
P0007,SuperSaver_Outlet,9.88,43.65,48.64,55.78,16,118428.526080,291,211
P0008,SuperSaver_Outlet,1.45,36.18,10.23,69.14,315,25590.193596,169,200
P0009,SuperSaver_Outlet,6.98,48.13,45.55,44.66,295,97909.078190,443,209
P0012,SuperSaver_Outlet,7.20,23.66,65.90,12.05,40,18788.287700,45,186
...,...,...,...,...,...,...,...,...,...
P1976,SuperSaver_Outlet,5.10,62.99,49.16,65.56,293,203012.335504,464,236
P1980,SuperSaver_Outlet,4.93,74.91,53.25,52.95,285,211215.299625,163,192
P1986,SuperSaver_Outlet,4.24,31.52,73.46,55.73,330,129040.541216,755,193
P1993,SuperSaver_Outlet,3.21,97.68,39.88,43.20,293,168284.666880,889,185


(379, 9)


In [16]:

supersaver_data_final.dtypes
# conversion auto de toutes les colonnes numériques possibles
for col in supersaver_data_final.columns:
    supersaver_data_final[col] = pd.to_numeric(supersaver_data_final[col], errors="ignore")
supersaver_data_final.dtypes


  supersaver_data_final[col] = pd.to_numeric(supersaver_data_final[col], errors="ignore")


store_name                   object
mass                        float64
dimension_length            float64
dimension_width             float64
dimension_height            float64
days_since_last_purchase      int64
package_volume              float64
stock_age                     int64
quantity_sold                 int64
dtype: object

#### HighStreet_Bazaar

In [17]:
# read 'HighStreet_Bazaar_data.json'
highstreet_data = pd.read_json('module4_exercise_train/HighStreet_Bazaar_data.json')
highstreet_data=highstreet_data.set_index("item_code")
display(highstreet_data)

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P0001,HighStreet_Bazaar,6.11,75.46,91.62,92.08,78.0,636608.450016,237,346,1672531200000
P0011,HighStreet_Bazaar,4.34,16.97,93.21,54.58,344.0,86333.208546,184,218,1673395200000
P0015,HighStreet_Bazaar,1.37,58.93,11.28,73.87,344.0,49103.634648,946,315,1673740800000
P0017,HighStreet_Bazaar,7.27,51.51,39.21,16.17,138.0,32658.663807,268,228,1673913600000
P0020,HighStreet_Bazaar,0.89,57.50,69.84,6.12,333.0,24576.696000,396,228,1674172800000
...,...,...,...,...,...,...,...,...,...,...
P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637,235,1843344000000
P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60,241,1843603200000
P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.104900,925,237,1843689600000
P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317,256,1844121600000


In [18]:
highstreet_data.dtypes

store_name                   object
mass                        float64
dimension_length            float64
dimension_width             float64
dimension_height            float64
days_since_last_purchase    float64
package_volume              float64
stock_age                     int64
quantity_sold                 int64
last_modified                 int64
dtype: object

#### Aggregate

In [19]:

#display(citymart_data)
#print(citymart_data.columns)
print(citymart_data.dtypes)

#display(greenfield_data)
#print(greenfield_data.columns)
print(greenfield_data.dtypes)

#display(supersaver_data_final)
#print(supersaver_data_final.columns)
print(supersaver_data_final.dtypes)

#display(highstreet_data)
#print(highstreet_data.columns)
print(highstreet_data.dtypes)

store_name                        category
mass                               float64
dimension_length                   float64
dimension_width                    float64
dimension_height                   float64
days_since_last_purchase             int64
package_volume                     float64
stock_age                            int64
quantity_sold                        int64
last_modified               datetime64[ns]
dtype: object
store_name                   object
mass                        float64
dimension_length            float64
dimension_width             float64
dimension_height            float64
days_since_last_purchase      int64
package_volume              float64
stock_age                     int64
quantity_sold                 int64
last_modified                object
dtype: object
store_name                   object
mass                        float64
dimension_length            float64
dimension_width             float64
dimension_height            float64
da

In [20]:
#display(citymart_data)
#print(citymart_data.columns)
print(citymart_data.dtypes)

#display(greenfield_data)
#print(greenfield_data.columns)
print(greenfield_data.dtypes)

#display(supersaver_data_final)
#print(supersaver_data_final.columns)
print(supersaver_data_final.dtypes)

#display(highstreet_data)
#print(highstreet_data.columns)
print(highstreet_data.dtypes)

store_name                        category
mass                               float64
dimension_length                   float64
dimension_width                    float64
dimension_height                   float64
days_since_last_purchase             int64
package_volume                     float64
stock_age                            int64
quantity_sold                        int64
last_modified               datetime64[ns]
dtype: object
store_name                   object
mass                        float64
dimension_length            float64
dimension_width             float64
dimension_height            float64
days_since_last_purchase      int64
package_volume              float64
stock_age                     int64
quantity_sold                 int64
last_modified                object
dtype: object
store_name                   object
mass                        float64
dimension_length            float64
dimension_width             float64
dimension_height            float64
da

In [21]:
data = pd.concat([citymart_data, greenfield_data, supersaver_data_final, highstreet_data], axis=0)
# Supposons que ton DataFrame s'appelle df
display(data)


Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P0019,CityMart,2.81,26.83,38.75,24.89,323.0,25877.199625,253,202,2023-01-19 00:00:00
P0024,CityMart,3.30,59.23,34.99,21.78,321.0,45138.128706,21,225,2023-01-24 00:00:00
P0025,CityMart,2.34,22.60,16.90,60.12,291.0,22962.232800,316,278,2023-01-25 00:00:00
P0034,CityMart,6.54,18.59,68.72,21.99,126.0,28092.330552,612,233,2023-02-03 00:00:00
P0039,CityMart,9.94,57.89,88.33,35.45,312.0,181270.870165,968,203,2023-02-08 00:00:00
...,...,...,...,...,...,...,...,...,...,...
P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637,235,1843344000000
P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60,241,1843603200000
P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.104900,925,237,1843689600000
P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317,256,1844121600000


In [22]:
data.columns

Index(['store_name', 'mass', 'dimension_length', 'dimension_width',
       'dimension_height', 'days_since_last_purchase', 'package_volume',
       'stock_age', 'quantity_sold', 'last_modified'],
      dtype='object')

In [23]:
print("citymart_data shape:", citymart_data.shape)
print("greenfield_data shape:", greenfield_data.shape)
print("supersaver_data_final shape:", supersaver_data_final.shape)
print("highstreet_data shape:", highstreet_data.shape)

print("\nComposition de data par store_name:")
print(data['store_name'].value_counts())

print("\nNb de lignes sans cible (quantity_sold) dans data:")
print(data['quantity_sold'].isna().sum())


citymart_data shape: (415, 10)
greenfield_data shape: (401, 10)
supersaver_data_final shape: (379, 9)
highstreet_data shape: (396, 10)

Composition de data par store_name:
store_name
CityMart              415
Greenfield_Grocers    401
HighStreet_Bazaar     396
SuperSaver_Outlet     379
Name: count, dtype: int64

Nb de lignes sans cible (quantity_sold) dans data:
0


In [24]:
data.dtypes

store_name                   object
mass                        float64
dimension_length            float64
dimension_width             float64
dimension_height            float64
days_since_last_purchase    float64
package_volume              float64
stock_age                     int64
quantity_sold                 int64
last_modified                object
dtype: object

#### Simple baseline

In [25]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):
    
    data = data.copy()
    # Handle missing values
    data.fillna(fillna_value, inplace=True)
    if X_data_test is not None:
        X_data_test = X_data_test.copy()
        X_data_test.fillna(fillna_value, inplace=True)
    
    # Drop unwanted columns
    if drop_cols:
        data.drop(drop_cols, axis=1, inplace=True)
        if X_data_test is not None:
            X_data_test.drop(drop_cols, axis=1, inplace=True)

    # Split data into features (X) and target (y)
    y = data[target_col]
    X = data.drop(target_col, axis=1)

    # Feature scaling
    if scaler == 'standard':
        scaler = StandardScaler()
    elif scaler == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = None
    
    if scaler:
        X = scaler.fit_transform(X)
        if X_data_test is not None:
            X_data_test = scaler.transform(X_data_test)

    # Initialize the model
    if model == 'linear':
        model = LinearRegression()
    elif model == 'logistic':
        model = LogisticRegression()
    elif model == 'random_forest':
        model = RandomForestClassifier()
    else:
        raise ValueError("Unsupported model type")

    # Initialize cross-validation
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    # Train and evaluate using k-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate using the specified metric
        if metric == 'mae':
            score = mean_absolute_error(y_test, y_pred)
        elif metric == 'accuracy':
            score = accuracy_score(y_test, np.round(y_pred))
        else:
            raise ValueError("Unsupported metric")

        scores.append(score)

    if X_data_test is not None:
        model.fit(X, y)
        return np.mean(scores), model.predict(X_data_test)
    
    # Return the average score
    return np.mean(scores)

In [26]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

np.float64(45.04056766700087)

### API sources

In [27]:
def get_api(endpoint_url):
    try:
        # Make the GET request to the mock API
        response = requests.get(endpoint_url)

        if response.status_code == 200:
            data = response.json()
            print(data["message"])
            return data['data']
        else:
            print(f"Failed to retrieve volume data. Status code: {response.status_code}")
    
    except Exception as e:
        print(f"An error occurred: {e}")
password = get_api("https://www.raphaelcousin.com/api/exercise/auth")
print(password)
link=str("https://www.raphaelcousin.com/api/exercise/"+password["password"]+"/prices")

prices = get_api(link)

Authentication successful
{'description': 'This data is for the authenticated course access.', 'password': 'RcUZjhdsYLRzwi4'}
Volume data retrieved successfully


In [28]:
print(prices)

{'P0001': 22.14, 'P0002': 26.91, 'P0003': 16.9, 'P0004': 7.04, 'P0005': 20.84, 'P0006': 15.53, 'P0007': 14.46, 'P0008': 11.72, 'P0009': 14.7, 'P0010': 10.0, 'P0011': 16.85, 'P0012': 27.74, 'P0013': 12.98, 'P0014': 28.94, 'P0015': 16.73, 'P0016': 15.1, 'P0017': 17.03, 'P0018': 13.88, 'P0019': 17.86, 'P0020': 18.05, 'P0021': 3.35, 'P0022': 32.19, 'P0023': 6.95, 'P0024': 14.04, 'P0025': 3.38, 'P0026': 15.47, 'P0027': 13.09, 'P0028': 15.06, 'P0029': 16.91, 'P0030': 17.79, 'P0031': 18.9, 'P0032': 11.54, 'P0033': 19.79, 'P0034': 15.11, 'P0035': 15.84, 'P0036': 18.01, 'P0037': 14.18, 'P0038': 17.84, 'P0039': 14.2, 'P0040': 23.8, 'P0041': 21.45, 'P0042': 19.16, 'P0043': 16.14, 'P0044': 14.83, 'P0045': 23.21, 'P0046': 23.08, 'P0047': 21.37, 'P0048': 16.76, 'P0049': 18.46, 'P0050': 23.64, 'P0051': 10.9, 'P0052': 12.33, 'P0053': 20.17, 'P0054': 10.6, 'P0055': 9.26, 'P0056': 24.86, 'P0057': 26.38, 'P0058': 10.31, 'P0059': 9.87, 'P0060': 12.54, 'P0061': 26.23, 'P0062': 17.59, 'P0063': 18.2, 'P0064'

In [29]:
df_prices = pd.DataFrame.from_dict(prices, orient="index", columns=["price"])
print(df_prices)


       price
P0001  22.14
P0002  26.91
P0003  16.90
P0004   7.04
P0005  20.84
...      ...
P1996  25.80
P1997  26.05
P1998  17.41
P1999   7.57
P2000  14.42

[2000 rows x 1 columns]


#### Aggregate

In [30]:
data = pd.merge(data, df_prices, left_index=True, right_index=True, how='left')
display(data)

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified,price
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
P0019,CityMart,2.81,26.83,38.75,24.89,323.0,25877.199625,253,202,2023-01-19 00:00:00,17.86
P0024,CityMart,3.30,59.23,34.99,21.78,321.0,45138.128706,21,225,2023-01-24 00:00:00,14.04
P0025,CityMart,2.34,22.60,16.90,60.12,291.0,22962.232800,316,278,2023-01-25 00:00:00,3.38
P0034,CityMart,6.54,18.59,68.72,21.99,126.0,28092.330552,612,233,2023-02-03 00:00:00,15.11
P0039,CityMart,9.94,57.89,88.33,35.45,312.0,181270.870165,968,203,2023-02-08 00:00:00,14.20
...,...,...,...,...,...,...,...,...,...,...,...
P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637,235,1843344000000,15.85
P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60,241,1843603200000,17.98
P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.104900,925,237,1843689600000,17.75
P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317,256,1844121600000,22.68


In [31]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

np.float64(44.10144429622423)

### Scrapping sources

In [32]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up the Selenium WebDriver (e.g., Chrome)
driver = webdriver.Chrome()  # Make sure ChromeDriver is installed
# driver = webdriver.Firefox()
# driver = webdriver.Edge()
# driver = webdriver.Safari()

# Open the URL
url = 'https://www.raphaelcousin.com/module4/scrapable-data'
driver.get(url)

# Wait for the page to fully load (increase time if needed)
time.sleep(5)

# Get the fully rendered page source
html = driver.page_source

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Initialize lists to store scraped data
exercise_data = []

# Find both tables
tables = soup.find_all('table')

# Close the Selenium WebDriver
driver.quit()

# Scrape the second table (Exercise Data)
course_table = tables[1]
for row in course_table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    #exercise_data.append({ TODO })

# Convert the lists to pandas DataFrames
df_exercise = pd.DataFrame(exercise_data)
df_exercise


#### Aggregate

In [33]:
data = pd.merge(data, df_exercise, left_index=True, right_index=True, how='left')

In [34]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

np.float64(44.10144429622423)

### Generating Submission File

In [35]:
#X_test =  read  Neighborhood_Market_data

# read
df_StoreN =  pd.read_csv("Neighborhood_Market_data.csv", sep=",", index_col='item_code')
df_StoreN

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02
P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04
P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05
P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,2023-01-10
P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13
...,...,...,...,...,...,...,...,...,...
P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,2028-05-25
P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,2028-05-30
P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,2028-06-13
P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,2028-06-19


In [36]:
df_StoreN = pd.merge(df_StoreN, df_prices, left_index=True, right_index=True, how='left')
df_StoreN = pd.merge(df_StoreN, df_exercise, left_index=True, right_index=True, how='left')

In [39]:
_, y_pred = get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold', X_data_test = df_StoreN)

In [40]:
submission = pd.DataFrame({
    'item_code': df_StoreN.index,
    'quantity_sold': y_pred # your_prediction
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,item_code,quantity_sold
0,P0002,176.692597
1,P0004,221.330805
2,P0005,184.800406
3,P0010,224.960678
4,P0013,215.053125
