In [62]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

## Data Collection

### Files sources

In [63]:
import requests
# URLs of the files
train_datas_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_datas_url, 'module4_exercise_train.zip')
download_file(test_data_url, 'Neighborhood_Market_data.csv')

Downloaded module4_exercise_train.zip from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip
Downloaded Neighborhood_Market_data.csv from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv


#### Ouvrir le fichier zippé

In [64]:
import zipfile

# chemin vers ton fichier ZIP
zip_path = 'module4_exercise_train.zip'

# Ouvrir le  zip et afficher le nom des fichiers
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    # lister les fichiers contenus
    print("Fichiers dans le zip :", zip_ref.namelist())

Fichiers dans le zip : ['CityMart_data.csv', 'Greenfield_Grocers_data.csv', 'HighStreet_Bazaar_data.json', 'SuperSaver_Outlet_data.xlsx']


#### CityMart

In [65]:
# read "CityMart_data.csv"

# premier fichier CSV à l'intérieur du zip
csv_filename = zip_ref.namelist()[0]
print(csv_filename)

# Ouvire le zip et ouvrir le fichier
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    # lire le CSV directement sans l'extraire
    with zip_ref.open(csv_filename) as f:
        df_CityMart = pd.read_csv(f)
print(df_CityMart.head())

CityMart_data.csv
  item_code store_name  mass  dimension_length  dimension_width  \
0     P0019   CityMart  2.81             26.83            38.75   
1     P0024   CityMart  3.30             59.23            34.99   
2     P0025   CityMart  2.34             22.60            16.90   
3     P0034   CityMart  6.54             18.59            68.72   
4     P0039   CityMart  9.94             57.89            88.33   

   dimension_height  days_since_last_purchase  package_volume  stock_age  \
0             24.89                       323    25877.199625        253   
1             21.78                       321    45138.128706         21   
2             60.12                       291    22962.232800        316   
3             21.99                       126    28092.330552        612   
4             35.45                       312   181270.870165        968   

   quantity_sold last_modified  
0            202    2023-01-19  
1            225    2023-01-24  
2            278    202

#### Greenfield_Grocers

In [66]:
# read "Greenfield_Grocers_data.csv"

# second fichier CSV à l'intérieur du zip
csv_filename2 = zip_ref.namelist()[1]
print(csv_filename2)

# Ouvire le zip et ouvrir le fichier
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    # lire le CSV directement sans l'extraire
    with zip_ref.open(csv_filename2) as f:
        df_Greenfield = pd.read_csv(f, sep="|", header=3)
print(df_Greenfield.head())
print(df_Greenfield.shape)

# On supprime les deux dernières colonnes
df_Greenfield = df_Greenfield.drop(['Unnamed: 12','1'], axis=1)
print(df_Greenfield.shape)

# On mets les noms des colonnes en minuscule
df_Greenfield.columns = df_Greenfield.columns.str.lower()
print(df_Greenfield.head())

Greenfield_Grocers_data.csv
  ITEM_CODE          STORE_NAME  MASS  DIMENSION_LENGTH  DIMENSION_WIDTH  \
0     P0006  Greenfield_Grocers  5.02             86.68            71.64   
1     P0014  Greenfield_Grocers  9.91             21.67            54.91   
2     P0016  Greenfield_Grocers  1.13             60.03            97.39   
3     P0021  Greenfield_Grocers  0.95             40.36            67.91   
4     P0028  Greenfield_Grocers  5.24             22.37            61.78   

   DIMENSION_HEIGHT  DAYS_SINCE_LAST_PURCHASE  PACKAGE_VOLUME  STOCK_AGE  \
0             16.42                        66   101964.180384        450   
1             84.80                        99   100903.494560        827   
2             90.00                       348   526168.953000        981   
3             99.76                       301   273426.956576        289   
4             68.67                        15    94903.217262        423   

   QUANTITY_SOLD LAST_MODIFIED   1  Unnamed: 12  
0       

#### Outlet_data

In [67]:
# read "SuperSaver_Outlet_data.xlsx"

# fichier xlsx à l'intérieur du zip
xlsx_filename = zip_ref.namelist()[3]
print(xlsx_filename)

# Ouvire le zip et ouvrir le fichier
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    # lire le xlsx directement sans l'extraire
    with zip_ref.open(xlsx_filename) as f:
        df_SuperSaver = pd.read_excel(f)
print(df_SuperSaver.head())

SuperSaver_Outlet_data.xlsx
  item_code  quantity_sold
0     P0003            198
1     P0007            211
2     P0008            200
3     P0009            209
4     P0012            186


#### HighStreet_Bazaar

In [68]:
# read 'HighStreet_Bazaar_data.json'

# fichier json à l'intérieur du zip
json_filename = zip_ref.namelist()[2]
print(json_filename)

# Ouvire le zip et ouvrir le fichier
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    # lire le xlsx directement sans l'extraire
    with zip_ref.open(json_filename) as f:
        df_HighStreet = pd.read_json(f)
print(df_HighStreet.head())

# Mettre les dates en format date
print(df_HighStreet['last_modified'].dtype)
df_HighStreet['last_modified'] = pd.to_datetime(df_HighStreet['last_modified'], unit='ms').dt.date # ms = millisecond
print(df_HighStreet.head())

HighStreet_Bazaar_data.json
  item_code         store_name  mass  dimension_length  dimension_width  \
0     P0001  HighStreet_Bazaar  6.11             75.46            91.62   
1     P0011  HighStreet_Bazaar  4.34             16.97            93.21   
2     P0015  HighStreet_Bazaar  1.37             58.93            11.28   
3     P0017  HighStreet_Bazaar  7.27             51.51            39.21   
4     P0020  HighStreet_Bazaar  0.89             57.50            69.84   

   dimension_height  days_since_last_purchase  package_volume  stock_age  \
0             92.08                      78.0   636608.450016        237   
1             54.58                     344.0    86333.208546        184   
2             73.87                     344.0    49103.634648        946   
3             16.17                     138.0    32658.663807        268   
4              6.12                     333.0    24576.696000        396   

   quantity_sold  last_modified  
0            346  167253120000

#### Concaténation des fichiers

In [69]:
# Concat: Stack DataFrames vertically or horizontally
dfs = [
    df_CityMart.set_index("item_code"),
    df_Greenfield.set_index("item_code"),
    df_HighStreet.set_index("item_code"),
    df_SuperSaver.set_index("item_code")
]

df_vertical = pd.concat(dfs, axis=0)

print(df_vertical.head())
print(df_vertical.shape)
print(len(df_vertical))
print(len(df_CityMart) + len(df_Greenfield) + len(df_HighStreet) + len(df_SuperSaver))

          store_name  mass  dimension_length  dimension_width  \
item_code                                                       
P0019       CityMart  2.81             26.83            38.75   
P0024       CityMart  3.30             59.23            34.99   
P0025       CityMart  2.34             22.60            16.90   
P0034       CityMart  6.54             18.59            68.72   
P0039       CityMart  9.94             57.89            88.33   

           dimension_height  days_since_last_purchase  package_volume  \
item_code                                                               
P0019                 24.89                     323.0    25877.199625   
P0024                 21.78                     321.0    45138.128706   
P0025                 60.12                     291.0    22962.232800   
P0034                 21.99                     126.0    28092.330552   
P0039                 35.45                     312.0   181270.870165   

           stock_age  quantity_s

#### Aggregate

#### Simple baseline

In [70]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):
    
    data = data.copy()
    # Handle missing values
    data.fillna(fillna_value, inplace=True)
    if X_data_test is not None:
        X_data_test = X_data_test.copy()
        X_data_test.fillna(fillna_value, inplace=True)
    
    # Drop unwanted columns
    if drop_cols:
        data.drop(drop_cols, axis=1, inplace=True)
        if X_data_test is not None:
            X_data_test.drop(drop_cols, axis=1, inplace=True)

    # Split data into features (X) and target (y)
    y = data[target_col]
    X = data.drop(target_col, axis=1)

    # Feature scaling
    if scaler == 'standard':
        scaler = StandardScaler()
    elif scaler == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = None
    
    if scaler:
        X = scaler.fit_transform(X)
        if X_data_test is not None:
            X_data_test = scaler.transform(X_data_test)

    # Initialize the model
    if model == 'linear':
        model = LinearRegression()
    elif model == 'logistic':
        model = LogisticRegression()
    elif model == 'random_forest':
        model = RandomForestClassifier()
    else:
        raise ValueError("Unsupported model type")

    # Initialize cross-validation
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    # Train and evaluate using k-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate using the specified metric
        if metric == 'mae':
            score = mean_absolute_error(y_test, y_pred)
        elif metric == 'accuracy':
            score = accuracy_score(y_test, np.round(y_pred))
        else:
            raise ValueError("Unsupported metric")

        scores.append(score)

    if X_data_test is not None:
        model.fit(X, y)
        return np.mean(scores), model.predict(X_data_test)
    
    # Return the average score
    return np.mean(scores)

In [71]:
# get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

get_simple_baseline(df_vertical, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

np.float64(45.25623515880098)

### API sources

In [72]:
import requests

def get_api(endpoint_url):
    try:
        # Make the GET request to the mock API
        response = requests.get(endpoint_url)

        if response.status_code == 200:
            data = response.json()
            print(data["message"])
            return data['data']
        else:
            print(f"Failed to retrieve volume data. Status code: {response.status_code}")
    
    except Exception as e:
        print(f"An error occurred: {e}")
        
password = get_api("https://www.raphaelcousin.com/api/exercise/auth")["password"]
print(password)
prices = get_api(f"https://www.raphaelcousin.com/api/exercise/{password}/prices")
print(len(prices))
print(type(prices))

Authentication successful
RcUZjhdsYLRzwi4
Volume data retrieved successfully
2000
<class 'dict'>


In [73]:
# df_prices = 
df_prices = pd.DataFrame.from_dict(prices, orient="index", columns=["price"])
print(len(df_prices))
print(df_prices.head())

2000
       price
P0001  22.14
P0002  26.91
P0003  16.90
P0004   7.04
P0005  20.84


#### Aggregate

In [74]:
data = pd.merge(df_vertical, df_prices, left_index=True, right_index=True, how='left')
print(data.shape)
print(data.head())

(1591, 11)
          store_name  mass  dimension_length  dimension_width  \
item_code                                                       
P0019       CityMart  2.81             26.83            38.75   
P0024       CityMart  3.30             59.23            34.99   
P0025       CityMart  2.34             22.60            16.90   
P0034       CityMart  6.54             18.59            68.72   
P0039       CityMart  9.94             57.89            88.33   

           dimension_height  days_since_last_purchase  package_volume  \
item_code                                                               
P0019                 24.89                     323.0    25877.199625   
P0024                 21.78                     321.0    45138.128706   
P0025                 60.12                     291.0    22962.232800   
P0034                 21.99                     126.0    28092.330552   
P0039                 35.45                     312.0   181270.870165   

           stock_age 

In [75]:
# get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')
get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

np.float64(44.454488170296564)

### Scrapping sources

In [76]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up the Selenium WebDriver (e.g., Chrome)
driver = webdriver.Chrome()  # Make sure ChromeDriver is installed
# driver = webdriver.Firefox()
# driver = webdriver.Edge()
# driver = webdriver.Safari()

# Open the URL
url = 'https://www.raphaelcousin.com/module4/scrapable-data'
driver.get(url)

# Wait for the page to fully load (increase time if needed)
time.sleep(5)

# Get the fully rendered page source
html = driver.page_source

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Initialize lists to store scraped data
exercise_data = []

# Find both tables
tables = soup.find_all('table')

# Close the Selenium WebDriver
driver.quit()

# Scrape the second table (Exercise Data)
course_table = tables[1]
for row in course_table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    exercise_data.append({ 
        'Item Code': cols[0].text,
        'Customer Score': cols[1].text,
        'Total Reviews': cols[2].text,
                            })

# Convert the lists to pandas DataFrames
df_exercise = pd.DataFrame(exercise_data)
print(df_exercise.head())

# On mets les noms des colonnes en minuscule
df_exercise.columns = df_exercise.columns.str.lower()
# Fixe item code comme index
df_exercise = df_exercise.set_index("item code")
print(df_exercise.head())


  Item Code Customer Score Total Reviews
0     P0001              2           972
1     P0002              3           260
2     P0003              2           285
3     P0004              5           512
4     P0005              3            85
          customer score total reviews
item code                             
P0001                  2           972
P0002                  3           260
P0003                  2           285
P0004                  5           512
P0005                  3            85


#### Aggregate

In [77]:
data = pd.merge(data, df_exercise, left_index=True, right_index=True, how='left')
print(data.shape)
print(data.head())

(1591, 13)
          store_name  mass  dimension_length  dimension_width  \
item_code                                                       
P0019       CityMart  2.81             26.83            38.75   
P0024       CityMart  3.30             59.23            34.99   
P0025       CityMart  2.34             22.60            16.90   
P0034       CityMart  6.54             18.59            68.72   
P0039       CityMart  9.94             57.89            88.33   

           dimension_height  days_since_last_purchase  package_volume  \
item_code                                                               
P0019                 24.89                     323.0    25877.199625   
P0024                 21.78                     321.0    45138.128706   
P0025                 60.12                     291.0    22962.232800   
P0034                 21.99                     126.0    28092.330552   
P0039                 35.45                     312.0   181270.870165   

           stock_age 

In [78]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

np.float64(40.91896306914744)

## Data Pre-Processing

In [79]:
# Find out if there is at least one missing value 
null = data.isnull().values
if null.any() >=1 :
    count = len(data[data.isnull().any(axis=1)])
    print(f"Number of rows with at least one missing value: {count}")
    proportion = count / len(data) * 100
    print(f"Proportion of rows with missing values in the dataset: {np.round(proportion,2)}")

# On remplace les valeurs manquantes des variables numériques par la médiane
data2=data.fillna(data.median(numeric_only=True))


Number of rows with at least one missing value: 388
Proportion of rows with missing values in the dataset: 24.39


In [80]:
get_simple_baseline(data2, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

np.float64(40.861081461478406)

In [81]:
# On change la manière de traiter les valeurs manquantes
get_simple_baseline(data, fillna_value=data.median(numeric_only=True), drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

np.float64(40.861081461478406)

In [82]:
# On diminue le nombre de folds (5 -> 3)
get_simple_baseline(data, fillna_value=data.median(numeric_only=True), drop_cols=['store_name', 'last_modified'], k_fold=3, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

np.float64(40.81888740766508)

### Generating Submission File

In [83]:
# X_test =  read  Neighborhood_Market_data

# read
df_StoreN =  pd.read_csv("Neighborhood_Market_data.csv", sep=",", index_col='item_code')
df_StoreN

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02
P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04
P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05
P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,2023-01-10
P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13
...,...,...,...,...,...,...,...,...,...
P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,2028-05-25
P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,2028-05-30
P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,2028-06-13
P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,2028-06-19


In [84]:
df_StoreN = pd.merge(df_StoreN, df_prices, left_index=True, right_index=True, how='left')
df_StoreN = pd.merge(df_StoreN, df_exercise, left_index=True, right_index=True, how='left')

In [85]:
m, y_pred = get_simple_baseline(data, fillna_value=data.median(numeric_only=True), drop_cols=['store_name', 'last_modified'], k_fold=3, scaler='standard', model='linear', metric='mae', target_col='quantity_sold', X_data_test = df_StoreN)

In [86]:
print(f"mae: {m}")

mae: 40.81888740766508


In [87]:
submission = pd.DataFrame({
    'item_code': df_StoreN.index,
    'quantity_sold': y_pred # your_prediction
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,item_code,quantity_sold
0,P0002,164.462781
1,P0004,242.834845
2,P0005,162.354364
3,P0010,226.819797
4,P0013,238.92656
