In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


## Data Collection

### Files sources

In [3]:
import requests

# URLs of the files
train_data_url_StoreA = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/course/StoreA_data.csv'
train_data_url_StoreB = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/course/StoreB_data.csv'
train_data_url_StoreC = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/course/StoreC_data.xlsx'
train_data_url_StoreD = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/course/StoreD_data.json'

test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/course/StoreE_data.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_data_url_StoreA, 'StoreA_data.csv')
download_file(train_data_url_StoreB, 'StoreB_data.csv')
download_file(train_data_url_StoreC, 'StoreC_data.xlsx')
download_file(train_data_url_StoreD, 'StoreD_data.json')

download_file(test_data_url, 'StoreE_data.csv')

Downloaded StoreA_data.csv from https://www.raphaelcousin.com/modules/data-science-practice/module4/course/StoreA_data.csv
Downloaded StoreB_data.csv from https://www.raphaelcousin.com/modules/data-science-practice/module4/course/StoreB_data.csv
Downloaded StoreC_data.xlsx from https://www.raphaelcousin.com/modules/data-science-practice/module4/course/StoreC_data.xlsx
Downloaded StoreD_data.json from https://www.raphaelcousin.com/modules/data-science-practice/module4/course/StoreD_data.json
Downloaded StoreE_data.csv from https://www.raphaelcousin.com/modules/data-science-practice/module4/course/StoreE_data.csv


#### StoreA

In [4]:
# read
df_StoreA =  pd.read_csv("StoreA_data.csv", sep=",", index_col='product_id')
df_StoreA

Unnamed: 0_level_0,store,weight,length,width,days_since_last_sale,days_in_stock,price,number_sold,last_updated
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0019,StoreA,8.71,12.81,58.68,190,651,17.23,214,2023-01-19
P0024,StoreA,3.42,41.28,38.92,115,317,10.63,188,2023-01-24
P0025,StoreA,8.05,24.54,92.99,337,123,21.49,176,2023-01-25
P0034,StoreA,4.58,32.82,7.77,85,717,7.68,169,2023-02-03
P0039,StoreA,5.03,82.12,42.79,266,542,7.71,193,2023-02-08
...,...,...,...,...,...,...,...,...,...
P0956,StoreA,2.23,69.04,91.37,168,38,12.37,115,2025-08-13
P0960,StoreA,8.04,32.60,23.11,163,571,16.67,131,2025-08-17
P0967,StoreA,,64.94,55.49,84,569,1.00,192,2025-08-24
P0997,StoreA,5.17,59.11,61.51,316,366,15.79,90,2025-09-23


#### StoreB

In [5]:
# read
df_StoreB =  pd.read_csv("StoreB_data.csv", sep=";", header=1, index_col='PRODUCT_ID')
df_StoreB

Unnamed: 0_level_0,STORE,WEIGHT,LENGTH,WIDTH,DAYS_SINCE_LAST_SALE,DAYS_IN_STOCK,PRICE,NUMBER_SOLD,LAST_UPDATED,Unnamed: 10
PRODUCT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P0006,StoreB,1.61,75.06,91.25,301.0,783,8.26,220,2023-01-06,
P0014,StoreB,1.22,14.35,73.24,272.0,657,6.84,71,2023-01-14,
P0016,StoreB,8.41,77.62,65.86,255.0,866,16.70,119,2023-01-16,
P0021,StoreB,5.95,33.07,52.40,24.0,810,17.04,126,2023-01-21,
P0028,StoreB,4.04,78.59,26.10,136.0,593,6.23,94,2023-01-28,
...,...,...,...,...,...,...,...,...,...,...
P0984,StoreB,8.42,24.87,90.35,276.0,490,8.24,171,2025-09-10,
P0990,StoreB,8.96,27.99,14.92,342.0,413,12.90,136,2025-09-16,
P0991,StoreB,6.40,96.26,59.80,104.0,434,15.06,168,2025-09-17,
P0994,StoreB,5.23,99.51,98.15,32.0,952,19.71,195,2025-09-20,


In [6]:
# format
df_StoreB.drop('Unnamed: 10', axis=1, inplace=True)
df_StoreB.columns = [col.lower() for col in df_StoreB.columns]
df_StoreB.rename_axis(df_StoreB.index.name.lower(), inplace=True)
df_StoreB

Unnamed: 0_level_0,store,weight,length,width,days_since_last_sale,days_in_stock,price,number_sold,last_updated
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0006,StoreB,1.61,75.06,91.25,301.0,783,8.26,220,2023-01-06
P0014,StoreB,1.22,14.35,73.24,272.0,657,6.84,71,2023-01-14
P0016,StoreB,8.41,77.62,65.86,255.0,866,16.70,119,2023-01-16
P0021,StoreB,5.95,33.07,52.40,24.0,810,17.04,126,2023-01-21
P0028,StoreB,4.04,78.59,26.10,136.0,593,6.23,94,2023-01-28
...,...,...,...,...,...,...,...,...,...
P0984,StoreB,8.42,24.87,90.35,276.0,490,8.24,171,2025-09-10
P0990,StoreB,8.96,27.99,14.92,342.0,413,12.90,136,2025-09-16
P0991,StoreB,6.40,96.26,59.80,104.0,434,15.06,168,2025-09-17
P0994,StoreB,5.23,99.51,98.15,32.0,952,19.71,195,2025-09-20


#### StoreC

In [7]:
# read
# !pip install openpyxl
df_StoreCs =  pd.read_excel("StoreC_data.xlsx", sheet_name=None)
df_StoreCs

{'Sheet1':     product_id  number_sold
 0        P0003           28
 1        P0007           26
 2        P0008           34
 3        P0009           17
 4        P0012            9
 ..         ...          ...
 185      P0985           46
 186      P0986           25
 187      P0989           12
 188      P0992            3
 189      P1000           54
 
 [190 rows x 2 columns],
 'Sheet2':     product id   store  weight  length  width  days_since_last_sale  \
 0        P0003  StoreC    7.00   11.77  98.21                   120   
 1        P0007  StoreC    5.81   38.56  94.65                   254   
 2        P0008  StoreC    6.11   33.17  96.21                   121   
 3        P0009  StoreC    4.30   38.22  54.54                    30   
 4        P0012  StoreC    9.26   22.59  20.36                   105   
 ..         ...     ...     ...     ...    ...                   ...   
 185      P0985  StoreC    0.48    9.75  42.95                    64   
 186      P0986  StoreC    9.

In [8]:
# format
df_StoreC = pd.merge(df_StoreCs['Sheet1'], df_StoreCs['Sheet2'], left_on = 'product_id' , right_on = 'product id')
df_StoreC = df_StoreC[['product_id', 'store', 'weight', 'length', 'width', 'days_since_last_sale', 'days_in_stock',	'price']]
df_StoreC.set_index('product_id', inplace=True)
df_StoreC

Unnamed: 0_level_0,store,weight,length,width,days_since_last_sale,days_in_stock,price
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P0003,StoreC,7.00,11.77,98.21,120,513,17.85
P0007,StoreC,5.81,38.56,94.65,254,753,11.94
P0008,StoreC,6.11,33.17,96.21,121,540,7.24
P0009,StoreC,4.30,38.22,54.54,30,735,8.37
P0012,StoreC,9.26,22.59,20.36,105,621,16.36
...,...,...,...,...,...,...,...
P0985,StoreC,0.48,9.75,42.95,64,746,3.85
P0986,StoreC,9.03,85.15,36.05,270,643,19.46
P0989,StoreC,6.63,86.11,81.09,137,316,9.98
P0992,StoreC,6.18,23.71,49.08,33,2,10.66


#### StoreD

In [9]:
# read
df_StoreD = pd.read_json('StoreD_data.json', orient='records')
df_StoreD

Unnamed: 0,product_id,store,weight,length,width,days_since_last_sale,days_in_stock,price,number_sold,last_updated
0,P0001,StoreD,9.33,73.05,11.94,236,65,12.00,40,1672531200000
1,P0011,StoreD,9.35,67.83,76.94,284,496,15.07,42,1673395200000
2,P0015,StoreD,9.85,67.73,28.50,231,369,13.11,30,1673740800000
3,P0017,StoreD,1.33,30.18,68.33,23,909,3.21,54,1673913600000
4,P0020,StoreD,5.24,96.95,78.30,346,796,9.94,65,1674172800000
...,...,...,...,...,...,...,...,...,...,...
201,P0982,StoreD,4.43,20.75,58.83,256,257,9.67,34,1757289600000
202,P0983,StoreD,2.62,47.33,67.42,294,672,6.90,30,1757376000000
203,P0988,StoreD,6.41,80.35,83.52,316,659,11.00,72,1757808000000
204,P0993,StoreD,0.76,95.39,16.28,109,841,11.28,78,1758240000000


In [10]:
# format
df_StoreD.set_index('product_id', inplace=True)
df_StoreD

Unnamed: 0_level_0,store,weight,length,width,days_since_last_sale,days_in_stock,price,number_sold,last_updated
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0001,StoreD,9.33,73.05,11.94,236,65,12.00,40,1672531200000
P0011,StoreD,9.35,67.83,76.94,284,496,15.07,42,1673395200000
P0015,StoreD,9.85,67.73,28.50,231,369,13.11,30,1673740800000
P0017,StoreD,1.33,30.18,68.33,23,909,3.21,54,1673913600000
P0020,StoreD,5.24,96.95,78.30,346,796,9.94,65,1674172800000
...,...,...,...,...,...,...,...,...,...
P0982,StoreD,4.43,20.75,58.83,256,257,9.67,34,1757289600000
P0983,StoreD,2.62,47.33,67.42,294,672,6.90,30,1757376000000
P0988,StoreD,6.41,80.35,83.52,316,659,11.00,72,1757808000000
P0993,StoreD,0.76,95.39,16.28,109,841,11.28,78,1758240000000


#### Aggregate

In [11]:
data = pd.concat([df_StoreA, df_StoreB, df_StoreC, df_StoreD], axis=0)
data

Unnamed: 0_level_0,store,weight,length,width,days_since_last_sale,days_in_stock,price,number_sold,last_updated
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0019,StoreA,8.71,12.81,58.68,190.0,651,17.23,214.0,2023-01-19
P0024,StoreA,3.42,41.28,38.92,115.0,317,10.63,188.0,2023-01-24
P0025,StoreA,8.05,24.54,92.99,337.0,123,21.49,176.0,2023-01-25
P0034,StoreA,4.58,32.82,7.77,85.0,717,7.68,169.0,2023-02-03
P0039,StoreA,5.03,82.12,42.79,266.0,542,7.71,193.0,2023-02-08
...,...,...,...,...,...,...,...,...,...
P0982,StoreD,4.43,20.75,58.83,256.0,257,9.67,34.0,1757289600000
P0983,StoreD,2.62,47.33,67.42,294.0,672,6.90,30.0,1757376000000
P0988,StoreD,6.41,80.35,83.52,316.0,659,11.00,72.0,1757808000000
P0993,StoreD,0.76,95.39,16.28,109.0,841,11.28,78.0,1758240000000


#### Simple baseline

In [12]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):
    
    data = data.copy()
    # Handle missing values
    data.fillna(fillna_value, inplace=True)
    if X_data_test is not None:
        X_data_test = X_data_test.copy()
        X_data_test.fillna(fillna_value, inplace=True)
    
    # Drop unwanted columns
    if drop_cols:
        data.drop(drop_cols, axis=1, inplace=True)
        if X_data_test is not None:
            X_data_test.drop(drop_cols, axis=1, inplace=True)

    # Split data into features (X) and target (y)
    y = data[target_col]
    X = data.drop(target_col, axis=1)

    # Feature scaling
    if scaler == 'standard':
        scaler = StandardScaler()
    elif scaler == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = None
    
    if scaler:
        X = scaler.fit_transform(X)
        if X_data_test is not None:
            X_data_test = scaler.transform(X_data_test)

    # Initialize the model
    if model == 'linear':
        model = LinearRegression()
    elif model == 'logistic':
        model = LogisticRegression()
    elif model == 'random_forest':
        model = RandomForestClassifier()
    else:
        raise ValueError("Unsupported model type")

    # Initialize cross-validation
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    # Train and evaluate using k-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate using the specified metric
        if metric == 'mae':
            score = mean_absolute_error(y_test, y_pred)
        elif metric == 'accuracy':
            score = accuracy_score(y_test, np.round(y_pred))
        else:
            raise ValueError("Unsupported metric")

        scores.append(score)

    if X_data_test is not None:
        model.fit(X, y)
        return np.mean(scores), model.predict(X_data_test)
    
    # Return the average score
    return np.mean(scores)

In [13]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store', 'last_updated'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='number_sold')

np.float64(63.890089223694645)

### API sources

In [14]:
def get_api(endpoint_url):
    try:
        # Make the GET request to the mock API
        response = requests.get(endpoint_url)

        if response.status_code == 200:
            data = response.json()
            print(data["message"])
            return data['data']
        else:
            print(f"Failed to retrieve volume data. Status code: {response.status_code}")
    
    except Exception as e:
        print(f"An error occurred: {e}")
password = get_api("https://www.raphaelcousin.com/api/course/auth")["password"]
print(password)
volumes = get_api(f"https://www.raphaelcousin.com/api/course/{password}/volumes")
volumes

Authentication successful
jpyq6ouAjqu74A0
Volume data retrieved successfully


{'P0001': 48469.09869,
 'P0002': 11252.96382,
 'P0003': 24725.379062999997,
 'P0004': 8361.648239999999,
 'P0005': 255200.66715,
 'P0006': 644786.0415,
 'P0007': 302122.4971200001,
 'P0008': 314150.164308,
 'P0009': 61847.672796,
 'P0010': 746779.532672,
 'P0011': 239910.083994,
 'P0012': 17482.030523999998,
 'P0013': 10598.364972,
 'P0014': 10520.449939999999,
 'P0015': 136549.7757,
 'P0016': 247934.58020000003,
 'P0017': 54689.528088,
 'P0018': 9057.165282,
 'P0019': 21814.067016,
 'P0020': 454560.15780000004,
 'P0021': 67789.79616,
 'P0022': 39971.18208,
 'P0023': 31282.378228000005,
 'P0024': 80893.19616,
 'P0025': 87468.08641799999,
 'P0026': 17419.922304,
 'P0027': 337530.08437200007,
 'P0028': 74212.37982,
 'P0029': 58495.709272,
 'P0030': 61834.98204000001,
 'P0031': 32930.384384000005,
 'P0032': 69289.19848,
 'P0033': 313977.75333599997,
 'P0034': 6635.3966279999995,
 'P0035': 204784.65251999997,
 'P0036': 345513.98932000005,
 'P0037': 81655.593192,
 'P0038': 102508.7066319999

In [15]:
df_volumes = pd.DataFrame.from_dict(volumes, orient='index', columns=['volume'])

In [16]:
df_volumes

Unnamed: 0,volume
P0001,48469.098690
P0002,11252.963820
P0003,24725.379063
P0004,8361.648240
P0005,255200.667150
...,...
P0996,106905.739283
P0997,38976.377392
P0998,74218.906112
P0999,12521.682656


#### Aggregate

In [17]:
data = pd.merge(data, df_volumes, left_index=True, right_index=True, how='left')
data

Unnamed: 0_level_0,store,weight,length,width,days_since_last_sale,days_in_stock,price,number_sold,last_updated,volume
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P0019,StoreA,8.71,12.81,58.68,190.0,651,17.23,214.0,2023-01-19,21814.067016
P0024,StoreA,3.42,41.28,38.92,115.0,317,10.63,188.0,2023-01-24,80893.196160
P0025,StoreA,8.05,24.54,92.99,337.0,123,21.49,176.0,2023-01-25,87468.086418
P0034,StoreA,4.58,32.82,7.77,85.0,717,7.68,169.0,2023-02-03,6635.396628
P0039,StoreA,5.03,82.12,42.79,266.0,542,7.71,193.0,2023-02-08,170846.537576
...,...,...,...,...,...,...,...,...,...,...
P0982,StoreD,4.43,20.75,58.83,256.0,257,9.67,34.0,1757289600000,41797.538400
P0983,StoreD,2.62,47.33,67.42,294.0,672,6.90,30.0,1757376000000,89060.491826
P0988,StoreD,6.41,80.35,83.52,316.0,659,11.00,72.0,1757808000000,219309.989760
P0993,StoreD,0.76,95.39,16.28,109.0,841,11.28,78.0,1758240000000,80489.357036


In [18]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store', 'last_updated'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='number_sold')

np.float64(63.36403709446334)

### Scrapping sources

In [19]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up the Selenium WebDriver (e.g., Chrome)
driver = webdriver.Chrome()  # Make sure ChromeDriver is installed
# driver = webdriver.Firefox()
# driver = webdriver.Edge()
# driver = webdriver.Safari()

# Open the URL
url = 'https://www.raphaelcousin.com/module4/scrapable-data'
driver.get(url)

# Wait for the page to fully load (increase time if needed)
time.sleep(5)

# Get the fully rendered page source
html = driver.page_source

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Initialize lists to store scraped data
course_data = []
exercise_data = []

# Find both tables
tables = soup.find_all('table')

# Close the Selenium WebDriver
driver.quit()

# Scrape the first table (Course Data)
course_table = tables[1]
for row in course_table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    course_data.append({
        'Product ID': cols[0].text,
        'Rating': float(cols[1].text),
        'Number of Reviews': int(cols[2].text),
        # 'Updated Timestamp': cols[3].text
    })

# Convert the lists to pandas DataFrames
df_course = pd.DataFrame(course_data)
df_course


Unnamed: 0,Product ID,Rating,Number of Reviews
0,P0001,1.0,897
1,P0002,3.0,209
2,P0003,3.0,125
3,P0004,1.0,491
4,P0005,4.0,508
...,...,...,...
995,P0996,1.0,100
996,P0997,3.0,197
997,P0998,4.0,812
998,P0999,3.0,225


In [20]:
### format
df_course.set_index('Product ID', inplace=True)
# df_course.drop('Updated Timestamp', axis=1, inplace=True)

In [21]:
df_course

Unnamed: 0_level_0,Rating,Number of Reviews
Product ID,Unnamed: 1_level_1,Unnamed: 2_level_1
P0001,1.0,897
P0002,3.0,209
P0003,3.0,125
P0004,1.0,491
P0005,4.0,508
...,...,...
P0996,1.0,100
P0997,3.0,197
P0998,4.0,812
P0999,3.0,225


#### Aggregate

In [22]:
data = pd.merge(data, df_course, left_index=True, right_index=True, how='left')

In [23]:
data

Unnamed: 0_level_0,store,weight,length,width,days_since_last_sale,days_in_stock,price,number_sold,last_updated,volume,Rating,Number of Reviews
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
P0019,StoreA,8.71,12.81,58.68,190.0,651,17.23,214.0,2023-01-19,21814.067016,5.0,675
P0024,StoreA,3.42,41.28,38.92,115.0,317,10.63,188.0,2023-01-24,80893.196160,2.0,837
P0025,StoreA,8.05,24.54,92.99,337.0,123,21.49,176.0,2023-01-25,87468.086418,4.0,688
P0034,StoreA,4.58,32.82,7.77,85.0,717,7.68,169.0,2023-02-03,6635.396628,5.0,254
P0039,StoreA,5.03,82.12,42.79,266.0,542,7.71,193.0,2023-02-08,170846.537576,3.0,765
...,...,...,...,...,...,...,...,...,...,...,...,...
P0982,StoreD,4.43,20.75,58.83,256.0,257,9.67,34.0,1757289600000,41797.538400,3.0,664
P0983,StoreD,2.62,47.33,67.42,294.0,672,6.90,30.0,1757376000000,89060.491826,1.0,65
P0988,StoreD,6.41,80.35,83.52,316.0,659,11.00,72.0,1757808000000,219309.989760,3.0,933
P0993,StoreD,0.76,95.39,16.28,109.0,841,11.28,78.0,1758240000000,80489.357036,3.0,992


In [24]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store', 'last_updated'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='number_sold')

np.float64(62.37499476349645)

# Get test data and Predict baseline

In [25]:
# read
df_StoreE =  pd.read_csv("StoreE_data.csv", sep=",", index_col='product_id')
df_StoreE

Unnamed: 0_level_0,store,weight,length,width,days_since_last_sale,days_in_stock,price,last_updated
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
P0002,StoreE,5.69,11.85,64.12,25,333,13.90,2023-01-02
P0004,StoreE,9.23,6.15,23.06,181,133,9.67,2023-01-04
P0005,StoreE,7.10,95.87,80.30,198,592,1.11,2023-01-05
P0010,StoreE,7.39,78.59,97.84,314,535,18.03,2023-01-10
P0013,StoreE,4.56,21.54,50.31,223,619,10.50,2023-01-13
...,...,...,...,...,...,...,...,...
P0973,StoreE,3.82,52.56,20.62,103,56,8.57,2025-08-30
P0974,StoreE,4.01,69.56,16.08,229,577,16.72,2025-08-31
P0977,StoreE,8.93,81.66,37.68,126,728,9.41,2025-09-03
P0987,StoreE,4.67,98.21,6.03,187,49,9.69,2025-09-13


In [26]:
df_StoreE = pd.merge(df_StoreE, df_volumes, left_index=True, right_index=True, how='left')
df_StoreE = pd.merge(df_StoreE, df_course, left_index=True, right_index=True, how='left')
df_StoreE

Unnamed: 0_level_0,store,weight,length,width,days_since_last_sale,days_in_stock,price,last_updated,volume,Rating,Number of Reviews
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
P0002,StoreE,5.69,11.85,64.12,25,333,13.90,2023-01-02,11252.963820,3.0,209
P0004,StoreE,9.23,6.15,23.06,181,133,9.67,2023-01-04,8361.648240,1.0,491
P0005,StoreE,7.10,95.87,80.30,198,592,1.11,2023-01-05,255200.667150,4.0,508
P0010,StoreE,7.39,78.59,97.84,314,535,18.03,2023-01-10,746779.532672,1.0,257
P0013,StoreE,4.56,21.54,50.31,223,619,10.50,2023-01-13,10598.364972,5.0,229
...,...,...,...,...,...,...,...,...,...,...,...
P0973,StoreE,3.82,52.56,20.62,103,56,8.57,2025-08-30,78151.894992,3.0,938
P0974,StoreE,4.01,69.56,16.08,229,577,16.72,2025-08-31,42313.793184,4.0,227
P0977,StoreE,8.93,81.66,37.68,126,728,9.41,2025-09-03,84000.702240,4.0,808
P0987,StoreE,4.67,98.21,6.03,187,49,9.69,2025-09-13,34217.680014,4.0,828


In [27]:
_, y_pred = get_simple_baseline(data,
                    fillna_value=-1,
                    drop_cols=['store', 'last_updated'],
                    k_fold=5,
                    scaler='standard',
                    model='linear',
                    metric='mae',
                    target_col='number_sold',
                    X_data_test=df_StoreE)

In [28]:
y_pred

array([ 67.90453545,  46.95558007, 127.92489331, 137.97583151,
        74.31243302,  48.77703122,  45.64402492,  70.26889329,
        73.7884928 , 115.08914847, 107.47438371,  89.83073251,
        59.78908047,  68.22912505, 133.66609817,  85.97293463,
        62.13011634,  37.7857744 , 135.55357179,  58.24573651,
        61.07321795,  55.36755351, 124.34572987, 112.7262506 ,
        97.65015884,  71.28023515, 143.31507255,  42.84092642,
        82.19764963, 131.00549092,  78.8360927 ,  93.90120972,
        47.00528043,  57.03465577,  90.27071828,  46.70520914,
       123.95342151,  31.89118969, 100.60938102,  86.1941139 ,
        82.0566231 ,  65.57191185,  79.4951426 ,  37.88752788,
        48.48075477, 153.81045922, 109.18993412,  50.10584081,
        79.8012506 ,  86.21810961,  40.17180922,  95.50665851,
        61.37097124, 105.98693691,  82.56402473,  96.50450754,
        78.01877128,  61.45965184,  59.80601001, 103.93980409,
        66.37224847,  75.67271765, 108.71489226, 136.81