In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import requests

In [4]:
# URLs of the files
train_datas_url = 'https://www.raphaelcousin.com/modules/module4/exercise/module4_exercise_train.zip'
test_data_url = 'https://www.raphaelcousin.com/modules/module4/exercise/Neighborhood_Market_data.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_datas_url, 'module4_exercise_train.zip')
download_file(test_data_url, 'Neighborhood_Market_data.csv')

Downloaded module4_exercise_train.zip from https://www.raphaelcousin.com/modules/module4/exercise/module4_exercise_train.zip
Downloaded Neighborhood_Market_data.csv from https://www.raphaelcousin.com/modules/module4/exercise/Neighborhood_Market_data.csv


## CityMart


In [7]:
# Basic reading
df_CM = pd.read_csv('module4_exercise_train/CityMart_data.csv')
print(df_CM.head())

  item_code store_name  mass  dimension_length  dimension_width  \
0     P0019   CityMart  2.81             26.83            38.75   
1     P0024   CityMart  3.30             59.23            34.99   
2     P0025   CityMart  2.34             22.60            16.90   
3     P0034   CityMart  6.54             18.59            68.72   
4     P0039   CityMart  9.94             57.89            88.33   

   dimension_height  days_since_last_purchase  package_volume  stock_age  \
0             24.89                       323    25877.199625        253   
1             21.78                       321    45138.128706         21   
2             60.12                       291    22962.232800        316   
3             21.99                       126    28092.330552        612   
4             35.45                       312   181270.870165        968   

   quantity_sold last_modified  
0            202    2023-01-19  
1            225    2023-01-24  
2            278    2023-01-25  
3       

## Greenfield_Grocers

In [9]:
# With specific options
df_GG = pd.read_csv('module4_exercise_train/Greenfield_Grocers_data.csv', 
                 delimiter='|',
                 skiprows=3) 
df_GG = df_GG.iloc[:, :-2]
df_GG.columns = df_GG.columns.str.lower()
print(df_GG.head())

  item_code          store_name  mass  dimension_length  dimension_width  \
0     P0006  Greenfield_Grocers  5.02             86.68            71.64   
1     P0014  Greenfield_Grocers  9.91             21.67            54.91   
2     P0016  Greenfield_Grocers  1.13             60.03            97.39   
3     P0021  Greenfield_Grocers  0.95             40.36            67.91   
4     P0028  Greenfield_Grocers  5.24             22.37            61.78   

   dimension_height  days_since_last_purchase  package_volume  stock_age  \
0             16.42                        66   101964.180384        450   
1             84.80                        99   100903.494560        827   
2             90.00                       348   526168.953000        981   
3             99.76                       301   273426.956576        289   
4             68.67                        15    94903.217262        423   

   quantity_sold last_modified  
0            130    2023-01-06  
1            118    

## Outlet_data

In [11]:
df_OL_Q = pd.read_excel('module4_exercise_train/SuperSaver_Outlet_data.xlsx', 
                   sheet_name='Quantity') # specify sheet name or index)  
df_OL_Inf = pd.read_excel('module4_exercise_train/SuperSaver_Outlet_data.xlsx', 
                   sheet_name='Info') # specify sheet name or index)  
newCol = ['item_code', 'store_name', 'mass', 'dimension_length',
       'dimension_width', 'dimension_height', 'days_since_last_purchase',
       'package_volume', 'stock_age','last_modified']
df_OL_Inf.columns=newCol

df_OL_merged = pd.merge(
    left=df_OL_Q,
    right=df_OL_Inf,
    on='item_code',          # Column to join on
    how='left'                   # Join type: left, right, inner, outer
)
print(df_OL_merged.head())

  item_code  quantity_sold         store_name  mass  dimension_length  \
0     P0003            198  SuperSaver_Outlet  2.40             50.90   
1     P0007            211  SuperSaver_Outlet  9.88             43.65   
2     P0008            200  SuperSaver_Outlet  1.45             36.18   
3     P0009            209  SuperSaver_Outlet  6.98             48.13   
4     P0012            186  SuperSaver_Outlet  7.20             23.66   

   dimension_width  dimension_height  days_since_last_purchase  \
0             8.80             68.82                       337   
1            48.64             55.78                        16   
2            10.23             69.14                       315   
3            45.55             44.66                       295   
4            65.90             12.05                        40   

   package_volume  stock_age  last_modified  
0    30825.854400        277            NaN  
1   118428.526080        291            NaN  
2    25590.193596        1

## HighStreet_Bazaar

In [13]:
df_HB = pd.read_json('module4_exercise_train/HighStreet_Bazaar_data.json')
print(df_HB.shape)
print(df_HB.head())

(396, 11)
  item_code         store_name  mass  dimension_length  dimension_width  \
0     P0001  HighStreet_Bazaar  6.11             75.46            91.62   
1     P0011  HighStreet_Bazaar  4.34             16.97            93.21   
2     P0015  HighStreet_Bazaar  1.37             58.93            11.28   
3     P0017  HighStreet_Bazaar  7.27             51.51            39.21   
4     P0020  HighStreet_Bazaar  0.89             57.50            69.84   

   dimension_height  days_since_last_purchase  package_volume  stock_age  \
0             92.08                      78.0   636608.450016        237   
1             54.58                     344.0    86333.208546        184   
2             73.87                     344.0    49103.634648        946   
3             16.17                     138.0    32658.663807        268   
4              6.12                     333.0    24576.696000        396   

   quantity_sold  last_modified  
0            346  1672531200000  
1            2

## Neighborhood_Market

In [202]:
# read
df_StoreN =  pd.read_csv("Neighborhood_Market_data.csv", sep=",")
df_StoreN

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified
0,P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02
1,P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04
2,P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05
3,P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,2023-01-10
4,P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13
...,...,...,...,...,...,...,...,...,...,...
404,P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,2028-05-25
405,P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,2028-05-30
406,P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,2028-06-13
407,P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,2028-06-19


## Concat

In [48]:
df_aggr1 = pd.concat(
    [df_CM, df_GG,df_OL_merged,df_HB],
    axis=0,                      # 0 for vertical, 1 for horizontal
    ignore_index=True            # Reset index
)
df_aggr1


Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
0,P0019,CityMart,2.81,26.83,38.75,24.89,323.0,25877.199625,253,202,2023-01-19
1,P0024,CityMart,3.30,59.23,34.99,21.78,321.0,45138.128706,21,225,2023-01-24
2,P0025,CityMart,2.34,22.60,16.90,60.12,291.0,22962.232800,316,278,2023-01-25
3,P0034,CityMart,6.54,18.59,68.72,21.99,126.0,28092.330552,612,233,2023-02-03
4,P0039,CityMart,9.94,57.89,88.33,35.45,312.0,181270.870165,968,203,2023-02-08
...,...,...,...,...,...,...,...,...,...,...,...
1586,P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637,235,1843344000000
1587,P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60,241,1843603200000
1588,P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.104900,925,237,1843689600000
1589,P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317,256,1844121600000


## API sources

In [26]:
def get_api(endpoint_url):
    try:
        # Make the GET request to the mock API
        response = requests.get(endpoint_url)

        if response.status_code == 200:
            data = response.json()
            print(data["message"])
            return data['data']
        else:
            print(f"Failed to retrieve volume data. Status code: {response.status_code}")
    
    except Exception as e:
        print(f"An error occurred: {e}")

In [28]:
get_api('https://www.raphaelcousin.com/api/exercise/auth')

Authentication successful


{'description': 'This data is for the authenticated course access.',
 'password': 'RcUZjhdsYLRzwi4'}

In [30]:
df_prices = get_api('https://www.raphaelcousin.com/api/exercise/RcUZjhdsYLRzwi4/prices')
df_prices = pd.DataFrame.from_dict(df_prices, orient='index', columns=['price'])
df_prices = df_prices.rename_axis('item_code').reset_index()
print(df_prices.head())

Volume data retrieved successfully
  item_code  price
0     P0001  22.14
1     P0002  26.91
2     P0003  16.90
3     P0004   7.04
4     P0005  20.84


## Merge

In [50]:
df_merge1 = pd.merge(
    left=df_aggr1,
    right=df_prices,
    on='item_code',          # Column to join on
    how='left'            # Reset index
)
df_merge1

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified,price
0,P0019,CityMart,2.81,26.83,38.75,24.89,323.0,25877.199625,253,202,2023-01-19,17.86
1,P0024,CityMart,3.30,59.23,34.99,21.78,321.0,45138.128706,21,225,2023-01-24,14.04
2,P0025,CityMart,2.34,22.60,16.90,60.12,291.0,22962.232800,316,278,2023-01-25,3.38
3,P0034,CityMart,6.54,18.59,68.72,21.99,126.0,28092.330552,612,233,2023-02-03,15.11
4,P0039,CityMart,9.94,57.89,88.33,35.45,312.0,181270.870165,968,203,2023-02-08,14.20
...,...,...,...,...,...,...,...,...,...,...,...,...
1586,P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637,235,1843344000000,15.85
1587,P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60,241,1843603200000,17.98
1588,P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.104900,925,237,1843689600000,17.75
1589,P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317,256,1844121600000,22.68


## Scraping Source

In [34]:
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Set up the Selenium WebDriver (e.g., Chrome)
#driver = webdriver.Chrome()  # Make sure   is installed
# driver = webdriver.Firefox()
# driver = webdriver.Edge()
# driver = webdriver.Safari()
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
# Open the URL
url = 'https://www.raphaelcousin.com/module4/scrapable-data'
driver.get(url)

# Wait for the page to fully load (increase time if needed)
time.sleep(5)

# Get the fully rendered page source
html = driver.page_source

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Initialize lists to store scraped data
exercise_data = []

# Find both tables
tables = soup.find_all('table')

# Close the Selenium WebDriver
driver.quit()

# Scrape the second table (Exercise Data)
course_table = tables[1]
for row in course_table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    exercise_data.append({
        'item_code': cols[0].text,
        'customer_score': float(cols[1].text),
        'total_reviews': int(cols[2].text),
        #'updated_timestamp': cols[3].text
    })
# Convert the lists to pandas DataFrames
df_exercise = pd.DataFrame(exercise_data)
df_exercise

Unnamed: 0,item_code,customer_score,total_reviews
0,P0001,2.0,972
1,P0002,3.0,260
2,P0003,2.0,285
3,P0004,5.0,512
4,P0005,3.0,85
...,...,...,...
1995,P1996,5.0,512
1996,P1997,2.0,989
1997,P1998,2.0,440
1998,P1999,4.0,37


## Merge

In [257]:
df_merge2 = pd.merge(
    left=df_merge1,
    right=df_exercise,
    on='item_code',          # Column to join on
    how='left'            # Reset index
)
df_merge2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1591 entries, 0 to 1590
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   item_code                 1591 non-null   object 
 1   store_name                1591 non-null   object 
 2   mass                      1591 non-null   float64
 3   dimension_length          1588 non-null   float64
 4   dimension_width           1588 non-null   float64
 5   dimension_height          1591 non-null   float64
 6   days_since_last_purchase  1588 non-null   float64
 7   package_volume            1588 non-null   float64
 8   stock_age                 1591 non-null   int64  
 9   quantity_sold             1591 non-null   int64  
 10  last_modified             1212 non-null   object 
 11  price                     1591 non-null   float64
 12  customer_score            1591 non-null   float64
 13  total_reviews             1591 non-null   int64  
dtypes: float

In [218]:
df_merge2.isnull().sum()

item_code                     0
store_name                    0
mass                          0
dimension_length              3
dimension_width               3
dimension_height              0
days_since_last_purchase      3
package_volume                3
stock_age                     0
quantity_sold                 0
last_modified               379
price                         0
customer_score                0
total_reviews                 0
dtype: int64

## Train

In [269]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score,make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):
    data = data.copy()
    # Handle missing values
    data.fillna(fillna_value, inplace=True)
    if X_data_test is not None:
        X_data_test = X_data_test.copy()
        X_data_test.fillna(fillna_value, inplace=True)
        
    # Drop unwanted columns
    if drop_cols:
        data.drop(drop_cols, axis=1, inplace=True)
        if X_data_test is not None:
            X_data_test.drop(drop_cols, axis=1, inplace=True)
    
    # Split data into features (X) and target (y)
    y = data[target_col]
    X = data.drop(target_col, axis=1)

    # Feature scaling
    if scaler == 'standard':
        scaler = StandardScaler()
    elif scaler == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = None
    
    if scaler:
        X = scaler.fit_transform(X)
        if X_data_test is not None:
            X_data_test = scaler.transform(X_data_test)

    # Initialize the model
    if model == 'linear':
        model = LinearRegression()
    elif model == 'logistic':
        model = LogisticRegression()
    elif model == 'random_forest':
        model = RandomForestClassifier()
    elif model == 'XGB':
        model = XGBRegressor()
    else:
        raise ValueError("Unsupported model type")

    # Initialize cross-validation
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    # Train and evaluate using k-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate using the specified metric
        if metric == 'mae':
            score = mean_absolute_error(y_test, y_pred)
        elif metric == 'accuracy':
            score = accuracy_score(y_test, np.round(y_pred))
        else:
            raise ValueError("Unsupported metric")

        scores.append(score)

    if X_data_test is not None:
        model.fit(X, y)
        return np.mean(scores), model.predict(X_data_test)
    
    # Return the average score
    return np.mean(scores)


In [273]:
get_simple_baseline(df_merge2, fillna_value=-1, drop_cols=['item_code','store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')


40.0276950895021

In [263]:
'''
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.callbacks import DeltaYStopper
# Define the search space with diverse priors
search_spaces = {
    'n_estimators': (10, 200),                
    'max_depth': (1, 20),                    
    'min_samples_split': (2, 20),            
    'min_samples_leaf': (1, 20),             
    'max_features': ['sqrt', 'log2', None] 
}
# Create a random forest regressor
rf = RandomForestRegressor(random_state=42)
# Define the DeltaYStopper callback
# This stops the optimization early if the change in the objective function (score) 
# is less than 0.001 for the best 3 consecutive iterations.
delta_stopper = DeltaYStopper(delta=0.001, n_best=3)

mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)  
# Perform Bayesian optimization
opt = BayesSearchCV(
    rf,
    search_spaces,
    n_iter=50, # The algorithm will try up to 50 different hyperparameter combinations.
    cv=5,
    n_jobs=-1,
    random_state=42,
    scoring=mae_scorer
)
y = df_merge2['quantity_sold']
X = df_merge2.drop(['item_code','store_name', 'last_modified'], axis=1)
opt.fit(X, y, callback=[delta_stopper])

print("Best parameters:", opt.best_params_)
print("Best cross-validation score:", opt.best_score_)
'''

Best parameters: OrderedDict({'max_depth': 9, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 197})
Best cross-validation score: -0.2686206609497739


## Prediction

In [204]:
df_StoreN = pd.merge(left = df_StoreN, right = df_prices, on = 'item_code', how='left')
df_StoreN = pd.merge(left = df_StoreN, right = df_exercise, on = 'item_code', how='left')

In [206]:
df_StoreN

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified,price,customer_score,total_reviews
0,P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02,26.91,3.0,260
1,P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04,7.04,5.0,512
2,P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05,20.84,3.0,85
3,P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,2023-01-10,10.00,2.0,709
4,P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13,12.98,3.0,984
...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,2028-05-25,15.88,2.0,854
405,P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,2028-05-30,16.21,1.0,991
406,P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,2028-06-13,15.32,1.0,572
407,P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,2028-06-19,26.05,2.0,989


In [None]:
score, x_pred = get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold', X_data_test = df_StoreN)

In [None]:
submission = pd.DataFrame({
    'item_code': df_StoreN.index,
    'quantity_sold': x_pred # your_prediction
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()