In [53]:
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

## Data Collection

### File sources

In [55]:
# Remote locations of the exercise files (training archive and held-out store CSV)
_EXERCISE_BASE_URL = 'https://www.raphaelcousin.com/modules/module4/exercise'
train_datas_url = f'{_EXERCISE_BASE_URL}/module4_exercise_train.zip'
test_data_url = f'{_EXERCISE_BASE_URL}/Neighborhood_Market_data.csv'

# Function to download a file
def download_file(url, file_name, timeout=60):
    """Download ``url`` and write the response body to ``file_name``.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    file_name : str
        Local path the bytes are written to (opened in binary write mode,
        so an existing file is overwritten).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` may block indefinitely on an unresponsive host.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_datas_url, 'module4_exercise_train.zip')
download_file(test_data_url, 'Neighborhood_Market_data.csv')

# Unpack the training archive into the working directory so the per-store
# files read in the following cells exist after a fresh "Restart & Run All".
# NOTE(review): assumes the archive stores its files at the top level — confirm.
with zipfile.ZipFile('module4_exercise_train.zip') as archive:
    archive.extractall()

Downloaded module4_exercise_train.zip from https://www.raphaelcousin.com/modules/module4/exercise/module4_exercise_train.zip
Downloaded Neighborhood_Market_data.csv from https://www.raphaelcousin.com/modules/module4/exercise/Neighborhood_Market_data.csv


#### CityMart

In [57]:
# Load the CityMart export and discard its 'last_modified' column in one step.
df_1 = pd.read_csv("CityMart_data.csv").drop(columns='last_modified')

#### Greenfield_Grocers

In [59]:
# Greenfield_Grocers is pipe-delimited and its header sits on the fourth line
# of the file (header=3 is a zero-based row number).
greenfield_raw = pd.read_csv("Greenfield_Grocers_data.csv", sep='|', header=3)
df_2 = (
    greenfield_raw
    .drop(columns=["Unnamed: 12", "1"])   # columns removed before the names are normalised
    .rename(columns=str.lower)             # lower-case every column name
    .drop(columns='last_modified')
)
df_2

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold
0,P0006,Greenfield_Grocers,5.02,86.68,71.64,16.42,66,101964.180384,450,130
1,P0014,Greenfield_Grocers,9.91,21.67,54.91,84.80,99,100903.494560,827,118
2,P0016,Greenfield_Grocers,1.13,60.03,97.39,90.00,348,526168.953000,981,136
3,P0021,Greenfield_Grocers,0.95,40.36,67.91,99.76,301,273426.956576,289,155
4,P0028,Greenfield_Grocers,5.24,22.37,61.78,68.67,15,94903.217262,423,177
...,...,...,...,...,...,...,...,...,...,...
396,P1983,Greenfield_Grocers,3.13,37.39,74.87,84.11,168,235456.634023,31,154
397,P1985,Greenfield_Grocers,0.66,30.04,15.77,70.62,5,33454.869096,76,73
398,P1988,Greenfield_Grocers,7.28,7.34,89.94,37.33,92,24643.757868,770,162
399,P1990,Greenfield_Grocers,6.71,73.91,10.24,89.64,9,67842.994176,968,115


#### SuperSaver_Outlet

In [61]:
# Read every sheet of "SuperSaver_Outlet_data.xlsx" (sheet_name=None returns a dict of frames).
outlet_sheets = pd.read_excel("SuperSaver_Outlet_data.xlsx", sheet_name=None)

# Each label in the 'Info' sheet is mapped to the name of the *next* field.
# NOTE(review): presumably the sheet's header row is offset by one column
# relative to its data — confirm against the workbook.
info_header_fix = {
    'item code': 'store_name',
    'store name': 'mass',
    'mass': 'dimension_length',
    'dimension length': 'dimension_width',
    'dimension width': 'dimension_height',
    'dimension height': 'days_since_last_purchase',
    'days_since last_purchase': 'package_volume',
    'package volume': 'stock_age',
}

df_3 = (
    outlet_sheets['Quantity']
    # The item codes of the 'Info' sheet live in its unnamed first column.
    .merge(outlet_sheets['Info'], left_on='item_code', right_on='Unnamed: 0')
    .drop(columns="Unnamed: 0")
    .rename(columns=info_header_fix)
    # The original 'stock age' label is not part of the mapping and is discarded.
    .drop(columns='stock age')
)

# Move the target to the last position, keeping the other columns in order.
target_last = ['quantity_sold']
df_3 = df_3[[name for name in df_3.columns if name not in target_last] + target_last]
df_3

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold
0,P0003,SuperSaver_Outlet,2.40,50.90,8.80,68.82,337,30825.854400,277,198
1,P0007,SuperSaver_Outlet,9.88,43.65,48.64,55.78,16,118428.526080,291,211
2,P0008,SuperSaver_Outlet,1.45,36.18,10.23,69.14,315,25590.193596,169,200
3,P0009,SuperSaver_Outlet,6.98,48.13,45.55,44.66,295,97909.078190,443,209
4,P0012,SuperSaver_Outlet,7.20,23.66,65.90,12.05,40,18788.287700,45,186
...,...,...,...,...,...,...,...,...,...,...
374,P1976,SuperSaver_Outlet,5.10,62.99,49.16,65.56,293,203012.335504,464,236
375,P1980,SuperSaver_Outlet,4.93,74.91,53.25,52.95,285,211215.299625,163,192
376,P1986,SuperSaver_Outlet,4.24,31.52,73.46,55.73,330,129040.541216,755,193
377,P1993,SuperSaver_Outlet,3.21,97.68,39.88,43.20,293,168284.666880,889,185


#### HighStreet_Bazaar

In [63]:
# read 'HighStreet_Bazaar_data.json'
# No column renaming is applied: the frame is concatenated with the other
# stores using its column names exactly as loaded; only 'last_modified' is removed.
df_4 = pd.read_json('HighStreet_Bazaar_data.json').drop(columns='last_modified')

#### Aggregate

#### Simple baseline

In [65]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):
    """Cross-validate a simple model on ``data`` and optionally predict a test frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame holding the features and ``target_col``. Not modified.
    fillna_value : scalar
        Value substituted for missing entries in ``data`` and ``X_data_test``.
    drop_cols : list of str, optional
        Columns removed from ``data`` (and ``X_data_test``) before modelling.
    k_fold : int
        Number of shuffled ``KFold`` splits (``random_state=42``).
    scaler : {'standard', 'minmax'} or anything else
        Feature scaling to apply; any other value disables scaling.
    model : {'linear', 'logistic', 'random_forest'}
        Estimator to fit.
    metric : {'mae', 'accuracy'}
        Score computed per fold; for 'accuracy' predictions are rounded first.
    target_col : str
        Name of the target column in ``data``.
    X_data_test : pandas.DataFrame, optional
        Frame to predict after the cross-validation. Not modified. Its columns
        are selected and ordered by name to match the training features.

    Returns
    -------
    float or (float, numpy.ndarray)
        Mean fold score; when ``X_data_test`` is given, also the predictions
        of a model refitted on all of ``data``.

    Raises
    ------
    ValueError
        If ``model`` or ``metric`` is not one of the supported names.
    """
    # fillna returns new frames, so the caller's objects are never touched.
    data = data.fillna(fillna_value)
    if X_data_test is not None:
        X_data_test = X_data_test.fillna(fillna_value)

    # Drop unwanted columns
    if drop_cols:
        data = data.drop(drop_cols, axis=1)
        if X_data_test is not None:
            X_data_test = X_data_test.drop(drop_cols, axis=1)

    # Split data into features (X) and target (y)
    y = data[target_col]
    X = data.drop(target_col, axis=1)

    if X_data_test is not None:
        # Align by column name so a different column order (or extra columns)
        # in the test frame cannot silently feed features to the wrong weights.
        X_data_test = X_data_test[X.columns]

    # Feature scaling
    if scaler == 'standard':
        feature_scaler = StandardScaler()
    elif scaler == 'minmax':
        feature_scaler = MinMaxScaler()
    else:
        feature_scaler = None

    if feature_scaler is not None:
        # The scaler is fitted on every training row before the folds are
        # split, so held-out rows of each fold contribute to its statistics.
        X = feature_scaler.fit_transform(X)
        if X_data_test is not None:
            X_data_test = feature_scaler.transform(X_data_test)

    # The folds below index rows positionally, which needs plain arrays; without
    # this, the unscaled path would index a DataFrame by column label and fail.
    X = np.asarray(X)
    if X_data_test is not None:
        X_data_test = np.asarray(X_data_test)

    # Initialize the model
    if model == 'linear':
        estimator = LinearRegression()
    elif model == 'logistic':
        estimator = LogisticRegression()
    elif model == 'random_forest':
        estimator = RandomForestClassifier()
    else:
        raise ValueError("Unsupported model type")

    # Reject an unknown metric before spending time on any fit.
    if metric not in ('mae', 'accuracy'):
        raise ValueError("Unsupported metric")

    # Initialize cross-validation
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    # Train and evaluate using k-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)

        if metric == 'mae':
            score = mean_absolute_error(y_test, y_pred)
        else:  # 'accuracy'
            score = accuracy_score(y_test, np.round(y_pred))

        scores.append(score)

    if X_data_test is not None:
        # Refit on the full training set before predicting the test frame.
        estimator.fit(X, y)
        return np.mean(scores), estimator.predict(X_data_test)

    # Return the average score
    return np.mean(scores)

In [67]:
# Stack the four store frames row-wise, then score the file-only baseline.
store_frames = [df_1, df_2, df_3, df_4]
data = pd.concat(store_frames, axis=0)
get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name', 'item_code'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

45.04056766700087

### API sources

In [69]:
def get_api(endpoint_url, timeout=30):
    """Fetch ``endpoint_url`` and return the ``'data'`` field of its JSON body.

    The ``'message'`` field of a successful response is printed. This is a
    best-effort helper: a non-200 status or any exception is reported with
    ``print`` and ``None`` is returned instead of raising.

    Parameters
    ----------
    endpoint_url : str
        Address of the API endpoint.
    timeout : float, optional
        Seconds to wait for the server; prevents the call from blocking
        indefinitely on an unresponsive host.

    Returns
    -------
    object or None
        The ``'data'`` payload, or ``None`` when the request failed.
    """
    try:
        # Make the GET request to the mock API
        response = requests.get(endpoint_url, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve volume data. Status code: {response.status_code}")
            return None

        payload = response.json()
        print(payload["message"])
        return payload['data']

    except Exception as e:
        print(f"An error occurred: {e}")
        return None
# Authenticate first, then query the price endpoint.
# NOTE(review): the token inside the prices URL is hard-coded and `password`
# is not used to build it — confirm whether the URL should be derived from the
# auth response, and avoid committing the literal token.
auth_url = 'https://www.raphaelcousin.com/api/exercise/auth'
prices_url = "https://www.raphaelcousin.com/api/exercise/RcUZjhdsYLRzwi4/prices"

password = get_api(auth_url)
prices = get_api(prices_url)

Authentication successful
Volume data retrieved successfully


In [71]:
# One row per key of the API payload, with a single 'prices' column.
df_prices = pd.DataFrame.from_dict(
    prices,
    orient='index',
    columns=['prices'],
)
df_prices

Unnamed: 0,prices
P0001,22.14
P0002,26.91
P0003,16.90
P0004,7.04
P0005,20.84
...,...
P1996,25.80
P1997,26.05
P1998,17.41
P1999,7.57


#### Aggregate

In [73]:
# Key the training rows by item_code, then left-join the prices on that index
# so every training row is kept even when no price is available.
data = (
    data
    .set_index('item_code')
    .merge(df_prices, left_index=True, right_index=True, how='left')
)
data

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,prices
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P0019,CityMart,2.81,26.83,38.75,24.89,323.0,25877.199625,253,202,17.86
P0024,CityMart,3.30,59.23,34.99,21.78,321.0,45138.128706,21,225,14.04
P0025,CityMart,2.34,22.60,16.90,60.12,291.0,22962.232800,316,278,3.38
P0034,CityMart,6.54,18.59,68.72,21.99,126.0,28092.330552,612,233,15.11
P0039,CityMart,9.94,57.89,88.33,35.45,312.0,181270.870165,968,203,14.20
...,...,...,...,...,...,...,...,...,...,...
P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637,235,15.85
P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60,241,17.98
P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.104900,925,237,17.75
P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317,256,22.68


In [75]:
# Baseline score after adding the price feature (item_code is now the index).
get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

44.10144429622423

### Scraping sources

In [79]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up the Selenium WebDriver. Edge is used here; swap in webdriver.Chrome(),
# webdriver.Firefox() or webdriver.Safari() if another browser driver is installed.
driver = webdriver.Edge()

# Open the URL
url = 'https://www.raphaelcousin.com/module4/scrapable-data'
try:
    driver.get(url)

    # Wait for the page to fully load (increase time if needed)
    time.sleep(5)

    # Get the fully rendered page source
    html = driver.page_source
finally:
    # Always close the browser, even when loading the page fails.
    driver.quit()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find both tables
tables = soup.find_all('table')
if len(tables) < 2:
    raise RuntimeError(
        f"Expected at least 2 tables on {url}, found {len(tables)}; "
        "the page may not have finished rendering."
    )

# Scrape the second table (Exercise Data)
course_table = tables[1]
exercise_data = []
for row in course_table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    exercise_data.append({
        'Product ID': cols[0].text,
        'Rating': float(cols[1].text),
        'Number of Reviews': int(cols[2].text),
    })

# Convert the list of row dicts to a pandas DataFrame
df_exercise = pd.DataFrame(exercise_data)
df_exercise


Unnamed: 0,Product ID,Rating,Number of Reviews
0,P0001,2.0,972
1,P0002,3.0,260
2,P0003,2.0,285
3,P0004,5.0,512
4,P0005,3.0,85
...,...,...,...
1995,P1996,5.0,512
1996,P1997,2.0,989
1997,P1998,2.0,440
1998,P1999,4.0,37


#### Aggregate

In [80]:
# Key the scraped reviews by product id so they can be joined on the item_code index.
df_exercise = df_exercise.set_index('Product ID')


In [83]:
# Left-join the rating / review counts onto the training rows by index.
data = data.merge(
    df_exercise,
    left_index=True,
    right_index=True,
    how='left',
)
data


Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,prices,Rating,Number of Reviews
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
P0019,CityMart,2.81,26.83,38.75,24.89,323.0,25877.199625,253,202,17.86,1.0,948
P0024,CityMart,3.30,59.23,34.99,21.78,321.0,45138.128706,21,225,14.04,3.0,436
P0025,CityMart,2.34,22.60,16.90,60.12,291.0,22962.232800,316,278,3.38,5.0,703
P0034,CityMart,6.54,18.59,68.72,21.99,126.0,28092.330552,612,233,15.11,2.0,935
P0039,CityMart,9.94,57.89,88.33,35.45,312.0,181270.870165,968,203,14.20,2.0,188
...,...,...,...,...,...,...,...,...,...,...,...,...
P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637,235,15.85,3.0,306
P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60,241,17.98,5.0,542
P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.104900,925,237,17.75,2.0,6
P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317,256,22.68,2.0,754


In [89]:
# Baseline score with file, API (prices) and scraped (reviews) features combined.
get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

40.0276950895021

### Generating Submission File

In [143]:
# Load the held-out store (Neighborhood_Market) as the test set, indexed by
# item_code so it can be joined with the price and review tables.
df_StoreN = pd.read_csv(
    "Neighborhood_Market_data.csv",
    sep=",",
    index_col='item_code',
)
df_StoreN

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02
P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04
P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05
P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,2023-01-10
P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13
...,...,...,...,...,...,...,...,...,...
P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,2028-05-25
P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,2028-05-30
P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,2028-06-13
P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,2028-06-19


In [145]:
# Add the same extra features used for training (prices, then reviews), and
# remove 'last_modified', which the training frames do not carry either.
df_StoreN = (
    df_StoreN
    .merge(df_prices, left_index=True, right_index=True, how='left')
    .merge(df_exercise, left_index=True, right_index=True, how='left')
    .drop(columns='last_modified')
)

In [147]:
# Display the test frame after the price and review columns were joined in.
df_StoreN

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,prices,Rating,Number of Reviews
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,26.91,3.0,260
P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,7.04,5.0,512
P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,20.84,3.0,85
P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,10.00,2.0,709
P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,12.98,3.0,984
...,...,...,...,...,...,...,...,...,...,...,...
P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,15.88,2.0,854
P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,16.21,1.0,991
P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,15.32,1.0,572
P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,26.05,2.0,989


In [149]:
# Cross-validate on the training data, then refit on all of it and predict the
# held-out store. The score is bound to a real name rather than `_`, because
# assigning `_` shadows IPython's last-output variable for the rest of the session.
cv_mae, x_pred = get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
    X_data_test=df_StoreN,
)

In [153]:
# Pair each test item code with its predicted quantity and write the result
# as a comma-separated file without the positional index.
submission_columns = {
    'item_code': df_StoreN.index,
    'quantity_sold': x_pred,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,item_code,quantity_sold
0,P0002,160.697917
1,P0004,244.75236
2,P0005,159.974462
3,P0010,227.499067
4,P0013,241.328037
