In [93]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import requests
import zipfile

## Data Collection

### File sources

In [94]:
# URLs of the files (both live under the same exercise directory)
_EXERCISE_BASE_URL = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise'
train_datas_url = _EXERCISE_BASE_URL + '/module4_exercise_train.zip'
test_data_url = _EXERCISE_BASE_URL + '/Neighborhood_Market_data.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the downloaded bytes are written to
            (opened in binary write mode, so an existing file is replaced).
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` can block indefinitely on an unresponsive host.

    Raises:
        requests.HTTPError: If the server replies with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

def unzip_file(file_name):
    """Extract every member of the zip archive ``file_name`` into the current working directory."""
    # ZipFile opens in read mode by default; "." is the current working directory.
    with zipfile.ZipFile(file_name) as zipped:
        zipped.extractall(path=".")
    print(f"Unzipped file {file_name} into current working directory")

# Downloading the files (test CSV first, then the training archive), then unpack the archive
for source_url, local_name in [
    (test_data_url, 'Neighborhood_Market_data.csv'),
    (train_datas_url, 'module4_exercise_train.zip'),
]:
    download_file(source_url, local_name)
unzip_file('module4_exercise_train.zip')

Downloaded Neighborhood_Market_data.csv from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv
Downloaded module4_exercise_train.zip from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip
Unzipped file module4_exercise_train.zip into current working directory


#### CityMart

In [95]:
def debug_df(df: pd.DataFrame, verbose: bool = False):
    """Print a quick diagnostic summary of ``df``.

    Always prints ``df.info()``. Then prints a random sample of ten rows when
    ``verbose`` is true, otherwise the first three rows. Finally, if any row
    contains at least one missing value, prints those rows.
    """
    df.info()

    if verbose:
        print("\nSAMPLE\n######\n")
        print(df.sample(10))
    else:
        print("\nHEAD\n#####\n")
        print(df.head(3))

    # Boolean mask of rows that have a null in any column
    rows_with_nulls = df.isnull().any(axis=1)
    if rows_with_nulls.any():
        print("\nEMPTY ROWS\n##########\n")
        print(df[rows_with_nulls])

In [96]:
# read "CityMart_data.csv"
# three rows are missing package_volume; they are flagged with -1
df_city_mart = pd.read_csv(
    "CityMart_data.csv",
    index_col=0,
    parse_dates=["last_modified"],
    dtype={"store_name": "category"},
).fillna({"package_volume": -1})
debug_df(df_city_mart)

<class 'pandas.core.frame.DataFrame'>
Index: 415 entries, P0019 to P1998
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   store_name                415 non-null    category      
 1   mass                      415 non-null    float64       
 2   dimension_length          415 non-null    float64       
 3   dimension_width           415 non-null    float64       
 4   dimension_height          415 non-null    float64       
 5   days_since_last_purchase  415 non-null    int64         
 6   package_volume            415 non-null    float64       
 7   stock_age                 415 non-null    int64         
 8   quantity_sold             415 non-null    int64         
 9   last_modified             415 non-null    datetime64[ns]
dtypes: category(1), datetime64[ns](1), float64(5), int64(3)
memory usage: 32.8+ KB

HEAD
#####

          store_name  mass  dimension_length  dimensi

#### Greenfield_Grocers

In [97]:
# read "Greenfield_Grocers_data.csv"
# The file is pipe-delimited and its header sits on row index 3.
df_greenfield_grocers = (
    pd.read_csv(
        "Greenfield_Grocers_data.csv",
        delimiter="|",
        header=3,
        index_col=0,
        parse_dates=["LAST_MODIFIED"],
        dtype={"STORE_NAME": "category"},
    )
    .rename(columns=str.lower)              # lower-case every column name
    .drop(columns=["1", "unnamed: 12"])     # presumably parsing artefacts, not real features
    .dropna(how="all")                      # discard rows that are entirely empty
    .fillna({"dimension_length": -1})       # flag missing lengths with -1
)
debug_df(df_greenfield_grocers)

<class 'pandas.core.frame.DataFrame'>
Index: 401 entries, P0006 to P1995
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   store_name                401 non-null    category      
 1   mass                      401 non-null    float64       
 2   dimension_length          401 non-null    float64       
 3   dimension_width           401 non-null    float64       
 4   dimension_height          401 non-null    float64       
 5   days_since_last_purchase  401 non-null    int64         
 6   package_volume            401 non-null    float64       
 7   stock_age                 401 non-null    int64         
 8   quantity_sold             401 non-null    int64         
 9   last_modified             401 non-null    datetime64[ns]
dtypes: category(1), datetime64[ns](1), float64(5), int64(3)
memory usage: 31.7+ KB

HEAD
#####

                   store_name  mass  dimension_length

#### SuperSaver_Outlet

In [98]:
# read "SuperSaver_Outlet_data.xlsx"
# sheet_name=None loads every sheet into a dict keyed by sheet name
dfs_supersaver_outlet =  pd.read_excel("SuperSaver_Outlet_data.xlsx", sheet_name=None)

# The workbook is expected to provide a "Quantity" sheet and an "Info" sheet
df_supersaver_outlet_quantity = dfs_supersaver_outlet["Quantity"]
df_supersaver_outlet_info = dfs_supersaver_outlet["Info"]

df_supersaver_outlet_quantity.set_index("item_code", inplace=True)

# Shift the Info header one position to the left: discard the first column
# name and give the last column a placeholder name so it can be dropped below.
# NOTE(review): presumably the sheet's header row is offset by one column
# relative to its data — confirm against the file.
column_names = list(df_supersaver_outlet_info.columns)
column_names.pop(0)
column_names.append("to_be_dropped")
df_supersaver_outlet_info.columns = column_names

df_supersaver_outlet_info.drop("to_be_dropped", axis=1, inplace=True)
# Replace spaces in the column names with underscores
df_supersaver_outlet_info.columns = [name.replace(" ", "_") for name in df_supersaver_outlet_info.columns]

df_supersaver_outlet_info.set_index("item_code", inplace=True)

# Join both sheets on the item_code index; pd.merge defaults to an inner
# join, so only item codes present in both sheets are kept
df_supersaver_outlet = pd.merge(df_supersaver_outlet_info, df_supersaver_outlet_quantity, left_index=True, right_index=True)

debug_df(df_supersaver_outlet, verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 379 entries, P0003 to P2000
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   store_name                379 non-null    object 
 1   mass                      379 non-null    float64
 2   dimension_length          379 non-null    float64
 3   dimension_width           376 non-null    float64
 4   dimension_height          379 non-null    float64
 5   days_since_last_purchase  379 non-null    int64  
 6   package_volume            379 non-null    float64
 7   stock_age                 379 non-null    int64  
 8   quantity_sold             379 non-null    int64  
dtypes: float64(5), int64(3), object(1)
memory usage: 29.6+ KB

SAMPLE
######

                  store_name  mass  dimension_length  dimension_width  \
item_code                                                               
P0276      SuperSaver_Outlet  8.48             94.16            8

#### HighStreet_Bazaar

In [99]:
# read 'HighStreet_Bazaar_data.json'
df_highstreet_bazaar = (
    pd.read_json("HighStreet_Bazaar_data.json")
    .set_index("item_code")
    .astype({"store_name": "category"})
    # last_modified is converted from epoch milliseconds to datetimes
    .assign(last_modified=lambda frame: pd.to_datetime(frame["last_modified"], unit="ms"))
    # flag missing days_since_last_purchase with -1
    .fillna({"days_since_last_purchase": -1})
)
debug_df(df_highstreet_bazaar)

<class 'pandas.core.frame.DataFrame'>
Index: 396 entries, P0001 to P1994
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   store_name                396 non-null    category      
 1   mass                      396 non-null    float64       
 2   dimension_length          396 non-null    float64       
 3   dimension_width           396 non-null    float64       
 4   dimension_height          396 non-null    float64       
 5   days_since_last_purchase  396 non-null    float64       
 6   package_volume            396 non-null    float64       
 7   stock_age                 396 non-null    int64         
 8   quantity_sold             396 non-null    int64         
 9   last_modified             396 non-null    datetime64[ns]
dtypes: category(1), datetime64[ns](1), float64(6), int64(2)
memory usage: 31.4+ KB

HEAD
#####

                  store_name  mass  dimension_length 

#### Aggregate

In [100]:
# Stack the four per-store frames row-wise into a single table
data = pd.concat(
    [
        df_city_mart,
        df_greenfield_grocers,
        df_supersaver_outlet,
        df_highstreet_bazaar,
    ],
    axis="index",
)
debug_df(data, verbose=True)


<class 'pandas.core.frame.DataFrame'>
Index: 1591 entries, P0019 to P1994
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   store_name                1591 non-null   object        
 1   mass                      1591 non-null   float64       
 2   dimension_length          1591 non-null   float64       
 3   dimension_width           1588 non-null   float64       
 4   dimension_height          1591 non-null   float64       
 5   days_since_last_purchase  1591 non-null   float64       
 6   package_volume            1591 non-null   float64       
 7   stock_age                 1591 non-null   int64         
 8   quantity_sold             1591 non-null   int64         
 9   last_modified             1212 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(6), int64(2), object(1)
memory usage: 136.7+ KB

SAMPLE
######

               store_name  mass  dimension_length 

#### Simple baseline

In [101]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):
    """Cross-validate a simple scikit-learn baseline and optionally predict on new data.

    Args:
        data: DataFrame holding the feature columns and the target column.
            It is not modified.
        fillna_value: Value substituted for missing entries in the kept columns.
        drop_cols: Column labels removed from ``data`` (and ``X_data_test``)
            before anything else happens.
        k_fold: Number of shuffled K-fold splits (fixed seed 42).
        scaler: ``'standard'`` or ``'minmax'``; any other value disables scaling.
        model: ``'linear'``, ``'logistic'`` or ``'random_forest'``.
        metric: ``'mae'`` or ``'accuracy'`` (predictions are rounded first).
        target_col: Name of the target column in ``data`` (required).
        X_data_test: Optional DataFrame of unseen rows. It must contain every
            feature column of ``data``; columns are re-ordered to match.

    Returns:
        The mean cross-validation score, or a ``(mean_score, predictions)``
        tuple when ``X_data_test`` is given. The predictions come from a model
        refitted on all rows of ``data``.

    Raises:
        ValueError: If ``model`` or ``metric`` is not supported.
    """
    # Drop unwanted columns first, so fillna never touches columns (datetimes,
    # categoricals, ...) that are discarded anyway. drop() returns a new frame,
    # so the caller's objects stay untouched.
    if drop_cols:
        data = data.drop(drop_cols, axis=1)
        if X_data_test is not None:
            X_data_test = X_data_test.drop(drop_cols, axis=1)

    # Handle missing values (fillna returns a new frame)
    data = data.fillna(fillna_value)

    # Split data into features (X) and target (y)
    y = data[target_col]
    feature_frame = data.drop(target_col, axis=1)

    if X_data_test is not None:
        # Same feature set in the same order as the training data
        X_data_test = X_data_test[feature_frame.columns].fillna(fillna_value).to_numpy()

    # Work on a plain array: the positional fold indices below would select
    # *columns* if X were still a DataFrame (this broke the no-scaler path).
    X = feature_frame.to_numpy()

    # Feature scaling
    if scaler == 'standard':
        scaler = StandardScaler()
    elif scaler == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = None

    # Initialize the model
    if model == 'linear':
        model = LinearRegression()
    elif model == 'logistic':
        model = LogisticRegression()
    elif model == 'random_forest':
        # Seeded for reproducible scores, like the KFold split below
        model = RandomForestClassifier(random_state=42)
    else:
        raise ValueError("Unsupported model type")

    # Resolve the metric up front so a typo fails before any model is fitted
    if metric == 'mae':
        def score_fn(y_true, y_hat):
            return mean_absolute_error(y_true, y_hat)
    elif metric == 'accuracy':
        def score_fn(y_true, y_hat):
            return accuracy_score(y_true, np.round(y_hat))
    else:
        raise ValueError("Unsupported metric")

    # Initialize cross-validation
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    # Train and evaluate using k-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit the scaler on the training fold only, so statistics of the
        # held-out fold do not leak into training.
        if scaler is not None:
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        model.fit(X_train, y_train)
        scores.append(score_fn(y_test, model.predict(X_test)))

    mean_score = np.mean(scores)

    if X_data_test is not None:
        # Final model: refit scaler and estimator on every available row
        if scaler is not None:
            X = scaler.fit_transform(X)
            X_data_test = scaler.transform(X_data_test)
        model.fit(X, y)
        return mean_score, model.predict(X_data_test)

    # Return the average score
    return mean_score

In [120]:
# Linear baseline (5-fold MAE) on the current contents of `data`
get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

  data.fillna(fillna_value, inplace=True)


np.float64(40.03236095268504)

### API sources

In [103]:
def get_api(endpoint_url, timeout=30):
    """GET ``endpoint_url`` and return the ``data`` field of its JSON reply.

    This is a best-effort helper: failures are reported with ``print`` and
    ``None`` is returned instead of raising, so callers must check the result.

    Args:
        endpoint_url: Full URL of the endpoint to query.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under ``"data"`` in the JSON reply, or ``None`` when
        the request fails, the status code is not 200, or the reply does not
        have the expected shape.
    """
    try:
        # Make the GET request to the mock API
        response = requests.get(endpoint_url, timeout=timeout)
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data from {endpoint_url}. Status code: {response.status_code}")
        return None

    try:
        payload = response.json()
        print(payload["message"])
        return payload['data']
    except (ValueError, KeyError, TypeError) as e:
        # Body is not JSON, or lacks the "message"/"data" fields
        print(f"An error occurred: {e}")
        return None

# get_api returns None on failure; stop with a clear message instead of the
# opaque "'NoneType' object is not subscriptable" error.
auth_payload = get_api("https://www.raphaelcousin.com/api/exercise/auth")
if auth_payload is None:
    raise RuntimeError("Authentication request failed; cannot fetch prices")
password = auth_payload["password"]

prices = get_api(f"https://www.raphaelcousin.com/api/exercise/{password}/prices")
if prices is None:
    raise RuntimeError("Price request failed; no price data available")

Authentication successful
Volume data retrieved successfully


In [104]:
# The dict keys become the index; the values fill a single "price" column
df_prices = pd.DataFrame.from_dict(
    prices,
    orient="index",
    columns=["price"],
)
debug_df(df_prices)

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, P0001 to P2000
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   price   2000 non-null   float64
dtypes: float64(1)
memory usage: 31.2+ KB

HEAD
#####

       price
P0001  22.14
P0002  26.91
P0003  16.90


#### Aggregate

In [105]:
# Drop any price column left over from a previous run of this cell, so that
# re-executing it does not create price_x / price_y duplicates.
data = data.drop(columns=df_prices.columns, errors="ignore").merge(
    df_prices, left_index=True, right_index=True, how='left'
)
debug_df(data, verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1591 entries, P0019 to P1994
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   store_name                1591 non-null   object        
 1   mass                      1591 non-null   float64       
 2   dimension_length          1591 non-null   float64       
 3   dimension_width           1588 non-null   float64       
 4   dimension_height          1591 non-null   float64       
 5   days_since_last_purchase  1591 non-null   float64       
 6   package_volume            1591 non-null   float64       
 7   stock_age                 1591 non-null   int64         
 8   quantity_sold             1591 non-null   int64         
 9   last_modified             1212 non-null   datetime64[ns]
 10  price                     1591 non-null   float64       
dtypes: datetime64[ns](1), float64(7), int64(2), object(1)
memory usage: 213.7+ KB

SAM

In [106]:
# Same linear baseline, now that `data` also carries the price column
get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

  data.fillna(fillna_value, inplace=True)


np.float64(44.10144429622424)

### Scraping sources

In [107]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up the Selenium WebDriver (e.g., Chrome)
driver = webdriver.Chrome()  # Make sure ChromeDriver is installed
# driver = webdriver.Firefox()
# driver = webdriver.Edge()
# driver = webdriver.Safari()

url = 'https://www.raphaelcousin.com/module4/scrapable-data'

try:
    # Open the URL
    driver.get(url)

    # Wait for the page to fully load (increase time if needed)
    time.sleep(5)

    # Get the fully rendered page source
    html = driver.page_source
finally:
    # Close the Selenium WebDriver even if loading the page failed,
    # otherwise the browser process is left running
    driver.quit()

# Parse the HTML with BeautifulSoup (html is a plain string, so the browser
# is no longer needed at this point)
soup = BeautifulSoup(html, 'html.parser')

# Initialize lists to store scraped data
exercise_data = []

# Find both tables
tables = soup.find_all('table')

def parse_row(cols):
    """Convert the four cells of one table row into a record dict.

    Each element of ``cols`` must expose a ``.text`` attribute. The first cell
    is kept as a string under ``item_code``; the other three are parsed as
    integers. Asserts that exactly four cells are given.
    """
    assert len(cols) == 4

    code_cell, score_cell, reviews_cell, timestamp_cell = cols
    record = {"item_code": code_cell.text}
    numeric_cells = (
        ("customer_score", score_cell),
        ("total_reviews", reviews_cell),
        ("updated_timestamp", timestamp_cell),
    )
    for field, cell in numeric_cells:
        record[field] = int(cell.text)
    return record

# Scrape the second table (Exercise Data): one record per <tr> of its <tbody>
exercise_table = tables[1]
exercise_data.extend(
    parse_row(table_row.find_all('td'))
    for table_row in exercise_table.find('tbody').find_all('tr')
)

# Convert the list of records to a pandas DataFrame indexed by item_code
df_exercise = pd.DataFrame(exercise_data).set_index("item_code")
debug_df(df_exercise)


<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, P0001 to P2000
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   customer_score     2000 non-null   int64
 1   total_reviews      2000 non-null   int64
 2   updated_timestamp  2000 non-null   int64
dtypes: int64(3)
memory usage: 62.5+ KB

HEAD
#####

           customer_score  total_reviews  updated_timestamp
item_code                                                  
P0001                   2            972            1728491
P0002                   3            260            1728548
P0003                   2            285            1728488


#### Aggregate

In [108]:
# Drop review columns left over from a previous run of this cell, so that
# re-executing it does not create *_x / *_y duplicates.
data = data.drop(columns=df_exercise.columns, errors="ignore").merge(
    df_exercise, left_index=True, right_index=True, how='left'
)

In [109]:
# Same linear baseline, now that `data` also carries the scraped review columns
get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

  data.fillna(fillna_value, inplace=True)


np.float64(40.03236095268504)

### Generating Submission File

In [114]:
# X_test: read Neighborhood_Market_data.csv, indexed by item_code
# (comma is read_csv's default separator)
df_StoreN = pd.read_csv("Neighborhood_Market_data.csv", index_col="item_code")
df_StoreN

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02
P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04
P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05
P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,2023-01-10
P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13
...,...,...,...,...,...,...,...,...,...
P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,2028-05-25
P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,2028-05-30
P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,2028-06-13
P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,2028-06-19


In [115]:
# Attach the price and review features to the test rows. Columns left over
# from a previous run of this cell are dropped first, so re-executing it does
# not create *_x / *_y duplicates.
df_StoreN = df_StoreN.drop(
    columns=[*df_prices.columns, *df_exercise.columns], errors="ignore"
)
df_StoreN = pd.merge(df_StoreN, df_prices, left_index=True, right_index=True, how='left')
df_StoreN = pd.merge(df_StoreN, df_exercise, left_index=True, right_index=True, how='left')

In [137]:
# Cross-validated MAE on the training data plus predictions for the test rows
x_loss, x_pred = get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler="standard",
    model='linear',
    metric='mae',
    target_col='quantity_sold',
    X_data_test=df_StoreN,
)

  data.fillna(fillna_value, inplace=True)


In [138]:
# Only write a submission when the cross-validated MAE is at or below this value
MAX_ACCEPTABLE_MAE = 20

if x_loss > MAX_ACCEPTABLE_MAE:
    print(f"Not good enough : {x_loss}")
else:
    submission = pd.DataFrame({
        'item_code': df_StoreN.index,
        'quantity_sold': x_pred
    })

    submission.to_csv('submission.csv', index=False, sep=',')
    # A bare expression nested inside if/else is not rendered by Jupyter,
    # so print the preview explicitly.
    print(submission.head())

Not good enough : 40.03236095268504
