In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import requests

## Data Collection

### Files sources

In [11]:
# URLs of the files
train_datas_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_datas_url, 'module4_exercise_train.zip')
download_file(test_data_url, 'Neighborhood_Market_data.csv')

Downloaded module4_exercise_train.zip from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip
Downloaded Neighborhood_Market_data.csv from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv


In [12]:
import zipfile
import os

#extract_dir = "module4_exercise_train" # 创建目录
#os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile("module4_exercise_train.zip", 'r') as zip_ref:
    zip_ref.extractall("module4_exercise_train") # 提取到目录



#### CityMart

In [13]:
# read "CityMart_data.csv"
df_citymart = pd.read_csv('CityMart_data.csv', index_col='item_code')
print(df_citymart.shape)
df_citymart.head()

(415, 10)


Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P0019,CityMart,2.81,26.83,38.75,24.89,323,25877.199625,253,202,2023-01-19
P0024,CityMart,3.3,59.23,34.99,21.78,321,45138.128706,21,225,2023-01-24
P0025,CityMart,2.34,22.6,16.9,60.12,291,22962.2328,316,278,2023-01-25
P0034,CityMart,6.54,18.59,68.72,21.99,126,28092.330552,612,233,2023-02-03
P0039,CityMart,9.94,57.89,88.33,35.45,312,181270.870165,968,203,2023-02-08


#### Greenfield_Grocers

In [14]:
# read "Greenfield_Grocers_data.csv"
df_greenfield = pd.read_csv('Greenfield_Grocers_data.csv', skiprows=2, sep= '|')
df_greenfield.columns = df_greenfield.iloc[0] # 将第一行替换为列名
df_greenfield = df_greenfield[1:] # 删除第一行
df_greenfield = df_greenfield.iloc[:, :-2] # 手动选择所有行和除最后两列以外的所有列
df_greenfield.columns = [col.lower() for col in df_greenfield.columns] # 切换小写列名
df_greenfield.set_index('item_code', inplace=True) # 设置item_code为索引
print(df_greenfield.shape)
df_greenfield.head()

(401, 10)


Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P0006,Greenfield_Grocers,5.02,86.68,71.64,16.42,66,101964.18038400002,450,130,2023-01-06
P0014,Greenfield_Grocers,9.91,21.67,54.91,84.8,99,100903.49456,827,118,2023-01-14
P0016,Greenfield_Grocers,1.13,60.03,97.39,90.0,348,526168.953,981,136,2023-01-16
P0021,Greenfield_Grocers,0.95,40.36,67.91,99.76,301,273426.956576,289,155,2023-01-21
P0028,Greenfield_Grocers,5.24,22.37,61.78,68.67,15,94903.217262,423,177,2023-01-28


#### Outlet_data

In [15]:
# read "SuperSaver_Outlet_data.xlsx"
df_outlets = pd.read_excel('SuperSaver_Outlet_data.xlsx', sheet_name=None)
df_outlets['Info'].columns = df_outlets['Info'].columns[1:].tolist() + [None]
df_outlets['Info'].columns = df_outlets['Info'].columns.str.replace(' ', '_')
df_outlet = pd.merge(df_outlets['Quantity'], df_outlets['Info'], left_on='item_code', right_on='item_code')
df_outlet = df_outlet.iloc[:, :-1]
df_outlet.set_index('item_code', inplace=True)
print(df_outlet.shape)
df_outlet.head()

(379, 9)


Unnamed: 0_level_0,quantity_sold,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0003,198,SuperSaver_Outlet,2.4,50.9,8.8,68.82,337,30825.8544,277
P0007,211,SuperSaver_Outlet,9.88,43.65,48.64,55.78,16,118428.52608,291
P0008,200,SuperSaver_Outlet,1.45,36.18,10.23,69.14,315,25590.193596,169
P0009,209,SuperSaver_Outlet,6.98,48.13,45.55,44.66,295,97909.07819,443
P0012,186,SuperSaver_Outlet,7.2,23.66,65.9,12.05,40,18788.2877,45


#### HighStreet_Bazaar

In [16]:
# read 'HighStreet_Bazaar_data.json'
df_highstreet = pd.read_json('HighStreet_Bazaar_data.json')
df_highstreet.set_index('item_code', inplace=True)
print(f"HighStreet_Bazaar data shape: {df_highstreet.shape}")
df_highstreet.head()


HighStreet_Bazaar data shape: (396, 10)


Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P0001,HighStreet_Bazaar,6.11,75.46,91.62,92.08,78.0,636608.450016,237,346,1672531200000
P0011,HighStreet_Bazaar,4.34,16.97,93.21,54.58,344.0,86333.208546,184,218,1673395200000
P0015,HighStreet_Bazaar,1.37,58.93,11.28,73.87,344.0,49103.634648,946,315,1673740800000
P0017,HighStreet_Bazaar,7.27,51.51,39.21,16.17,138.0,32658.663807,268,228,1673913600000
P0020,HighStreet_Bazaar,0.89,57.5,69.84,6.12,333.0,24576.696,396,228,1674172800000


#### Aggregate

In [17]:
data = pd.concat([df_citymart, df_greenfield, df_outlet, df_highstreet], axis=0)
print(data.shape)
data.head()

(1591, 10)


Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P0019,CityMart,2.81,26.83,38.75,24.89,323,25877.199625,253,202,2023-01-19
P0024,CityMart,3.3,59.23,34.99,21.78,321,45138.128706,21,225,2023-01-24
P0025,CityMart,2.34,22.6,16.9,60.12,291,22962.2328,316,278,2023-01-25
P0034,CityMart,6.54,18.59,68.72,21.99,126,28092.330552,612,233,2023-02-03
P0039,CityMart,9.94,57.89,88.33,35.45,312,181270.870165,968,203,2023-02-08


#### Simple baseline

In [18]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):

    data = data.copy()
    # Handle missing values
    data.fillna(fillna_value, inplace=True)
    if X_data_test is not None:
        X_data_test = X_data_test.copy()
        X_data_test.fillna(fillna_value, inplace=True)

    # Drop unwanted columns
    if drop_cols:
        data.drop(drop_cols, axis=1, inplace=True)
        if X_data_test is not None:
            X_data_test.drop(drop_cols, axis=1, inplace=True)

    # Split data into features (X) and target (y)
    y = data[target_col]
    X = data.drop(target_col, axis=1)

    # Feature scaling
    if scaler == 'standard':
        scaler = StandardScaler()
    elif scaler == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = None

    if scaler:
        X = scaler.fit_transform(X)
        if X_data_test is not None:
            X_data_test = scaler.transform(X_data_test)

    # Initialize the model
    if model == 'linear':
        model = LinearRegression()
    elif model == 'logistic':
        model = LogisticRegression()
    elif model == 'random_forest':
        model = RandomForestClassifier()
    else:
        raise ValueError("Unsupported model type")

    # Initialize cross-validation
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    # Train and evaluate using k-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Evaluate using the specified metric
        if metric == 'mae':
            score = mean_absolute_error(y_test, y_pred)
        elif metric == 'accuracy':
            score = accuracy_score(y_test, np.round(y_pred))
        else:
            raise ValueError("Unsupported metric")

        scores.append(score)

    if X_data_test is not None:
        model.fit(X, y)
        return np.mean(scores), model.predict(X_data_test)

    # Return the average score
    return np.mean(scores)

In [19]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

45.04056766700087

### API sources

In [20]:
def get_api(endpoint_url):
    try:
        # Make the GET request to the mock API
        response = requests.get(endpoint_url)

        if response.status_code == 200:
            data = response.json()
            print(data["message"])
            return data['data']
        else:
            print(f"Failed to retrieve volume data. Status code: {response.status_code}")

    except Exception as e:
        print(f"An error occurred: {e}")
# password =
# prices =

In [21]:
password = get_api("https://www.raphaelcousin.com/api/course/auth")["password"]
prices = get_api(f"https://www.raphaelcousin.com/api/course/{password}/volumes")
prices

Authentication successful
Volume data retrieved successfully


{'P0001': 48469.09869,
 'P0002': 11252.96382,
 'P0003': 24725.379062999997,
 'P0004': 8361.648239999999,
 'P0005': 255200.66715,
 'P0006': 644786.0415,
 'P0007': 302122.4971200001,
 'P0008': 314150.164308,
 'P0009': 61847.672796,
 'P0010': 746779.532672,
 'P0011': 239910.083994,
 'P0012': 17482.030523999998,
 'P0013': 10598.364972,
 'P0014': 10520.449939999999,
 'P0015': 136549.7757,
 'P0016': 247934.58020000003,
 'P0017': 54689.528088,
 'P0018': 9057.165282,
 'P0019': 21814.067016,
 'P0020': 454560.15780000004,
 'P0021': 67789.79616,
 'P0022': 39971.18208,
 'P0023': 31282.378228000005,
 'P0024': 80893.19616,
 'P0025': 87468.08641799999,
 'P0026': 17419.922304,
 'P0027': 337530.08437200007,
 'P0028': 74212.37982,
 'P0029': 58495.709272,
 'P0030': 61834.98204000001,
 'P0031': 32930.384384000005,
 'P0032': 69289.19848,
 'P0033': 313977.75333599997,
 'P0034': 6635.3966279999995,
 'P0035': 204784.65251999997,
 'P0036': 345513.98932000005,
 'P0037': 81655.593192,
 'P0038': 102508.7066319999

In [22]:
df_prices = pd.DataFrame.from_dict(prices, orient='index', columns=['volume'])
print(df_prices.shape)
df_prices.head()

(1000, 1)


Unnamed: 0,volume
P0001,48469.09869
P0002,11252.96382
P0003,24725.379063
P0004,8361.64824
P0005,255200.66715


#### Aggregate

In [23]:
data = pd.merge(data, df_prices, left_index=True, right_index=True, how='left')
data

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified,volume
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
P0019,CityMart,2.81,26.83,38.75,24.89,323,25877.199625,253,202,2023-01-19,21814.067016
P0024,CityMart,3.3,59.23,34.99,21.78,321,45138.128706,21,225,2023-01-24,80893.196160
P0025,CityMart,2.34,22.6,16.9,60.12,291,22962.2328,316,278,2023-01-25,87468.086418
P0034,CityMart,6.54,18.59,68.72,21.99,126,28092.330552,612,233,2023-02-03,6635.396628
P0039,CityMart,9.94,57.89,88.33,35.45,312,181270.870165,968,203,2023-02-08,170846.537576
...,...,...,...,...,...,...,...,...,...,...,...
P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637,235,1843344000000,
P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60,241,1843603200000,
P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.1049,925,237,1843689600000,
P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317,256,1844121600000,


In [24]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

45.04524298046092

### Scrapping sources

In [25]:
%pip install selenium bs4

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up the Selenium WebDriver (e.g., Chrome)
driver = webdriver.Chrome()  # Make sure ChromeDriver is installed
# driver = webdriver.Firefox()
# driver = webdriver.Edge()
# driver = webdriver.Safari()

# Open the URL
url = 'https://www.raphaelcousin.com/module4/scrapable-data'
driver.get(url)

# Wait for the page to fully load (increase time if needed)
time.sleep(5)

# Get the fully rendered page source
html = driver.page_source

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Initialize lists to store scraped data
exercise_data = []

# Find both tables
tables = soup.find_all('table')

# Close the Selenium WebDriver
driver.quit()

# Scrape the second table (Exercise Data)
course_table = tables[1]
for row in course_table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    #exercise_data.append({ TODO })
    exercise_data.append({
        'Item Code' : cols[0].text,
        'Customer Score' : int(cols[1].text),
        'Total Reviews' : int(cols[2].text),
        'Updated Timestamp' : int(cols[3].text)
    })


# Convert the lists to pandas DataFrames
df_exercise = pd.DataFrame(exercise_data)
df_exercise.head()

Note: you may need to restart the kernel to use updated packages.


Unnamed: 0,Item Code,Customer Score,Total Reviews,Updated Timestamp
0,P0001,2,972,1728524
1,P0002,3,260,1728562
2,P0003,2,285,1728577
3,P0004,5,512,1728528
4,P0005,3,85,1728512


In [26]:
df_exercise.columns = [col.lower() for col in df_exercise.columns]
df_exercise.columns = df_exercise.columns.str.replace(' ', '_')
df_exercise.set_index('item_code', inplace=True)
print(df_exercise.shape)

(2000, 3)


#### Aggregate

In [27]:
data = pd.merge(data, df_exercise, left_index=True, right_index=True, how='left')
data.head()

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified,volume,customer_score,total_reviews,updated_timestamp
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
P0019,CityMart,2.81,26.83,38.75,24.89,323,25877.199625,253,202,2023-01-19,21814.067016,1,948,1728484
P0024,CityMart,3.3,59.23,34.99,21.78,321,45138.128706,21,225,2023-01-24,80893.19616,3,436,1728580
P0025,CityMart,2.34,22.6,16.9,60.12,291,22962.2328,316,278,2023-01-25,87468.086418,5,703,1728494
P0034,CityMart,6.54,18.59,68.72,21.99,126,28092.330552,612,233,2023-02-03,6635.396628,2,935,1728525
P0039,CityMart,9.94,57.89,88.33,35.45,312,181270.870165,968,203,2023-02-08,170846.537576,2,188,1728527


In [33]:
get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

41.51966936431067

### Generating Submission File

In [29]:
# X_test =  read  Neighborhood_Market_data

# read
df_StoreN =  pd.read_csv("Neighborhood_Market_data.csv", sep=",", index_col='item_code')
df_StoreN

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02
P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04
P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05
P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,2023-01-10
P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13
...,...,...,...,...,...,...,...,...,...
P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,2028-05-25
P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,2028-05-30
P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,2028-06-13
P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,2028-06-19


In [34]:
df_StoreN = pd.merge(df_StoreN, df_prices, left_index=True, right_index=True, how='left')
df_StoreN = pd.merge(df_StoreN, df_exercise, left_index=True, right_index=True, how='left')
df_StoreN.head()

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified,volume,customer_score,total_reviews,updated_timestamp
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02,11252.96382,3,260,1728562
P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04,8361.64824,5,512,1728528
P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05,255200.66715,3,85,1728512
P0010,Neighborhood_Market,4.1,77.43,49.56,74.41,208,285543.225828,656,2023-01-10,746779.532672,2,709,1728500
P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13,10598.364972,3,984,1728533


In [None]:
_, x_pred = get_simple_baseline(data, fillna_value=-1, drop_cols=['store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold', X_data_test = df_StoreN)
# 返回两个结果：交叉验证平均分，和对X_data_test的预测结果
# 保留x_pred

In [36]:
submission = pd.DataFrame({
    'item_code': df_StoreN.index,
    'quantity_sold': 0 # your_prediction
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,item_code,quantity_sold
0,P0002,0
1,P0004,0
2,P0005,0
3,P0010,0
4,P0013,0
