In [2]:
import pandas as pd
import numpy as np

## Data Collection

### Files sources

In [3]:
import requests 

In [4]:
# URLs of the files
train_datas_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_datas_url, 'module4_exercise_train.zip')
download_file(test_data_url, 'Neighborhood_Market_data.csv')

Downloaded module4_exercise_train.zip from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip
Downloaded Neighborhood_Market_data.csv from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv


#### CityMart

In [5]:
# read "CityMart_data.csv
df_cm = pd.read_csv('CityMart_data.csv')



#### Greenfield_Grocers

In [6]:
# read "Greenfield_Grocers_data.csv"
df_gg = pd.read_csv('Greenfield_Grocers_data.csv',sep='|')

In [7]:
df_gg.columns = df_gg.columns.str.lower()
df_gg.drop(columns=["1","unnamed: 12"], inplace=True)

#### Outlet_data

In [11]:
# read "SuperSaver_Outlet_data.xlsx"
df_ss = pd.read_excel("SuperSaver_Outlet_data.xlsx")

#### HighStreet_Bazaar

In [12]:
# read 'HighStreet_Bazaar_data.json'
df_hb = pd.read_json('HighStreet_Bazaar_data.json')

#### Aggregate

#### Simple baseline

In [13]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):
    
    data = data.copy()
    # Handle missing values
    data.fillna(fillna_value, inplace=True)
    if X_data_test is not None:
        X_data_test = X_data_test.copy()
        X_data_test.fillna(fillna_value, inplace=True)
    
    # Drop unwanted columns
    if drop_cols:
        data.drop(drop_cols, axis=1, inplace=True)
        if X_data_test is not None:
            X_data_test.drop(drop_cols, axis=1, inplace=True)

    # Split data into features (X) and target (y)
    y = data[target_col]
    X = data.drop(target_col, axis=1)

    # Feature scaling
    if scaler == 'standard':
        scaler = StandardScaler()
    elif scaler == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = None
    
    if scaler:
        X = scaler.fit_transform(X)
        if X_data_test is not None:
            X_data_test = scaler.transform(X_data_test)

    # Initialize the model
    if model == 'linear':
        model = LinearRegression()
    elif model == 'logistic':
        model = LogisticRegression()
    elif model == 'random_forest':
        model = RandomForestClassifier()
    else:
        raise ValueError("Unsupported model type")

    # Initialize cross-validation
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    # Train and evaluate using k-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate using the specified metric
        if metric == 'mae':
            score = mean_absolute_error(y_test, y_pred)
        elif metric == 'accuracy':
            score = accuracy_score(y_test, np.round(y_pred))
        else:
            raise ValueError("Unsupported metric")

        scores.append(score)

    if X_data_test is not None:
        model.fit(X, y)
        return np.mean(scores), model.predict(X_data_test)
    
    # Return the average score
    return np.mean(scores),y_pred

In [14]:
data = pd.concat(
    [df_gg, df_cm,df_hb,df_ss],
    axis=0,                      # 0 for vertical, 1 for horizontal
    ignore_index=True) 

In [15]:
get_simple_baseline(data, fillna_value=-1, k_fold=5,drop_cols=['item_code','store_name','last_modified'] ,scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

(np.float64(45.298262959615634),
 array([202.83478343, 183.57375502, 215.17133805, 202.20100204,
        197.7536343 , 190.15089047, 189.51479604, 231.40747262,
        191.4914033 , 190.56020574, 208.54286147, 195.23050861,
        193.32835469, 235.8851397 , 195.52503774, 213.56420424,
        215.71134858, 195.20558474, 208.76171454, 191.70158859,
        186.22204309, 196.46572908, 192.28680315, 195.56853182,
        194.65376496, 196.13763867, 204.29570287, 193.07131764,
        197.93332867, 201.64147044, 198.23853091, 191.5078964 ,
        212.7282893 , 189.93788309, 220.20702009, 193.75734565,
        200.59789218, 200.67674533, 206.93096891, 187.73764011,
        196.97131403, 202.70030898, 199.39733904, 235.14241891,
        193.28255939, 198.39294422, 184.91563634, 200.1765796 ,
        187.61092146, 195.96619618, 223.27337195, 213.17599035,
        195.40748185, 231.10264069, 180.34183863, 215.12682362,
        188.21917251, 209.29580194, 217.06864121, 193.57710086,
       

### API sources

In [16]:
def get_api(endpoint_url):
    try:
        # Make the GET request to the mock API
        response = requests.get(endpoint_url)

        if response.status_code == 200:
            data = response.json()
            print(data["message"])
            return data['data']
        else:
            print(f"Failed to retrieve volume data. Status code: {response.status_code}")
    
    except Exception as e:
        print(f"An error occurred: {e}")
password =get_api('https://www.raphaelcousin.com/api/exercise/auth') 
password = password['password']
print(password)

prices = get_api(f'https://www.raphaelcousin.com/api/exercise/{password}/prices')

Authentication successful
RcUZjhdsYLRzwi4
Volume data retrieved successfully


In [17]:

df_prices = pd.DataFrame(list(prices.items()), columns=["item_code", "prices"])

In [18]:
df_prices

Unnamed: 0,item_code,prices
0,P0001,22.14
1,P0002,26.91
2,P0003,16.90
3,P0004,7.04
4,P0005,20.84
...,...,...
1995,P1996,25.80
1996,P1997,26.05
1997,P1998,17.41
1998,P1999,7.57


#### Aggregate

In [22]:
data1 = pd.merge(data, df_prices, on = 'item_code',how="inner")
#data1 = pd.merge(data, df_prices, left_index=True, right_index=True, how='left')
#data1 = data1.dropna()

In [23]:
data1.columns

Index(['item_code', 'store_name', 'mass', 'dimension_length',
       'dimension_width', 'dimension_height', 'days_since_last_purchase',
       'package_volume', 'stock_age', 'quantity_sold', 'last_modified',
       'prices'],
      dtype='object')

In [25]:
get_simple_baseline(data1, fillna_value=-1, drop_cols=['item_code','store_name','last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

(np.float64(44.56862010545782),
 array([184.37786245, 173.54148685, 211.66142691, 200.86126572,
        194.31998664, 201.14221962, 205.27340062, 236.40712175,
        200.59613854, 187.50383306, 203.2354151 , 204.63928724,
        201.70652708, 235.4921292 , 195.88423722, 204.94476219,
        212.54480999, 200.38281906, 198.72195575, 194.10450857,
        173.22262111, 182.68645375, 191.2893341 , 202.17828734,
        176.38834714, 192.69630968, 181.79833993, 205.75224042,
        194.12280326, 216.86594446, 184.64192043, 176.22936751,
        222.09011879, 183.88346947, 223.4527046 , 200.99109467,
        198.28308326, 194.6554769 , 220.21975981, 191.30109689,
        192.81461374, 205.96900706, 203.10177858, 234.76266562,
        194.92232851, 194.23727443, 194.60714095, 195.50987144,
        197.77897724, 192.22641196, 210.6939744 , 226.29137266,
        189.46859255, 220.26512974, 176.11587106, 208.68662197,
        178.78588133, 211.72442294, 221.59770098, 180.28683323,
        

### Scrapping sources

In [29]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up the Selenium WebDriver (e.g., Chrome)
driver = webdriver.Chrome()  # Make sure ChromeDriver is installed
#driver = webdriver.Firefox()
# driver = webdriver.Edge()
# driver = webdriver.Safari()

# Open the URL
url = 'https://www.raphaelcousin.com/module4/scrapable-data'
driver.get(url)

# Wait for the page to fully load (increase time if needed)
time.sleep(5)

# Get the fully rendered page source
html = driver.page_source

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Initialize lists to store scraped data
exercise_data = []

# Find both tables
tables = soup.find_all('table')

# Close the Selenium WebDriver
driver.quit()

# Scrape the second table (Exercise Data)
course_table = tables[1]

course_table = tables[1]
for row in course_table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    x = [col.get_text(strip=True) for col in cols]    
    exercise_data.append(x)

print(exercise_data)
# Construire le DataFrame avec toutes les colonnes



[['P0001', '2', '972', '1728492'], ['P0002', '3', '260', '1728533'], ['P0003', '2', '285', '1728581'], ['P0004', '5', '512', '1728505'], ['P0005', '3', '85', '1728489'], ['P0006', '1', '849', '1728577'], ['P0007', '3', '429', '1728524'], ['P0008', '5', '580', '1728508'], ['P0009', '5', '30', '1728570'], ['P0010', '2', '709', '1728538'], ['P0011', '2', '602', '1728533'], ['P0012', '4', '439', '1728519'], ['P0013', '3', '984', '1728506'], ['P0014', '4', '348', '1728572'], ['P0015', '5', '894', '1728568'], ['P0016', '2', '411', '1728493'], ['P0017', '2', '527', '1728484'], ['P0018', '2', '309', '1728512'], ['P0019', '1', '948', '1728509'], ['P0020', '5', '395', '1728517'], ['P0021', '2', '787', '1728560'], ['P0022', '5', '337', '1728522'], ['P0023', '2', '673', '1728538'], ['P0024', '3', '436', '1728485'], ['P0025', '5', '703', '1728513'], ['P0026', '4', '662', '1728506'], ['P0027', '1', '364', '1728508'], ['P0028', '4', '922', '1728510'], ['P0029', '5', '112', '1728563'], ['P0030', '2', 

In [30]:
# Convert the lists to pandas DataFrames
df_exercise = pd.DataFrame(exercise_data)
df_exercise
df_exercise.rename(columns={0: "item_code",1:"1",2:"2",3:"3"}, inplace=True)
df_exercise = df_exercise.astype({"1":float,"2":float,"3":float})   



In [31]:
df_exercise

Unnamed: 0,item_code,1,2,3
0,P0001,2.0,972.0,1728492.0
1,P0002,3.0,260.0,1728533.0
2,P0003,2.0,285.0,1728581.0
3,P0004,5.0,512.0,1728505.0
4,P0005,3.0,85.0,1728489.0
...,...,...,...,...
1995,P1996,5.0,512.0,1728531.0
1996,P1997,2.0,989.0,1728504.0
1997,P1998,2.0,440.0,1728574.0
1998,P1999,4.0,37.0,1728483.0


#### Aggregate

In [32]:
data2 = pd.merge(data1, df_exercise, on='item_code',  how='inner')

In [None]:
data2

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified,prices,1,2,3
0,P0006,Greenfield_Grocers,5.02,86.68,71.64,16.42,66.0,101964.180384,450.0,130,2023-01-06,15.53,1.0,849.0,1728564.0
1,P0014,Greenfield_Grocers,9.91,21.67,54.91,84.80,99.0,100903.494560,827.0,118,2023-01-14,28.94,4.0,348.0,1728498.0
2,P0016,Greenfield_Grocers,1.13,60.03,97.39,90.00,348.0,526168.953000,981.0,136,2023-01-16,15.10,2.0,411.0,1728577.0
3,P0021,Greenfield_Grocers,0.95,40.36,67.91,99.76,301.0,273426.956576,289.0,155,2023-01-21,3.35,2.0,787.0,1728503.0
4,P0028,Greenfield_Grocers,5.24,22.37,61.78,68.67,15.0,94903.217262,423.0,177,2023-01-28,15.06,4.0,922.0,1728580.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,P1976,,,,,,,,,236,,18.69,5.0,574.0,1728488.0
1587,P1980,,,,,,,,,192,,17.20,4.0,624.0,1728568.0
1588,P1986,,,,,,,,,193,,20.91,2.0,818.0,1728533.0
1589,P1993,,,,,,,,,185,,17.35,4.0,121.0,1728504.0


In [33]:
get_simple_baseline(data2, fillna_value=-1, k_fold=5, drop_cols= ['item_code','store_name','last_modified'], scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

(np.float64(41.07169954187501),
 array([183.1983382 , 136.68825702, 258.13396742, 189.82437079,
        215.01483136, 168.3276732 , 183.95239172, 204.28737979,
        179.1597104 , 140.19664521, 185.39590673, 227.93087725,
        201.621218  , 249.38902034, 176.22489763, 181.06441781,
        231.19297207, 219.56853479, 213.58982301, 158.07870969,
        182.99033368, 167.04479681, 211.17687896, 194.42343826,
        211.06530232, 208.45443961, 143.97559783, 213.17083751,
        237.50068482, 194.98826974, 175.04672816, 193.48094802,
        208.609077  , 169.81115051, 254.47313995, 229.13158163,
        168.24800559, 193.19614811, 224.84036798, 164.98067594,
        207.50721905, 206.8365251 , 244.53829762, 252.9382319 ,
        165.83306273, 166.4548102 , 180.84415157, 196.73846824,
        227.56031102, 235.13367724, 229.88699964, 211.80660401,
        175.70263983, 234.26646702, 161.05476776, 194.28722081,
        165.10961078, 217.83424207, 227.65234907, 205.31208079,
        

### Generating Submission File

In [41]:


df_StoreN =  pd.read_csv("Neighborhood_Market_data.csv", sep=",", index_col='item_code')
df_StoreN

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02
P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04
P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05
P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,2023-01-10
P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13
...,...,...,...,...,...,...,...,...,...
P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,2028-05-25
P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,2028-05-30
P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,2028-06-13
P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,2028-06-19


In [42]:
#df_StoreN = pd.merge(df_StoreN, df_prices, left_index=True, right_index=True, how='left')
#df_StoreN = pd.merge(df_StoreN, df_exercise, left_index=True, right_index=True, how='left')
df_StoreN = pd.merge(df_StoreN, df_prices, on = 'item_code', how='inner')
df_StoreN = pd.merge(df_StoreN, df_exercise, on ='item_code', how='inner')

In [43]:
df_StoreN.columns

Index(['item_code', 'store_name', 'mass', 'dimension_length',
       'dimension_width', 'dimension_height', 'days_since_last_purchase',
       'package_volume', 'stock_age', 'last_modified', 'prices', '1', '2',
       '3'],
      dtype='object')

In [45]:
df_StoreN

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified,prices,1,2,3
0,P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02,26.91,3.0,260.0,1728533.0
1,P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04,7.04,5.0,512.0,1728505.0
2,P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05,20.84,3.0,85.0,1728489.0
3,P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,2023-01-10,10.00,2.0,709.0,1728538.0
4,P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13,12.98,3.0,984.0,1728506.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,2028-05-25,15.88,2.0,854.0,1728513.0
405,P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,2028-05-30,16.21,1.0,991.0,1728554.0
406,P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,2028-06-13,15.32,1.0,572.0,1728489.0
407,P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,2028-06-19,26.05,2.0,989.0,1728504.0


In [47]:
_, x_pred = get_simple_baseline(data2, fillna_value=-1, drop_cols=['item_code','store_name','last_modified'], k_fold=5, scaler='standard',target_col=['quantity_sold'], model='linear', metric='mae', X_data_test = df_StoreN)

In [52]:
print(x_pred.flatten())

[162.38533478 241.65612616 159.91123488 225.45430971 235.74267069
 196.8645821  182.39470092 184.60235913 193.01675845 192.13414134
 222.6570519  201.77214211 228.87893606 214.55964885 231.99012567
 208.08911959 208.56702916 177.82912628 229.57016279 207.31642659
 219.89940997 166.84675137 241.21758887 181.85270124 210.93411266
 162.57456628 298.83993471 211.95653853 163.47671049 173.04512671
 209.18257849 245.73502353 181.43619825 191.18668981 232.47281672
 216.67800796 194.72406133 191.28229476 250.1116177  194.5797919
 199.79741055 172.18396144 178.17080776 149.50147861 182.97306665
 196.10214153 275.72129117 198.79677505 184.77215913 194.17364847
 227.8181037  202.27510051 217.61050338 177.89891834 209.90374832
 215.17640101 170.62371747 218.03399212 208.246128   155.01886096
 173.02637353 217.52828709 164.06063313 203.59815614 156.40133269
 192.25887786 258.32199895 214.35052182 235.59590095 187.23558868
 243.64215349 168.10237836 161.29054813 203.34436708 210.79362335
 190.752481

In [53]:
submission = pd.DataFrame({
    'item_code': df_StoreN.index,
    'quantity_sold': x_pred.flatten() # your_prediction
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,item_code,quantity_sold
0,0,162.385335
1,1,241.656126
2,2,159.911235
3,3,225.45431
4,4,235.742671
