In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import requests

## Data Collection

### Files sources

In [13]:
# URLs of the files
train_datas_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_datas_url, 'module4_exercise_train.zip')
download_file(test_data_url, 'Neighborhood_Market_data.csv')

Downloaded module4_exercise_train.zip from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip
Downloaded Neighborhood_Market_data.csv from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv


#### CityMart

In [14]:
# read "CityMart_data.csv"
citymart_pd = pd.read_csv('./module4_exercise_train/CityMart_data.csv')

citymart_pd.head()

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
0,P0019,CityMart,2.81,26.83,38.75,24.89,323,25877.199625,253,202,2023-01-19
1,P0024,CityMart,3.3,59.23,34.99,21.78,321,45138.128706,21,225,2023-01-24
2,P0025,CityMart,2.34,22.6,16.9,60.12,291,22962.2328,316,278,2023-01-25
3,P0034,CityMart,6.54,18.59,68.72,21.99,126,28092.330552,612,233,2023-02-03
4,P0039,CityMart,9.94,57.89,88.33,35.45,312,181270.870165,968,203,2023-02-08


#### Greenfield_Grocers

In [15]:
# read "Greenfield_Grocers_data.csv"
greenfield_grocers_pd = pd.read_csv('./module4_exercise_train/Greenfield_Grocers_data.csv', 
                                    delimiter='|',
                                    header=3
)

greenfield_grocers_pd.head()

Unnamed: 0,ITEM_CODE,STORE_NAME,MASS,DIMENSION_LENGTH,DIMENSION_WIDTH,DIMENSION_HEIGHT,DAYS_SINCE_LAST_PURCHASE,PACKAGE_VOLUME,STOCK_AGE,QUANTITY_SOLD,LAST_MODIFIED,1,Unnamed: 12
0,P0006,Greenfield_Grocers,5.02,86.68,71.64,16.42,66,101964.180384,450,130,2023-01-06,,
1,P0014,Greenfield_Grocers,9.91,21.67,54.91,84.8,99,100903.49456,827,118,2023-01-14,,
2,P0016,Greenfield_Grocers,1.13,60.03,97.39,90.0,348,526168.953,981,136,2023-01-16,,
3,P0021,Greenfield_Grocers,0.95,40.36,67.91,99.76,301,273426.956576,289,155,2023-01-21,,
4,P0028,Greenfield_Grocers,5.24,22.37,61.78,68.67,15,94903.217262,423,177,2023-01-28,,


#### Outlet_data

In [16]:
# read "SuperSaver_Outlet_data.xlsx"
supersaver_outlet_pd = pd.read_excel('./module4_exercise_train/SuperSaver_Outlet_data.xlsx')

supersaver_outlet_pd.head()


Unnamed: 0,item_code,quantity_sold
0,P0003,198
1,P0007,211
2,P0008,200
3,P0009,209
4,P0012,186


#### HighStreet_Bazaar

In [17]:
# read 'HighStreet_Bazaar_data.json'
highstreet_bazaar_pd = pd.read_json('./module4_exercise_train/HighStreet_Bazaar_data.json')
highstreet_bazaar_pd.head()

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified
0,P0001,HighStreet_Bazaar,6.11,75.46,91.62,92.08,78.0,636608.450016,237,346,1672531200000
1,P0011,HighStreet_Bazaar,4.34,16.97,93.21,54.58,344.0,86333.208546,184,218,1673395200000
2,P0015,HighStreet_Bazaar,1.37,58.93,11.28,73.87,344.0,49103.634648,946,315,1673740800000
3,P0017,HighStreet_Bazaar,7.27,51.51,39.21,16.17,138.0,32658.663807,268,228,1673913600000
4,P0020,HighStreet_Bazaar,0.89,57.5,69.84,6.12,333.0,24576.696,396,228,1674172800000


#### Aggregate

In [18]:
concat_df = pd.concat(
    [citymart_pd, greenfield_grocers_pd, supersaver_outlet_pd, highstreet_bazaar_pd], 
    axis=0, 
    ignore_index=True
)
concat_df

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,...,DIMENSION_LENGTH,DIMENSION_WIDTH,DIMENSION_HEIGHT,DAYS_SINCE_LAST_PURCHASE,PACKAGE_VOLUME,STOCK_AGE,QUANTITY_SOLD,LAST_MODIFIED,1,Unnamed: 12
0,P0019,CityMart,2.81,26.83,38.75,24.89,323.0,25877.199625,253.0,202.0,...,,,,,,,,,,
1,P0024,CityMart,3.30,59.23,34.99,21.78,321.0,45138.128706,21.0,225.0,...,,,,,,,,,,
2,P0025,CityMart,2.34,22.60,16.90,60.12,291.0,22962.232800,316.0,278.0,...,,,,,,,,,,
3,P0034,CityMart,6.54,18.59,68.72,21.99,126.0,28092.330552,612.0,233.0,...,,,,,,,,,,
4,P0039,CityMart,9.94,57.89,88.33,35.45,312.0,181270.870165,968.0,203.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637.0,235.0,...,,,,,,,,,,
1587,P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60.0,241.0,...,,,,,,,,,,
1588,P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.104900,925.0,237.0,...,,,,,,,,,,
1589,P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317.0,256.0,...,,,,,,,,,,


#### Simple baseline

In [19]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):
    
    data = data.copy()
    # Handle missing values
    data.fillna(fillna_value, inplace=True)
    if X_data_test is not None:
        X_data_test = X_data_test.copy()
        X_data_test.fillna(fillna_value, inplace=True)
    
    # Drop unwanted columns
    if drop_cols:
        data.drop(drop_cols, axis=1, inplace=True)
        if X_data_test is not None:
            X_data_test.drop(drop_cols, axis=1, inplace=True)

    # Split data into features (X) and target (y)
    y = data[target_col]
    X = data.drop(target_col, axis=1)

    # Feature scaling
    if scaler == 'standard':
        scaler = StandardScaler()
    elif scaler == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = None
    
    if scaler:
        X = scaler.fit_transform(X)
        if X_data_test is not None:
            X_data_test = scaler.transform(X_data_test)

    # Initialize the model
    if model == 'linear':
        model = LinearRegression()
    elif model == 'logistic':
        model = LogisticRegression()
    elif model == 'random_forest':
        model = RandomForestClassifier()
    else:
        raise ValueError("Unsupported model type")

    # Initialize cross-validation
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    # Train and evaluate using k-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate using the specified metric
        if metric == 'mae':
            score = mean_absolute_error(y_test, y_pred)
        elif metric == 'accuracy':
            score = accuracy_score(y_test, np.round(y_pred))
        else:
            raise ValueError("Unsupported metric")

        scores.append(score)

    if X_data_test is not None:
        model.fit(X, y)
        return np.mean(scores), model.predict(X_data_test)
    
    # Return the average score
    return np.mean(scores)

In [20]:
data = concat_df.copy()
bad_cols = [
    'ITEM_CODE', 'STORE_NAME', 'LAST_MODIFIED', 'QUANTITY_SOLD',    # doublons en MAJ
    'Unnamed: 12', '1' 
]
# data.drop(columns=bad_cols, errors='ignore')
data = data.drop(columns=[c for c in bad_cols if c in data.columns], errors='ignore')

# score1, y_pred1 = get_simple_baseline(data, fillna_value=-1, drop_cols=['item_code', 'store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold', X_data_test=data.drop('quantity_sold', axis=1))
score1 = get_simple_baseline(data, fillna_value=-1, drop_cols=['item_code', 'store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')
score1

28.380030900239213

In [21]:
data

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified,MASS,DIMENSION_LENGTH,DIMENSION_WIDTH,DIMENSION_HEIGHT,DAYS_SINCE_LAST_PURCHASE,PACKAGE_VOLUME,STOCK_AGE
0,P0019,CityMart,2.81,26.83,38.75,24.89,323.0,25877.199625,253.0,202.0,2023-01-19,,,,,,,
1,P0024,CityMart,3.30,59.23,34.99,21.78,321.0,45138.128706,21.0,225.0,2023-01-24,,,,,,,
2,P0025,CityMart,2.34,22.60,16.90,60.12,291.0,22962.232800,316.0,278.0,2023-01-25,,,,,,,
3,P0034,CityMart,6.54,18.59,68.72,21.99,126.0,28092.330552,612.0,233.0,2023-02-03,,,,,,,
4,P0039,CityMart,9.94,57.89,88.33,35.45,312.0,181270.870165,968.0,203.0,2023-02-08,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637.0,235.0,1843344000000,,,,,,,
1587,P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60.0,241.0,1843603200000,,,,,,,
1588,P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.104900,925.0,237.0,1843689600000,,,,,,,
1589,P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317.0,256.0,1844121600000,,,,,,,


### API sources

In [22]:
def get_api(endpoint_url):
    try:
        # Make the GET request to the mock API
        response = requests.get(endpoint_url)

        if response.status_code == 200:
            data = response.json()
            print(data["message"])
            return data['data']
        else:
            print(f"Failed to retrieve volume data. Status code: {response.status_code}")
    
    except Exception as e:
        print(f"An error occurred: {e}")
# password = 
# prices = 

In [23]:
get_api('https://www.raphaelcousin.com/api/exercise/auth')

Authentication successful


{'description': 'This data is for the authenticated course access.',
 'password': 'RcUZjhdsYLRzwi4'}

In [24]:
password = 'RcUZjhdsYLRzwi4'
endpoint_url = f'https://www.raphaelcousin.com/api/exercise/{password}/prices'
prices = get_api(endpoint_url)

Volume data retrieved successfully


#### Aggregate

In [25]:
columns = ['item_code', 'prices']
df_prices = pd.DataFrame(list(prices.items()), columns=columns)
df_prices

Unnamed: 0,item_code,prices
0,P0001,22.14
1,P0002,26.91
2,P0003,16.90
3,P0004,7.04
4,P0005,20.84
...,...,...
1995,P1996,25.80
1996,P1997,26.05
1997,P1998,17.41
1998,P1999,7.57


In [27]:
data = pd.merge(data, df_prices, on='item_code', how='left')
data

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified,MASS,DIMENSION_LENGTH,DIMENSION_WIDTH,DIMENSION_HEIGHT,DAYS_SINCE_LAST_PURCHASE,PACKAGE_VOLUME,STOCK_AGE,prices
0,P0019,CityMart,2.81,26.83,38.75,24.89,323.0,25877.199625,253.0,202.0,2023-01-19,,,,,,,,17.86
1,P0024,CityMart,3.30,59.23,34.99,21.78,321.0,45138.128706,21.0,225.0,2023-01-24,,,,,,,,14.04
2,P0025,CityMart,2.34,22.60,16.90,60.12,291.0,22962.232800,316.0,278.0,2023-01-25,,,,,,,,3.38
3,P0034,CityMart,6.54,18.59,68.72,21.99,126.0,28092.330552,612.0,233.0,2023-02-03,,,,,,,,15.11
4,P0039,CityMart,9.94,57.89,88.33,35.45,312.0,181270.870165,968.0,203.0,2023-02-08,,,,,,,,14.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637.0,235.0,1843344000000,,,,,,,,15.85
1587,P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60.0,241.0,1843603200000,,,,,,,,17.98
1588,P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.104900,925.0,237.0,1843689600000,,,,,,,,17.75
1589,P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317.0,256.0,1844121600000,,,,,,,,22.68


In [28]:
score2 = get_simple_baseline(data, fillna_value=-1, drop_cols=['item_code', 'store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

if score2 < score1:
    print("Amélioration du modèle !")
else:
    print("Modèle moins bien...")
score2

Amélioration du modèle !


28.28899496666594

### Scrapping sources

In [29]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up the Selenium WebDriver (e.g., Chrome)
driver = webdriver.Chrome()  # Make sure ChromeDriver is installed
# driver = webdriver.Firefox()
# driver = webdriver.Edge()
# driver = webdriver.Safari()

# Open the URL
url = 'https://www.raphaelcousin.com/module4/scrapable-data'
driver.get(url)

# Wait for the page to fully load (increase time if needed)
time.sleep(5)

# Get the fully rendered page source
html = driver.page_source

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Initialize lists to store scraped data
exercise_data = []

# Find both tables
tables = soup.find_all('table')

# Close the Selenium WebDriver
driver.quit()

# Scrape the second table (Exercise Data)
course_table = tables[1]
for row in course_table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    exercise_data.append([col.get_text() for col in cols])

# Convert the lists to pandas DataFrames
df_exercise = pd.DataFrame(exercise_data, columns=['item_code', 'customer_score', 'total_reviews', 'updated_timestamp'])
df_exercise


Unnamed: 0,item_code,customer_score,total_reviews,updated_timestamp
0,P0001,2,972,1728552
1,P0002,3,260,1728538
2,P0003,2,285,1728559
3,P0004,5,512,1728498
4,P0005,3,85,1728527
...,...,...,...,...
1995,P1996,5,512,1728536
1996,P1997,2,989,1728574
1997,P1998,2,440,1728492
1998,P1999,4,37,1728576


#### Aggregate

In [30]:
data

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,last_modified,MASS,DIMENSION_LENGTH,DIMENSION_WIDTH,DIMENSION_HEIGHT,DAYS_SINCE_LAST_PURCHASE,PACKAGE_VOLUME,STOCK_AGE,prices
0,P0019,CityMart,2.81,26.83,38.75,24.89,323.0,25877.199625,253.0,202.0,2023-01-19,,,,,,,,17.86
1,P0024,CityMart,3.30,59.23,34.99,21.78,321.0,45138.128706,21.0,225.0,2023-01-24,,,,,,,,14.04
2,P0025,CityMart,2.34,22.60,16.90,60.12,291.0,22962.232800,316.0,278.0,2023-01-25,,,,,,,,3.38
3,P0034,CityMart,6.54,18.59,68.72,21.99,126.0,28092.330552,612.0,233.0,2023-02-03,,,,,,,,15.11
4,P0039,CityMart,9.94,57.89,88.33,35.45,312.0,181270.870165,968.0,203.0,2023-02-08,,,,,,,,14.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637.0,235.0,1843344000000,,,,,,,,15.85
1587,P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60.0,241.0,1843603200000,,,,,,,,17.98
1588,P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.104900,925.0,237.0,1843689600000,,,,,,,,17.75
1589,P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317.0,256.0,1844121600000,,,,,,,,22.68


In [31]:
data = pd.merge(data, df_exercise, on='item_code', how='left')
data

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,quantity_sold,...,DIMENSION_LENGTH,DIMENSION_WIDTH,DIMENSION_HEIGHT,DAYS_SINCE_LAST_PURCHASE,PACKAGE_VOLUME,STOCK_AGE,prices,customer_score,total_reviews,updated_timestamp
0,P0019,CityMart,2.81,26.83,38.75,24.89,323.0,25877.199625,253.0,202.0,...,,,,,,,17.86,1,948,1728570
1,P0024,CityMart,3.30,59.23,34.99,21.78,321.0,45138.128706,21.0,225.0,...,,,,,,,14.04,3,436,1728485
2,P0025,CityMart,2.34,22.60,16.90,60.12,291.0,22962.232800,316.0,278.0,...,,,,,,,3.38,5,703,1728537
3,P0034,CityMart,6.54,18.59,68.72,21.99,126.0,28092.330552,612.0,233.0,...,,,,,,,15.11,2,935,1728526
4,P0039,CityMart,9.94,57.89,88.33,35.45,312.0,181270.870165,968.0,203.0,...,,,,,,,14.20,2,188,1728520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,P1978,HighStreet_Bazaar,6.28,72.48,47.71,83.54,322.0,288883.057632,637.0,235.0,...,,,,,,,15.85,3,306,1728552
1587,P1981,HighStreet_Bazaar,0.91,31.13,55.22,71.22,342.0,122427.080292,60.0,241.0,...,,,,,,,17.98,5,542,1728501
1588,P1982,HighStreet_Bazaar,4.65,65.72,82.01,60.75,192.0,327424.104900,925.0,237.0,...,,,,,,,17.75,2,6,1728540
1589,P1987,HighStreet_Bazaar,4.64,93.41,52.53,41.02,131.0,201278.055846,317.0,256.0,...,,,,,,,22.68,2,754,1728542


In [32]:
score3 = get_simple_baseline(data, fillna_value=-1, drop_cols=['item_code', 'store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

if score3 < score2:
    print("Amélioration du modèle !")
elif score3 < score1:
    print("Moins bien que mmodèle 2, mais mieux que modèle 1.")
else:
    print("Modèle moins bien...")
score3

Amélioration du modèle !


15.521904880101488

### Generating Submission File

In [33]:
# X_test =  read  Neighborhood_Market_data

# read
df_StoreN =  pd.read_csv("Neighborhood_Market_data.csv", sep=",", index_col='item_code')
df_StoreN

Unnamed: 0_level_0,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02
P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04
P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05
P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,2023-01-10
P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13
...,...,...,...,...,...,...,...,...,...
P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,2028-05-25
P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,2028-05-30
P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,2028-06-13
P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,2028-06-19


In [34]:
data.columns = data.columns.str.lower()
df_StoreN = pd.merge(df_StoreN, df_prices, on='item_code', how='left')
df_StoreN = pd.merge(df_StoreN, df_exercise, on='item_code', how='left')
df_StoreN = df_StoreN[data.columns.drop('quantity_sold')]
df_StoreN

Unnamed: 0,item_code,store_name,mass,dimension_length,dimension_width,dimension_height,days_since_last_purchase,package_volume,stock_age,last_modified,...,dimension_length.1,dimension_width.1,dimension_height.1,days_since_last_purchase.1,package_volume.1,stock_age.1,prices,customer_score,total_reviews,updated_timestamp
0,P0002,Neighborhood_Market,5.51,51.79,46.72,72.02,344,174261.666176,287,2023-01-02,...,51.79,46.72,72.02,344,174261.666176,287,26.91,3,260,1728538
1,P0004,Neighborhood_Market,3.97,84.63,39.42,42.46,189,141651.425916,387,2023-01-04,...,84.63,39.42,42.46,189,141651.425916,387,7.04,5,512,1728498
2,P0005,Neighborhood_Market,5.99,39.33,83.51,5.12,183,16816.375296,382,2023-01-05,...,39.33,83.51,5.12,183,16816.375296,382,20.84,3,85,1728527
3,P0010,Neighborhood_Market,4.10,77.43,49.56,74.41,208,285543.225828,656,2023-01-10,...,77.43,49.56,74.41,208,285543.225828,656,10.00,2,709,1728555
4,P0013,Neighborhood_Market,6.96,95.39,34.61,23.24,114,76725.649196,755,2023-01-13,...,95.39,34.61,23.24,114,76725.649196,755,12.98,3,984,1728573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,P1972,Neighborhood_Market,5.68,17.89,71.87,95.67,296,123008.113881,411,2028-05-25,...,17.89,71.87,95.67,296,123008.113881,411,15.88,2,854,1728570
405,P1977,Neighborhood_Market,1.53,84.32,64.91,66.30,171,362873.902560,702,2028-05-30,...,84.32,64.91,66.30,171,362873.902560,702,16.21,1,991,1728541
406,P1991,Neighborhood_Market,7.79,84.46,82.96,52.14,50,365334.635424,190,2028-06-13,...,84.46,82.96,52.14,50,365334.635424,190,15.32,1,572,1728517
407,P1997,Neighborhood_Market,8.91,66.50,5.79,41.11,336,15828.788850,177,2028-06-19,...,66.50,5.79,41.11,336,15828.788850,177,26.05,2,989,1728574


In [35]:
score_test, x_pred = get_simple_baseline(data, fillna_value=-1, drop_cols=['item_code', 'store_name', 'last_modified'], k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold', X_data_test = df_StoreN)
score_test

15.521904880101488

In [36]:
df_StoreN.index

RangeIndex(start=0, stop=409, step=1)

In [37]:
x_pred

array([196.63061527, 278.56920531, 192.83927727, 260.6735896 ,
       271.8800782 , 233.88914835, 220.56265521, 212.69657199,
       234.5110337 , 233.68580615, 262.1865782 , 238.25981396,
       258.8993136 , 250.86053982, 262.05274779, 242.02447551,
       235.16418025, 205.20325779, 266.01467704, 237.34472909,
       245.95674245, 201.70445072, 281.44231453, 217.30232123,
       244.67346142, 192.78968284, 332.91485812, 249.36183725,
       195.71876982, 210.46922325, 249.19057291, 280.05597448,
       212.8808512 , 230.86100931, 283.36141414, 251.16254297,
       238.74694031, 213.3331117 , 289.80536176, 227.19683603,
       229.73084225, 202.13096446, 206.96017497, 184.02066923,
       216.6281207 , 240.24417033, 314.38269615, 228.69163944,
       219.39697259, 224.49595808, 267.77881808, 235.27200682,
       257.15910279, 204.20361855, 253.66671921, 247.43242212,
       198.60380865, 255.11205002, 240.74850528, 181.74812648,
       208.26675667, 251.79577765, 194.78281417, 244.66

In [38]:
submission = pd.DataFrame({
    'item_code': df_StoreN.index,
    'quantity_sold': x_pred # your_prediction
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,item_code,quantity_sold
0,0,196.630615
1,1,278.569205
2,2,192.839277
3,3,260.67359
4,4,271.880078
