In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import requests

## Data Collection

### File sources

In [None]:
# Remote locations of the exercise data: the zipped training archive and the
# Neighborhood Market CSV (read further down and passed to the baseline as test data).
train_datas_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv'

def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    file_name : str
        Local path the bytes are written to (overwritten if it exists).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block forever on a stalled connection.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # fail loudly instead of saving an error page
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both files into the working directory; later cells read them from these paths.
download_file(train_datas_url, 'module4_exercise_train.zip')
download_file(test_data_url, 'Neighborhood_Market_data.csv')

In [None]:
import zipfile
import os
# Extract every member of the training archive into its own folder.
with zipfile.ZipFile('module4_exercise_train.zip') as archive:  # default mode is read-only
    archive.extractall(path='module4_exercise_train')
print('Unzipped training data to module4_exercise_train/')

#### CityMart

In [None]:
# Load the CityMart export: the first CSV column becomes the index,
# "last_modified" is parsed as a datetime and "store_name" is stored as a category.
citymart_data = pd.read_csv(
    "module4_exercise_train/CityMart_data.csv",
    dtype={"store_name": "category"},
    parse_dates=["last_modified"],
    index_col=0,
)

display(citymart_data)

In [None]:
# Column dtypes, non-null counts and memory usage of the CityMart frame.
citymart_data.info()

In [None]:
# Summary statistics of the CityMart columns.
citymart_data.describe()

In [None]:
# Per-column dtypes after the explicit category/datetime parsing done at load time.
citymart_data.dtypes


#### Greenfield_Grocers

In [None]:
# First look at the pipe-delimited Greenfield file; the next cell re-reads it and repairs the header.
greenfield_data = pd.read_csv('module4_exercise_train/Greenfield_Grocers_data.csv',sep="|")


In [None]:
# Re-read the pipe-delimited export. The usable column names sit in the data row
# labelled 2, so that row (lower-cased) is promoted to the header and rows 0-2 are dropped.
greenfield_data = pd.read_csv('module4_exercise_train/Greenfield_Grocers_data.csv', sep="|")
header_names = list(pd.Index(greenfield_data.iloc[2]).str.lower())
greenfield_data = greenfield_data.drop(index=[0, 1, 2])
greenfield_data.columns = header_names

# Index by item code and keep the columns in a fixed order.
kept_columns = [
    'store_name',
    'mass',
    'dimension_length',
    'dimension_width',
    'dimension_height',
    'days_since_last_purchase',
    'package_volume',
    'stock_age',
    'quantity_sold',
    'last_modified',
]
greenfield_data = greenfield_data.set_index("item_code")[kept_columns]
display(greenfield_data)

In [None]:
# Structural summary (printed by info) followed by descriptive statistics (the cell's displayed value).
greenfield_data.info()
greenfield_data.describe()

In [None]:

print(greenfield_data.columns)

# Convert the measurement columns to numeric dtypes.
# errors="coerce" turns unparseable entries into NaN; the deprecated errors="ignore"
# would silently leave the *whole* column as strings as soon as one value fails to parse.
greenfield_numeric_cols = [
    'mass', 'dimension_length', 'dimension_width',
    'dimension_height', 'days_since_last_purchase', 'package_volume',
    'stock_age', 'quantity_sold',
]
# Work on an explicit copy: the frame was produced by a column selection, and assigning
# into it directly can trigger SettingWithCopyWarning.
greenfield_data = greenfield_data.copy()
greenfield_data[greenfield_numeric_cols] = greenfield_data[greenfield_numeric_cols].apply(
    pd.to_numeric, errors="coerce"
)
greenfield_data.dtypes


#### SuperSaver_Outlet

In [None]:
# Load every sheet of "SuperSaver_Outlet_data.xlsx" at once:
# sheet_name=None makes read_excel return a dict of DataFrames keyed by sheet name.
supersaver_data = pd.read_excel(
    'module4_exercise_train/SuperSaver_Outlet_data.xlsx',
    sheet_name=None,
)

supersaver_data_quantity = supersaver_data["Quantity"]
supersaver_data_info = supersaver_data["Info"]
display(supersaver_data_info, supersaver_data_quantity)


In [None]:
# Final names for the first nine columns of the "Info" sheet (assigned by position).
supersaver_info_columns = [
    'item_code', 'store_name', 'mass',
    'dimension_length', 'dimension_width', 'dimension_height',
    'days_since_last_purchase', 'package_volume', 'stock_age',
]

# Always start from the sheet as loaded, so the cell can be re-run safely
# (renaming an already-trimmed frame would fail with a length mismatch).
info_sheet = supersaver_data["Info"]
if info_sheet.shape[1] != len(supersaver_info_columns) + 1:
    raise ValueError(
        f"Expected {len(supersaver_info_columns) + 1} columns in the 'Info' sheet, "
        f"found {info_sheet.shape[1]}"
    )

# Keep the nine named columns; the tenth column of the sheet is dropped.
supersaver_data_info = info_sheet.iloc[:, :len(supersaver_info_columns)].copy()
supersaver_data_info.columns = supersaver_info_columns

# Normalize the "Quantity" sheet's column names (trim, spaces -> underscores).
supersaver_data_quantity = supersaver_data["Quantity"].rename(
    columns=lambda c: c.strip().replace(" ", "_")
)

display(supersaver_data_info)


In [None]:
# Show the quantity sheet after its column names were normalized.
display(supersaver_data_quantity)

In [None]:
# Join the two sheets on their shared key. A left join keeps every item of the
# "info" frame, even when it has no matching quantity row.
supersaver_data_final = pd.merge(
    supersaver_data_info,
    supersaver_data_quantity,
    how="left",
    on="item_code",
).set_index("item_code")

display(supersaver_data_final)
print(supersaver_data_final.shape)


In [None]:

# Convert every column that parses cleanly as numbers; leave the others untouched.
# This spells out what the deprecated errors="ignore" option used to do implicitly.
for col in supersaver_data_final.columns:
    try:
        supersaver_data_final[col] = pd.to_numeric(supersaver_data_final[col])
    except (ValueError, TypeError):
        # Column holds values that are not numbers: keep it as it is.
        continue
supersaver_data_final.dtypes


#### HighStreet_Bazaar

In [None]:
# Load 'HighStreet_Bazaar_data.json' and index the records by item code in one chain.
highstreet_data = (
    pd.read_json('module4_exercise_train/HighStreet_Bazaar_data.json')
    .set_index("item_code")
)
display(highstreet_data)

In [None]:
# Per-column dtypes as inferred by read_json.
highstreet_data.dtypes

#### Aggregate

In [None]:

# Print the dtypes of the four store frames back to back so their schemas can be compared.
for store_frame in (citymart_data, greenfield_data, supersaver_data_final, highstreet_data):
    print(store_frame.dtypes)

In [None]:
# Same dtype report for the four store frames, emitted with a single print call
# (one frame per block, separated by newlines).
print(
    citymart_data.dtypes,
    greenfield_data.dtypes,
    supersaver_data_final.dtypes,
    highstreet_data.dtypes,
    sep="\n",
)

In [None]:
# Stack the four store frames row-wise into a single table.
store_frames = [citymart_data, greenfield_data, supersaver_data_final, highstreet_data]
data = pd.concat(store_frames, axis=0)
display(data)


In [None]:
# Columns of the combined frame (the union of the store frames' columns after concat).
data.columns

In [None]:
# Shapes of the individual store frames, to compare against the combined frame.
print("citymart_data shape:", citymart_data.shape)
print("greenfield_data shape:", greenfield_data.shape)
print("supersaver_data_final shape:", supersaver_data_final.shape)
print("highstreet_data shape:", highstreet_data.shape)

# Number of rows contributed per store_name value in the combined frame.
print("\nComposition de data par store_name:")
print(data['store_name'].value_counts())

# Count of rows whose target (quantity_sold) is missing.
print("\nNb de lignes sans cible (quantity_sold) dans data:")
print(data['quantity_sold'].isna().sum())


In [None]:
# Dtypes of the combined frame.
data.dtypes

#### Simple baseline

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):
    """Cross-validate a simple scikit-learn baseline and optionally predict on new rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Training table containing the features and ``target_col``. Not modified.
    fillna_value : scalar, default -1
        Value used to impute missing *feature* values (training and test).
    drop_cols : label or list of labels, optional
        Columns removed from ``data`` before modelling.
    k_fold : int, default 5
        Number of shuffled K-fold splits (``random_state=42``).
    scaler : {'standard', 'minmax'} or anything else, default 'standard'
        Feature scaling. Any other value disables scaling.
    model : {'linear', 'logistic', 'random_forest'}, default 'linear'
        Estimator to fit.
    metric : {'mae', 'accuracy'}, default 'mae'
        Score computed on each validation fold. For 'accuracy' the predictions
        are rounded before comparison.
    target_col : label
        Name of the target column in ``data``. Required.
    X_data_test : pandas.DataFrame, optional
        Rows to predict. Must contain every training feature column; extra
        columns are ignored and the column order is aligned to the training data.

    Returns
    -------
    float
        Mean validation score, when ``X_data_test`` is None.
    (float, numpy.ndarray)
        Mean validation score and predictions for ``X_data_test`` from a model
        refitted on all training rows, otherwise.

    Raises
    ------
    ValueError
        For a missing/unknown ``target_col``, an unsupported ``model`` or
        ``metric``, or when ``X_data_test`` lacks training feature columns.
    """
    if target_col is None or target_col not in data.columns:
        raise ValueError(f"target_col must name a column of data, got {target_col!r}")

    # Resolve the estimator, scaler and metric up front so bad arguments fail
    # before any fitting happens.
    if model == 'linear':
        make_model = LinearRegression
    elif model == 'logistic':
        make_model = LogisticRegression
    elif model == 'random_forest':
        make_model = RandomForestClassifier
    else:
        raise ValueError("Unsupported model type")

    if metric == 'mae':
        score_fn = mean_absolute_error
    elif metric == 'accuracy':
        def score_fn(y_true, y_hat):
            return accuracy_score(y_true, np.round(y_hat))
    else:
        raise ValueError("Unsupported metric")

    if scaler == 'standard':
        make_scaler = StandardScaler
    elif scaler == 'minmax':
        make_scaler = MinMaxScaler
    else:
        make_scaler = None

    # Drop unwanted columns first so they never need to be imputed.
    frame = data.drop(columns=drop_cols) if drop_cols else data.copy()

    # Rows without a target carry no supervision; imputing the target with the
    # sentinel would train the model on made-up labels.
    frame = frame.dropna(subset=[target_col])

    features = frame.drop(columns=[target_col]).fillna(fillna_value)
    feature_names = list(features.columns)
    # Plain arrays make positional fold indexing work with or without a scaler.
    X = features.to_numpy()
    y = frame[target_col].to_numpy()

    X_new = None
    if X_data_test is not None:
        missing = [c for c in feature_names if c not in X_data_test.columns]
        if missing:
            raise ValueError(f"X_data_test is missing feature columns: {missing}")
        # Select the training features in the training order.
        X_new = X_data_test[feature_names].fillna(fillna_value).to_numpy()

    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]

        # Fit the scaler on the training fold only, so validation rows do not
        # leak into the preprocessing statistics.
        if make_scaler is not None:
            fold_scaler = make_scaler()
            X_train = fold_scaler.fit_transform(X_train)
            X_val = fold_scaler.transform(X_val)

        fold_model = make_model()
        fold_model.fit(X_train, y[train_index])
        scores.append(score_fn(y[test_index], fold_model.predict(X_val)))

    mean_score = np.mean(scores)
    if X_new is None:
        return mean_score

    # Final model: refit preprocessing and estimator on every training row.
    if make_scaler is not None:
        final_scaler = make_scaler()
        X = final_scaler.fit_transform(X)
        X_new = final_scaler.transform(X_new)
    final_model = make_model()
    final_model.fit(X, y)
    return mean_score, final_model.predict(X_new)

In [None]:
# Baseline score using only the features that come from the store files.
get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

### API sources

In [None]:
def get_api(endpoint_url, timeout=30):
    """GET a JSON endpoint and return the value stored under its ``"data"`` key.

    The response's ``"message"`` field is printed. This is a best-effort
    helper: failures are reported on stdout and signalled by returning None.

    Parameters
    ----------
    endpoint_url : str
        URL to query.
    timeout : float, optional
        Seconds to wait for the server; avoids hanging on a stalled connection.

    Returns
    -------
    object or None
        ``response.json()["data"]`` on HTTP 200, otherwise None.
    """
    try:
        # Make the GET request to the mock API
        response = requests.get(endpoint_url, timeout=timeout)

        if response.status_code == 200:
            data = response.json()
            print(data["message"])
            return data['data']

        print(f"Failed to retrieve volume data. Status code: {response.status_code}")
        return None

    # Network problems, an invalid JSON body, or a payload without the expected keys.
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        print(f"An error occurred: {e}")
        return None
# Step 1: ask the auth endpoint for the credential that is part of the prices URL.
password = get_api("https://www.raphaelcousin.com/api/exercise/auth")
if password is None:
    raise RuntimeError("Could not retrieve the API password; see the message printed above.")

# The credential is deliberately not printed: notebook outputs are saved and shared with the file.
link = "https://www.raphaelcousin.com/api/exercise/" + password["password"] + "/prices"

# Step 2: fetch the prices payload.
prices = get_api(link)
if prices is None:
    raise RuntimeError("Could not retrieve the prices; see the message printed above.")

In [None]:
# Inspect the raw payload returned by the prices endpoint.
print(prices)

In [None]:
# Build a one-column frame from the price mapping: its keys become the index and
# the values fill the "price" column.
# NOTE(review): the index is later joined against the item-code index of `data` —
# confirm both use the same dtype (JSON keys are typically strings).
df_prices = pd.DataFrame.from_dict(prices, orient="index", columns=["price"])
print(df_prices)


#### Aggregate

In [None]:
# Attach the API prices by index (left join keeps every row of `data`).
# Any price column left over from a previous run is dropped first, so re-running
# this cell does not pile up price_x / price_y duplicates.
data = (
    data.drop(columns=list(df_prices.columns), errors="ignore")
    .merge(df_prices, left_index=True, right_index=True, how='left')
)
display(data)

In [None]:
# Baseline score again, now that the price column has been merged in.
get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

### Scraping sources

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up the Selenium WebDriver (e.g., Chrome)
driver = webdriver.Chrome()  # Make sure ChromeDriver is installed
# driver = webdriver.Firefox()
# driver = webdriver.Edge()
# driver = webdriver.Safari()

url = 'https://www.raphaelcousin.com/module4/scrapable-data'

try:
    # Open the URL
    driver.get(url)

    # Wait for the page to fully load (increase time if needed)
    time.sleep(5)

    # Get the fully rendered page source
    html = driver.page_source
finally:
    # Always close the browser, even if loading the page failed;
    # otherwise the browser process is left running.
    driver.quit()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Rows scraped from the exercise table, one dict per row
exercise_data = []

# Find both tables
tables = soup.find_all('table')
if len(tables) < 2:
    raise RuntimeError(
        f"Expected at least 2 tables on {url}, found {len(tables)}; "
        "the page may not have finished rendering."
    )

# Scrape the second table (Exercise Data)
course_table = tables[1]
for row in course_table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    # TODO: build one record per row from `cols`; until this append is filled in,
    # df_exercise below stays empty.
    #exercise_data.append({ TODO })

# Convert the scraped records to a pandas DataFrame
df_exercise = pd.DataFrame(exercise_data)
df_exercise


#### Aggregate

In [None]:
# Attach the scraped columns by index (left join keeps every row of `data`).
# Columns left over from a previous run are dropped first so the cell can be
# re-run without creating _x / _y duplicates.
data = (
    data.drop(columns=list(df_exercise.columns), errors="ignore")
    .merge(df_exercise, left_index=True, right_index=True, how='left')
)

In [None]:
# Baseline score after merging the scraped frame.
get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

### Generating Submission File

In [None]:
# Test set: the Neighborhood Market file downloaded at the top of the notebook.

# Read it with item_code as the index so it can be joined with the price/scraped frames by index.
df_StoreN =  pd.read_csv("Neighborhood_Market_data.csv", sep=",", index_col='item_code')
df_StoreN

In [None]:
# Give the test frame the same extra features as the training data.
# Columns added by a previous run are dropped first so the cell stays re-runnable
# (a second plain merge would create _x / _y duplicates).
df_StoreN = df_StoreN.drop(
    columns=[*df_prices.columns, *df_exercise.columns], errors="ignore"
)
df_StoreN = pd.merge(df_StoreN, df_prices, left_index=True, right_index=True, how='left')
df_StoreN = pd.merge(df_StoreN, df_exercise, left_index=True, right_index=True, how='left')

In [None]:
# Refit the baseline and predict the test rows; the cross-validation score is discarded here.
_, y_pred = get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
    X_data_test=df_StoreN,
)

In [None]:
# Pair each test item code with its prediction, then flatten to two plain columns.
submission = (
    pd.Series(y_pred, index=df_StoreN.index)
    .rename_axis('item_code')
    .reset_index(name='quantity_sold')
)

# Write the comma-separated submission file without the positional index.
submission.to_csv('submission.csv', sep=',', index=False)
submission.head()