In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

## Data Collection

### File sources

In [None]:
import requests
# Remote locations of the training archive and of the test store's CSV
train_datas_url = (
    'https://www.raphaelcousin.com/modules/data-science-practice/'
    'module4/exercise/module4_exercise_train.zip'
)
test_data_url = (
    'https://www.raphaelcousin.com/modules/data-science-practice/'
    'module4/exercise/Neighborhood_Market_data.csv'
)

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and save the response body to ``file_name``.

    Args:
        url: Address of the file to fetch with an HTTP GET request.
        file_name: Local path the downloaded bytes are written to
            (overwritten if it already exists).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both remote files into the working directory
for source_url, local_name in (
    (train_datas_url, 'module4_exercise_train.zip'),
    (test_data_url, 'Neighborhood_Market_data.csv'),
):
    download_file(source_url, local_name)

#### Ouvrir le fichier zippé

In [None]:
import zipfile

# Path to the downloaded ZIP archive
zip_path = 'module4_exercise_train.zip'

# Open the archive just long enough to collect the names of its members,
# then show them
with zipfile.ZipFile(zip_path, mode="r") as zip_ref:
    archive_members = zip_ref.namelist()
print("Fichiers dans le zip :", archive_members)

#### CityMart

In [None]:
# read "CityMart_data.csv"

# Resolve the member name and parse it inside a single open archive handle,
# rather than calling namelist() on the closed `zip_ref` left behind by an
# earlier cell.
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    # first entry of the archive
    csv_filename = zip_ref.namelist()[0]
    print(csv_filename)
    # read the CSV straight from the archive without extracting it
    with zip_ref.open(csv_filename) as f:
        df_CityMart = pd.read_csv(f)
print(df_CityMart.head())

#### Greenfield_Grocers

In [None]:
# read "Greenfield_Grocers_data.csv"

# Resolve the member name and parse it inside a single open archive handle,
# rather than calling namelist() on the closed `zip_ref` left behind by an
# earlier cell.
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    # second entry of the archive
    csv_filename2 = zip_ref.namelist()[1]
    print(csv_filename2)
    # read the CSV straight from the archive without extracting it;
    # the file is pipe-separated and its column names sit on row index 3
    with zip_ref.open(csv_filename2) as f:
        df_Greenfield = pd.read_csv(f, sep="|", header=3)
print(df_Greenfield.head())
print(df_Greenfield.shape)

# Drop the last two columns
df_Greenfield = df_Greenfield.drop(['Unnamed: 12','1'], axis=1)
print(df_Greenfield.shape)

# Lower-case the column names
df_Greenfield.columns = df_Greenfield.columns.str.lower()
print(df_Greenfield.head())

#### SuperSaver_Outlet

In [None]:
# read "SuperSaver_Outlet_data.xlsx"

# Resolve the member name and parse it inside a single open archive handle,
# rather than calling namelist() on the closed `zip_ref` left behind by an
# earlier cell.
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    # fourth entry of the archive (the xlsx workbook)
    xlsx_filename = zip_ref.namelist()[3]
    print(xlsx_filename)
    # read the workbook straight from the archive without extracting it
    with zip_ref.open(xlsx_filename) as f:
        df_SuperSaver = pd.read_excel(f)
print(df_SuperSaver.head())

#### HighStreet_Bazaar

In [None]:
# read 'HighStreet_Bazaar_data.json'

# Resolve the member name and parse it inside a single open archive handle,
# rather than calling namelist() on the closed `zip_ref` left behind by an
# earlier cell.
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    # third entry of the archive (the JSON file)
    json_filename = zip_ref.namelist()[2]
    print(json_filename)
    # read the JSON straight from the archive without extracting it
    with zip_ref.open(json_filename) as f:
        df_HighStreet = pd.read_json(f)
print(df_HighStreet.head())

# Convert the timestamps to calendar dates
print(df_HighStreet['last_modified'].dtype)
# unit='ms': the raw values are interpreted as milliseconds since the epoch
df_HighStreet['last_modified'] = pd.to_datetime(df_HighStreet['last_modified'], unit='ms').dt.date
print(df_HighStreet.head())

#### Concaténation des fichiers

In [None]:
# Stack the four store tables on top of each other, each keyed by item code
dfs = [
    store_frame.set_index("item_code")
    for store_frame in (df_CityMart, df_Greenfield, df_HighStreet, df_SuperSaver)
]
df_vertical = pd.concat(dfs, axis=0)

print(df_vertical.head())
print(df_vertical.shape)
print(len(df_vertical))
# Row count of the inputs, to compare against the stacked length printed above
print(sum(len(store_frame) for store_frame in dfs))

#### Aggregate

#### Simple baseline

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):
    """Cross-validate a simple model on ``data`` and optionally predict a test set.

    Args:
        data: DataFrame holding the features and the target column.
        fillna_value: Value (or per-column mapping/Series) passed to
            ``DataFrame.fillna`` for both ``data`` and ``X_data_test``.
        drop_cols: Optional list of columns removed from ``data`` before fitting.
        k_fold: Number of shuffled K-fold splits (seeded with 42).
        scaler: ``'standard'`` or ``'minmax'``; any other value disables scaling.
        model: ``'linear'``, ``'logistic'`` or ``'random_forest'``.
        metric: ``'mae'`` or ``'accuracy'`` (predictions are rounded for accuracy).
        target_col: Name of the column to predict. Required.
        X_data_test: Optional DataFrame to predict after cross-validation. It is
            reduced and reordered to the training feature columns, so it may
            still contain ``drop_cols`` or other extra columns.

    Returns:
        The mean cross-validation score, or a tuple ``(mean score, predictions)``
        when ``X_data_test`` is given.

    Raises:
        ValueError: If ``target_col`` is missing, ``model`` or ``metric`` is not
            supported, or ``X_data_test`` lacks a training feature column.
    """
    # Validate the options up front so a typo fails before any fitting happens.
    if target_col is None:
        raise ValueError("target_col must be provided")
    if metric not in ('mae', 'accuracy'):
        raise ValueError("Unsupported metric")
    if model not in ('linear', 'logistic', 'random_forest'):
        raise ValueError("Unsupported model type")

    # Keep the scaler as a class: a fresh instance is fitted for every fold.
    if scaler == 'standard':
        scaler_cls = StandardScaler
    elif scaler == 'minmax':
        scaler_cls = MinMaxScaler
    else:
        scaler_cls = None

    if model == 'linear':
        estimator = LinearRegression()
    elif model == 'logistic':
        estimator = LogisticRegression()
    else:
        # Seeded so repeated runs give the same score, like the K-fold split.
        estimator = RandomForestClassifier(random_state=42)

    # Work on new frames so the caller's data is never modified.
    data = data.fillna(fillna_value)
    if drop_cols:
        data = data.drop(columns=drop_cols)

    # Split data into features (X) and target (y)
    y = data[target_col]
    X = data.drop(columns=[target_col])

    if X_data_test is not None:
        X_data_test = X_data_test.fillna(fillna_value)
        missing = [col for col in X.columns if col not in X_data_test.columns]
        if missing:
            raise ValueError(f"X_data_test is missing feature columns: {missing}")
        # Same columns, same order as the training features.
        X_data_test = X_data_test[list(X.columns)]

    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    for train_index, val_index in kf.split(X):
        # Positional indexing works whether or not scaling is enabled.
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        if scaler_cls is not None:
            # Fit the scaler on the training fold only, so the validation
            # fold does not leak into the preprocessing statistics.
            fold_scaler = scaler_cls()
            X_train = fold_scaler.fit_transform(X_train)
            X_val = fold_scaler.transform(X_val)

        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_val)

        if metric == 'mae':
            score = mean_absolute_error(y_val, y_pred)
        else:
            score = accuracy_score(y_val, np.round(y_pred))
        scores.append(score)

    mean_score = np.mean(scores)
    if X_data_test is None:
        return mean_score

    # Final model: refit on every training row, then predict the test set.
    X_full = X
    if scaler_cls is not None:
        final_scaler = scaler_cls()
        X_full = final_scaler.fit_transform(X)
        X_data_test = final_scaler.transform(X_data_test)
    estimator.fit(X_full, y)
    return mean_score, estimator.predict(X_data_test)

In [None]:
# Baseline score using the stacked store files only
get_simple_baseline(
    df_vertical,
    fillna_value=-1,
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

### API sources

In [None]:
import requests

def get_api(endpoint_url, timeout=30):
    """Call an API endpoint and return the ``data`` field of its JSON reply.

    On success the reply's ``message`` field is printed. Failures are reported
    with a printed message and signalled by returning ``None``.

    Args:
        endpoint_url: Full URL queried with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under ``"data"`` in the JSON reply, or ``None`` when the
        request fails, the status code is not 200, or the reply is malformed.
    """
    try:
        response = requests.get(endpoint_url, timeout=timeout)
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    if response.status_code != 200:
        # The URL is left out of the message on purpose: it may embed the password.
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

    try:
        data = response.json()
        print(data["message"])
        return data['data']
    except (ValueError, KeyError, TypeError) as e:
        # Body is not JSON, or does not have the expected message/data fields.
        print(f"An error occurred: {e}")
        return None
        
# get_api returns None on failure, so stop with a clear message instead of
# failing later on an attempt to subscript None.
auth_payload = get_api("https://www.raphaelcousin.com/api/exercise/auth")
if auth_payload is None:
    raise RuntimeError("Could not retrieve the password from the auth endpoint")
password = auth_payload["password"]
print(password)

# The password is part of the path of the prices endpoint.
prices = get_api(f"https://www.raphaelcousin.com/api/exercise/{password}/prices")
if prices is None:
    raise RuntimeError("Could not retrieve the prices from the API")
print(len(prices))
print(type(prices))

In [None]:
# One-column price table: the keys of the API payload become the index
df_prices = pd.DataFrame.from_dict(
    prices,
    orient="index",
    columns=["price"],
)
print(len(df_prices))
print(df_prices.head())

#### Aggregate

In [None]:
# Left-join the prices onto the stacked store rows, matching on the index
data = df_vertical.merge(
    df_prices,
    how='left',
    left_index=True,
    right_index=True,
)
print(data.shape)
print(data.head())

In [None]:
# Baseline score after adding the price feature
get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

### Scraping sources

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up the Selenium WebDriver (e.g., Chrome)
driver = webdriver.Chrome()  # Make sure ChromeDriver is installed
# driver = webdriver.Firefox()
# driver = webdriver.Edge()
# driver = webdriver.Safari()

url = 'https://www.raphaelcousin.com/module4/scrapable-data'
try:
    # Open the URL
    driver.get(url)

    # Wait for the page to fully load (increase time if needed)
    time.sleep(5)

    # Get the fully rendered page source
    html = driver.page_source
finally:
    # Always close the browser, even if loading the page failed
    driver.quit()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find both tables
tables = soup.find_all('table')
if len(tables) < 2:
    raise RuntimeError(
        f"Expected at least 2 tables on {url}, found {len(tables)}; "
        "the page may not have finished rendering"
    )

# Scrape the second table (Exercise Data)
course_table = tables[1]
table_body = course_table.find('tbody')
if table_body is None:
    # No explicit <tbody>: fall back to every row of the table
    table_body = course_table

exercise_data = []
for row in table_body.find_all('tr'):
    cols = row.find_all('td')
    if len(cols) < 3:
        # Header or malformed row without the three expected cells
        continue
    exercise_data.append({
        'Item Code': cols[0].text.strip(),
        'Customer Score': cols[1].text.strip(),
        'Total Reviews': cols[2].text.strip(),
    })

# Convert the scraped rows to a DataFrame
df_exercise = pd.DataFrame(
    exercise_data,
    columns=['Item Code', 'Customer Score', 'Total Reviews'],
)

# Scraped cells are text; store the two measures as numbers so that numeric
# operations (e.g. median imputation) apply to them. Unparseable cells become NaN.
for numeric_col in ('Customer Score', 'Total Reviews'):
    df_exercise[numeric_col] = pd.to_numeric(df_exercise[numeric_col], errors='coerce')
print(df_exercise.head())

# Lower-case the column names
df_exercise.columns = df_exercise.columns.str.lower()
# Use the item code as the index
df_exercise = df_exercise.set_index("item code")
print(df_exercise.head())


#### Aggregate

In [None]:
# Rebuild `data` from the stacked store rows instead of merging `data` into
# itself: re-running this cell then gives the same columns every time
# (a self-merge would add duplicated `_x` / `_y` review columns on each run).
data = (
    df_vertical
    .merge(df_prices, left_index=True, right_index=True, how='left')
    .merge(df_exercise, left_index=True, right_index=True, how='left')
)
print(data.shape)
print(data.head())

In [None]:
# Baseline score after adding the scraped review features
get_simple_baseline(
    data,
    fillna_value=-1,
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

## Data Pre-Processing

In [None]:
# Report how many rows contain at least one missing value
null = data.isnull().values
if null.any():
    rows_with_missing = data.isnull().any(axis=1)
    count = len(data[rows_with_missing])
    print(f"Number of rows with at least one missing value: {count}")
    proportion = count / len(data) * 100
    print(f"Proportion of rows with missing values in the dataset: {np.round(proportion,2)}")

# Replace missing values of the numeric columns with each column's median
column_medians = data.median(numeric_only=True)
data2 = data.fillna(column_medians)


In [None]:
# Baseline score on the median-imputed table (remaining gaps filled with -1)
get_simple_baseline(
    data2,
    fillna_value=-1,
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

In [None]:
# Change how missing values are handled: fill with the numeric column medians
get_simple_baseline(
    data,
    fillna_value=data.median(numeric_only=True),
    drop_cols=['store_name', 'last_modified'],
    k_fold=5,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

In [None]:
# Reduce the number of folds (5 -> 3)
get_simple_baseline(
    data,
    fillna_value=data.median(numeric_only=True),
    drop_cols=['store_name', 'last_modified'],
    k_fold=3,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
)

### Generating Submission File

In [None]:
# Load the Neighborhood Market file (the test set), indexed by item code
df_StoreN = pd.read_csv(
    "Neighborhood_Market_data.csv",
    sep=",",
    index_col='item_code',
)
df_StoreN

In [None]:
# Rebuild the test features from the CSV instead of merging `df_StoreN` into
# itself: re-running this cell then gives the same columns every time
# (a self-merge would add duplicated `_x` / `_y` columns on each run).
df_StoreN = (
    pd.read_csv("Neighborhood_Market_data.csv", sep=",", index_col='item_code')
    .merge(df_prices, left_index=True, right_index=True, how='left')
    .merge(df_exercise, left_index=True, right_index=True, how='left')
)

In [None]:
# Cross-validate on the training data, then predict the test store's rows
m, y_pred = get_simple_baseline(
    data,
    fillna_value=data.median(numeric_only=True),
    drop_cols=['store_name', 'last_modified'],
    k_fold=3,
    scaler='standard',
    model='linear',
    metric='mae',
    target_col='quantity_sold',
    X_data_test=df_StoreN,
)

In [None]:
# Show the score stored in `m` by the cell above
print(f"mae: {m}")

In [None]:
# Pair every test item code with its predicted quantity
submission = pd.DataFrame(
    {
        'item_code': df_StoreN.index,
        'quantity_sold': y_pred,
    }
)

# Write the submission file without the positional index, then preview it
submission.to_csv('submission.csv', sep=',', index=False)
submission.head()