In [None]:
!pip install selenium -q
!apt-get update -q
!apt-get install -y chromium-browser chromium-chromedriver -q

In [None]:
import pandas as pd
import zipfile
import os
import json
import requests
import time
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup



# Both exercise files are served from the same course directory.
_exercise_base_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise'

# Zip archive with the per-store training files.
train_datas_url = _exercise_base_url + '/module4_exercise_train.zip'
# CSV for the store whose sales are to be predicted.
test_data_url = _exercise_base_url + '/Neighborhood_Market_data.csv'

def download_file(url, file_name, timeout=30):
    """Download ``url`` to ``file_name`` unless that file already exists.

    Args:
        url: Address to fetch with an HTTP GET.
        file_name: Destination path on disk.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    if os.path.exists(file_name):
        print(f'{file_name} already exists, skipping download.')
        return

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Write to a sibling temporary file and rename it afterwards: if the write
    # is interrupted, no truncated file is left under the final name for the
    # existence check above to mistake for a completed download.
    partial_name = f'{file_name}.part'
    try:
        with open(partial_name, 'wb') as file:
            file.write(response.content)
        os.replace(partial_name, file_name)
    finally:
        if os.path.exists(partial_name):
            os.remove(partial_name)

    print(f'Downloaded {file_name} from {url}')


# Fetch the training archive and the test CSV (skipped when already on disk).
for source_url, local_name in (
    (train_datas_url, 'module4_exercise_train.zip'),
    (test_data_url, 'Neighborhood_Market_data.csv'),
):
    download_file(source_url, local_name)

# Unpack the archive only when the target directory is still empty.
train_dir = "module4_exercise_train"
os.makedirs(train_dir, exist_ok=True)

if os.listdir(train_dir):
    print("Les fichiers sont déjà extraits.")
else:
    with zipfile.ZipFile("module4_exercise_train.zip", 'r') as zip_ref:
        zip_ref.extractall(train_dir)
    print("Extraction terminée.")

# One frame per training store; each store file uses a different format.
df_citymart = pd.read_csv(f"{train_dir}/CityMart_data.csv")
df_greenfield = pd.read_csv(f"{train_dir}/Greenfield_Grocers_data.csv")

with open(f"{train_dir}/HighStreet_Bazaar_data.json", "r") as f:
    data_highstreet = json.load(f)
df_highstreet = pd.DataFrame(data_highstreet)

df_supersaver = pd.read_excel(f"{train_dir}/SuperSaver_Outlet_data.xlsx")

# Test store, read from the CSV downloaded next to the notebook.
df_neighborhood = pd.read_csv("Neighborhood_Market_data.csv")



In [None]:

# Lower-case every column label so all store frames share the same spelling.
for store_frame in (df_citymart, df_greenfield, df_highstreet, df_supersaver, df_neighborhood):
    store_frame.columns = [label.lower() for label in store_frame.columns]

In [None]:

# The API password can be supplied through the environment; the literal is
# kept as a fallback so the notebook still runs unchanged.
password = os.environ.get("MODULE4_API_PASSWORD", "RcUZjhdsYLRzwi4")
api_url = f"https://www.raphaelcousin.com/api/exercise/{password}/prices"
# A timeout keeps a stalled connection from hanging the notebook.
response = requests.get(api_url, timeout=30)

# Fail here rather than only printing: later cells need `df_prices`, and on a
# non-200 answer it would otherwise be undefined (or stale from an earlier run).
# The URL is deliberately left out of the message because it embeds the password.
if response.status_code != 200:
    raise RuntimeError(
        f" Erreur de récupération des prix. (HTTP {response.status_code})"
    )

data = response.json()
# The mapping is read from the "data" key when present, else from the payload itself.
prices_dict = data.get("data", data)
df_prices = pd.DataFrame(list(prices_dict.items()), columns=["item_code", "unit_cost"])

In [None]:

# Selenium and Chromium are installed by the first cell of the notebook, so
# they are not re-installed here.

# Headless Chrome; the sandbox and /dev/shm flags are disabled via arguments.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

url = "https://www.raphaelcousin.com/module4/scrapable-data"

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(url)
    # Fixed pause before reading the page source.
    time.sleep(3)
    html = driver.page_source
finally:
    # Always shut the browser down, even when loading the page fails,
    # so no Chrome process is left running.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")

# The data is read from the second <table> on the page.
tables = soup.find_all("table")
if len(tables) < 2:
    raise RuntimeError(
        f"Expected at least 2 tables at {url}, found {len(tables)}; "
        "the page may not have finished rendering."
    )
reviews_body = tables[1].find("tbody")
if reviews_body is None:
    raise RuntimeError(f"The second table at {url} has no <tbody> element.")

rows = []
for row in reviews_body.find_all("tr"):
    cols = [c.get_text(strip=True) for c in row.find_all("td")]
    # Skip rows without any <td> cell; they would otherwise become empty records.
    if cols:
        rows.append(cols)

df_reviews = pd.DataFrame(rows, columns=["item_code", "customer_score", "total_reviews", "timestamp"])
df_reviews = df_reviews.drop(columns=["timestamp"])
# Scraped cells are text; values that do not parse as numbers become NaN.
df_reviews["customer_score"] = pd.to_numeric(df_reviews["customer_score"], errors="coerce")
df_reviews["total_reviews"] = pd.to_numeric(df_reviews["total_reviews"], errors="coerce")

In [None]:

# Stack the four training stores into a single frame.
df_train = pd.concat([df_citymart, df_greenfield, df_highstreet, df_supersaver], ignore_index=True)

# Normalise the join key on every frame: cast to string and trim whitespace.
df_train["item_code"] = df_train["item_code"].astype(str).str.strip()
df_neighborhood["item_code"] = df_neighborhood["item_code"].astype(str).str.strip()
df_prices["item_code"] = df_prices["item_code"].astype(str).str.strip()
df_reviews["item_code"] = df_reviews["item_code"].astype(str).str.strip()


def _attach_item_features(frame, item_features):
    """Left-join per-item columns onto ``frame`` without changing its row count.

    Columns about to be added are dropped from ``frame`` first, so re-running
    this cell does not produce ``_x`` / ``_y`` suffixed duplicates. Duplicate
    ``item_code`` rows in ``item_features`` are reduced to their first
    occurrence so the left join cannot multiply rows of ``frame``.
    """
    feature_cols = [c for c in item_features.columns if c != "item_code"]
    base = frame.drop(columns=feature_cols, errors="ignore")
    unique_features = item_features.drop_duplicates(subset="item_code")
    return base.merge(unique_features, on="item_code", how="left", validate="many_to_one")


# Prices first, then reviews, for both the training and the test frame.
for item_features in (df_prices, df_reviews):
    df_train = _attach_item_features(df_train, item_features)
    df_neighborhood = _attach_item_features(df_neighborhood, item_features)

In [None]:

# Keep only rows with a known target; work on a copy so the fills below are safe.
df_train_all_stores = df_train.dropna(subset=["quantity_sold"]).copy()

# Numeric columns whose missing values are replaced by the column median.
cols_to_impute = [
    "mass",
    "dimension_length",
    "dimension_width",
    "dimension_height",
    "stock_age",
    "unit_cost",
    "customer_score",
    "total_reviews",
]

# NOTE(review): the medians are computed on all labelled rows before the folds
# are split, so each validation fold contributes to its own imputation values.
train_medians = {name: df_train_all_stores[name].median() for name in cols_to_impute}
for name, fill_value in train_medians.items():
    df_train_all_stores[name] = df_train_all_stores[name].fillna(fill_value)

# Volume feature: length x width x height.
df_train_all_stores["calc_volume"] = (
    df_train_all_stores["dimension_length"]
    .mul(df_train_all_stores["dimension_width"])
    .mul(df_train_all_stores["dimension_height"])
)

# One-hot encode the store; the first category is dropped.
df_train_encoded = pd.get_dummies(
    df_train_all_stores,
    columns=["store_name"],
    prefix="store",
    drop_first=True,
)

# Feature set: imputed numerics, the volume, and every column containing "store_".
store_columns = [label for label in df_train_encoded.columns if "store_" in label]
features_to_keep = [*cols_to_impute, "calc_volume", *store_columns]
X = df_train_encoded[features_to_keep]
y = df_train_encoded["quantity_sold"]

# 5-fold shuffled cross-validation of a plain linear regression, scored by MAE.
model = LinearRegression()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_scores = []

for fit_rows, eval_rows in kf.split(X):
    model.fit(X.iloc[fit_rows], y.iloc[fit_rows])
    fold_predictions = model.predict(X.iloc[eval_rows])
    mae_scores.append(mean_absolute_error(y.iloc[eval_rows], fold_predictions))

print("MAE moyen du modèle linéaire :", np.mean(mae_scores))

In [None]:


from sklearn.linear_model import LinearRegression


# Impute the test frame with the medians of the training frame.
df_neighborhood_imputed = df_neighborhood.copy()
for col in cols_to_impute:
    median_value = df_train_all_stores[col].median()
    df_neighborhood_imputed[col] = df_neighborhood_imputed[col].fillna(median_value)

# Same volume feature as on the training side.
df_neighborhood_imputed["calc_volume"] = (
    df_neighborhood_imputed["dimension_length"] *
    df_neighborhood_imputed["dimension_width"] *
    df_neighborhood_imputed["dimension_height"]
)

# Encode every store present in the test frame (no drop_first) and align to the
# training columns afterwards. With drop_first=True the dropped category is
# whichever sorts first in the *test* data, not the training baseline, so a test
# store matching a non-baseline training store would be encoded as all zeros.
df_neighborhood_encoded = pd.get_dummies(df_neighborhood_imputed, columns=["store_name"], prefix="store")

# Training store dummies absent from the test frame are added as all-zero columns.
for col in [c for c in df_train_encoded.columns if "store_" in c]:
    if col not in df_neighborhood_encoded.columns:
        df_neighborhood_encoded[col] = 0

# Selecting `features_to_keep` fixes the column order to match training and
# ignores dummy columns for stores the model never saw.
X_final = df_train_encoded[features_to_keep]
y_final = df_train_encoded["quantity_sold"]
X_test_final = df_neighborhood_encoded[features_to_keep]

# Refit on all labelled training rows.
final_model = LinearRegression()
final_model.fit(X_final, y_final)

y_pred = final_model.predict(X_test_final)
# One prediction per test row is required for the submission file.
assert len(y_pred) == len(df_neighborhood), "prediction count does not match the test frame"

# Predictions are rounded to whole units.
submission = pd.DataFrame({
    "item_code": df_neighborhood["item_code"],
    "quantity_sold": np.round(y_pred).astype(int)
})

submission.to_csv("submissionformodule4.csv", index=False)
print(" submissionformodule4.csv créé :", submission.shape)
submission.head()