In [None]:
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

In [None]:
# URLs of the files: a zip archive for training and a single CSV for the test store
train_datas_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        url: Address of the resource to fetch.
        file_name: Local path to write to; an existing file is overwritten.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests.get`` can block forever on an unresponsive host.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    # The file is only opened once the download succeeded, so a failed request
    # never leaves an empty or partial file behind.
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_datas_url, 'module4_exercise_train.zip')
download_file(test_data_url, 'Neighborhood_Market_data.csv')

# Unpack the training archive so the per-store files read in the next cells
# exist after a fresh "Restart & Run All".
# NOTE(review): assumes the store files sit at the root of the archive — confirm.
with zipfile.ZipFile('module4_exercise_train.zip') as archive:
    archive.extractall()

## 1) Load and combine data from multiple sources (files, API, web scraping).

#### CityMart_data

In [None]:
# Load the CityMart CSV, using the item_code column as the row index
df_CityMart_data =  pd.read_csv("CityMart_data.csv", sep=",", index_col='item_code')
df_CityMart_data

#### Greenfield_Grocers_data.csv

In [None]:
# Read the pipe-delimited export, skipping the first two lines of the file
greenfield_raw = pd.read_csv("Greenfield_Grocers_data.csv", sep='|', skiprows=2)

# The usable column names sit in the first parsed row: promote them to the
# header (as stripped strings) and keep the remaining rows as data.
# .copy() detaches the result from greenfield_raw, so the operations below
# never act on a slice (this is what triggered SettingWithCopyWarning).
df_Greenfield_Grocers_data = greenfield_raw.iloc[1:].copy()
df_Greenfield_Grocers_data.columns = pd.Index(greenfield_raw.iloc[0]).astype(str).str.strip()

# Drop the junk columns named '1.0', 'nan' or 'Unnamed...'
cols_to_drop = [
    col for col in df_Greenfield_Grocers_data.columns
    if col in ('1.0', 'nan') or col.startswith('Unnamed')
]

# Index by item code, with the index named like in the other store frames
df_Greenfield_Grocers_data = (
    df_Greenfield_Grocers_data
    .drop(columns=cols_to_drop)
    .set_index('ITEM_CODE')
    .rename_axis('item_code')
)

# Lower-case the column names
df_Greenfield_Grocers_data.columns = [col.lower() for col in df_Greenfield_Grocers_data.columns]

df_Greenfield_Grocers_data

#### Neighborhood_Market_data

In [None]:
# Load the Neighborhood Market CSV, using the item_code column as the row index
df_Neighborhood_Market_data =  pd.read_csv("Neighborhood_Market_data.csv", sep=",", index_col = 'item_code')
df_Neighborhood_Market_data

#### HighStreet_Bazaar_data.json

In [None]:

# Load the HighStreet Bazaar JSON records and index them by item_code
df_HighStreet_Bazaar_data = (
    pd.read_json('HighStreet_Bazaar_data.json', orient='records')
    .set_index('item_code')
)
df_HighStreet_Bazaar_data

#### SuperSaver_Outlet_data.xlsx

In [None]:
# Read the Excel workbook. With sheet_name=None, read_excel loads every sheet
# and returns a dict mapping sheet name -> DataFrame (not a single DataFrame).
df_SuperSaver_Outlet_data_b =  pd.read_excel("SuperSaver_Outlet_data.xlsx", sheet_name=None)
df_SuperSaver_Outlet_data_b

In [None]:
# Pick the two sheets of the workbook out of the dict by sheet name
df_sheet1 = df_SuperSaver_Outlet_data_b["Quantity"]
df_sheet2 = df_SuperSaver_Outlet_data_b["Info"]

In [None]:
# Preview the first 10 rows of the "Quantity" sheet
df_sheet1.head(10)

In [None]:
# Preview the first 10 rows of the "Info" sheet
df_sheet2.head(10)

In [None]:
# Replace the sheet's original headers with explicit column names.
# (The former intermediate `range(...)` assignment was overwritten right away
# and has been removed.) The assignment raises ValueError if the sheet does not
# have exactly ten columns, which flags an unexpected layout early.
df_sheet2.columns = [
    "item_code",
    "store_name",
    "mass",
    "dimension_length",
    "dimension_width",
    "dimension_height",
    "days_since_last_purchase",
    "package_volume",
    "stock_age",
    "supp",  # NOTE(review): presumably a surplus column; a later cell drops it — confirm
]


In [None]:
# Check the first 10 rows of df_sheet2 again
df_sheet2.head(10)

In [None]:
# Drop the "supp" column in place; errors="ignore" means re-running this cell
# after the column is already gone does not raise
df_sheet2.drop(columns=["supp"], errors="ignore", inplace=True)

In [None]:
# Check the first 10 rows once more after dropping "supp"
df_sheet2.head(10)

In [None]:

# Combine the "Info" and "Quantity" sheets. No `on=` is given, so pandas joins
# on every column name the two frames share, with the default inner join.
# NOTE(review): presumably the only shared key is item_code — confirm, and
# consider passing on= explicitly.
df_SuperSaver_Outlet_data = pd.merge(df_sheet2, df_sheet1)


In [None]:
# Preview the first 10 rows of the merged SuperSaver frame
df_SuperSaver_Outlet_data.head(10)

In [None]:
# Move item_code into the index. This is done in place, so re-running the cell
# raises KeyError because the item_code column no longer exists.
df_SuperSaver_Outlet_data.set_index('item_code', inplace=True)

#### Aggregate

In [None]:
# Stack the training stores row-wise (same index name, columns aligned by name).
# df_Neighborhood_Market_data is deliberately NOT included: it is the file
# downloaded from test_data_url and is used further down as the prediction
# set, so concatenating it here would put test rows into the training data.
data = pd.concat(
    [
        df_CityMart_data,
        df_Greenfield_Grocers_data,
        df_HighStreet_Bazaar_data,
        df_SuperSaver_Outlet_data,
    ],
    axis=0,
)
data

#### API source

In [None]:
# Fetch the output of an endpoint (the endpoints return dictionaries)
def get_api(endpoint_url, timeout=10):
    """Return the ``data`` entry of a JSON endpoint, or ``None`` on failure.

    The endpoint is expected to answer with a JSON object that has a
    ``message`` entry (printed) and a ``data`` entry (returned).

    Args:
        endpoint_url: Full URL of the endpoint to query.
        timeout: Seconds to wait for the server; prevents the cell from
            hanging forever on an unresponsive host.

    Returns:
        The value stored under ``data``. ``None`` when the request fails, the
        status code is not 200, or the body is not the expected JSON object;
        a diagnostic line is printed in each of those cases.
    """
    try:
        # Make the GET request to the mock API
        response = requests.get(endpoint_url, timeout=timeout)
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data from {endpoint_url}. Status code: {response.status_code}")
        return None

    try:
        payload = response.json()
        print(payload["message"])
        return payload["data"]
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected structure
        print(f"An error occurred: {e}")
        return None
# The password returned by the auth endpoint is part of the prices URL.
# get_api returns None on failure, so check before subscripting.
auth = get_api("https://www.raphaelcousin.com/api/exercise/auth")
if not isinstance(auth, dict) or "password" not in auth:
    raise RuntimeError("Authentication failed: the auth endpoint returned no password")
password = auth["password"]  # intentionally not printed: keep secrets out of cell output

prices = get_api(f"https://www.raphaelcousin.com/api/exercise/{password}/prices")
if prices is None:
    raise RuntimeError("Could not retrieve the prices from the API")

In [None]:
# Convert the prices dictionary into a DataFrame.
# orient='index': the dictionary keys become the DataFrame index.
# columns=['unit_cost']: name of the column that holds the dictionary values.

df_prices = pd.DataFrame.from_dict(prices, orient='index', columns=['unit_cost'])

In [None]:
# Preview the first 10 rows of the prices frame
df_prices.head(10)

#### Aggregate

In [None]:
# Left-join the API prices onto the store rows by matching index labels;
# rows without a matching price keep NaN in unit_cost
data = data.merge(df_prices, how='left', left_index=True, right_index=True)
data

#### Scraping sources

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Page to scrape
url = 'https://www.raphaelcousin.com/module4/scrapable-data'

# Start a browser session (Chrome here; make sure ChromeDriver is installed).
# Alternatives: webdriver.Firefox(), webdriver.Edge(), webdriver.Safari()
driver = webdriver.Chrome()
try:
    driver.get(url)

    # Wait for the page to fully load (increase time if needed)
    time.sleep(5)

    # Get the fully rendered page source
    html = driver.page_source
finally:
    # Always close the browser, even when loading the page raises
    driver.quit()

# Parse the HTML and collect every <table>; they are addressed by position (0, 1, 2, ...)
soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all('table')
if len(tables) < 2:
    raise RuntimeError(f"Expected at least 2 tables on {url}, found {len(tables)}")

# The exercise data is the SECOND table of the page (index 1)
course_table = tables[1]

# One record per table row, keeping the first three cells.
# NOTE(review): a fourth cell (presumably an update timestamp) is ignored — confirm.
exercise_data = []
for row in course_table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    exercise_data.append({
        # strip() so stray whitespace in the cell cannot break the later join on item_code
        'item_code': cols[0].text.strip(),
        'customer_score': int(cols[1].text),
        'total_reviews': int(cols[2].text),
    })

# Convert the list of records to a pandas DataFrame
df_course_exo = pd.DataFrame(exercise_data)
df_course_exo

In [None]:
# Move item_code into the index so the frame can be joined on it
# (in place: re-running this cell raises KeyError)
df_course_exo.set_index('item_code', inplace=True)

#### Aggregate

In [None]:
# Left-join the scraped review columns onto the store rows by index label
data = data.merge(df_course_exo, how='left', left_index=True, right_index=True)
data

## 2) Perform exploratory data analysis (EDA) on the combined dataset.

### Compréhension des données

In [None]:
# Dataset overview: (number of rows, number of columns)
data.shape

In [None]:
# Show the first 10 rows
data.head(10)

In [None]:
# Show the last 10 rows
data.tail(10)

In [None]:
# Show 10 randomly sampled rows (no seed is set, so the output changes on each run)
data.sample(10)

In [None]:
# General information about the data
data.info()

In [None]:
# Show the dtype of each column
data.dtypes

In [None]:
# Partial cleaning of a store DataFrame
def dataClean(df):
    """Return a partially cleaned copy of ``df``.

    Steps, in order:
      1. Convert the known numeric columns that are present to numbers;
         values that cannot be parsed become NaN.
      2. Fill the NaN of every numeric column with that column's mean
         (computed on ``df`` itself, after step 1).
      3. Drop the ``last_modified`` and ``store_name`` columns if present.

    The input frame is not modified: the function works on a copy, so a frame
    displayed in an earlier cell stays valid and the call can be repeated.

    NOTE(review): ``quantity_sold`` is in the numeric list, so missing target
    values are mean-imputed as well — confirm this is intended.

    Args:
        df: DataFrame to clean.

    Returns:
        A new, cleaned DataFrame.
    """
    # Candidate numeric columns
    num_cols = [
        'mass', 'dimension_length', 'dimension_width', 'dimension_height',
        'days_since_last_purchase', 'package_volume', 'stock_age',
        'quantity_sold', 'unit_cost', 'customer_score', 'total_reviews'
    ]

    df = df.copy()

    # Only the candidates that actually exist in this frame
    existing_cols = [col for col in num_cols if col in df.columns]

    # Convert to numeric; errors='coerce' turns unparseable values into NaN
    if existing_cols:
        df[existing_cols] = df[existing_cols].apply(pd.to_numeric, errors='coerce')

    # Replace NaN by the column mean (only numeric columns have a mean)
    df = df.fillna(df.mean(numeric_only=True))

    # Drop the columns that are not needed, if they exist
    df = df.drop(columns=["last_modified", "store_name"], errors="ignore")

    return df



In [None]:
# Apply the partial cleaning to the combined data
data = dataClean(data)

In [None]:
# Check the column dtypes again after cleaning
data.dtypes

In [None]:
# Summary statistics of the variables
data.describe()

## Data preparation

In [None]:
# Number of missing values per column
data.isnull().sum()

In [None]:
# Show the rows that contain at least one missing value
data[data.isnull().any(axis=1)]

In [None]:
# Number of duplicated rows
data.duplicated().sum()

In [None]:
# Show the duplicated rows: keep=False marks every copy, and sorting on all
# columns places identical rows next to each other
data[data.duplicated(keep=False)].sort_values(by=data.columns.tolist())

## Comprendre les features univariés

In [None]:
# Histogram of every numeric feature
data.hist(figsize=(10,8))
plt.tight_layout()
plt.show()

In [None]:
# Outlier detection: one box plot per numeric column

print("\nBox plots to check for outliers:")

# Select the numeric columns automatically
numeric_cols = data.select_dtypes(include=['number']).columns.tolist()

# Set up the plot grid
n_cols = 2
n_rows = int(np.ceil(len(numeric_cols) / n_cols))

if numeric_cols:
    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # row; without it, one or two numeric columns made axes[row, col] fail.
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(8, n_rows*6), squeeze=False)
    flat_axes = axes.flatten()

    for ax, col in zip(flat_axes, numeric_cols):
        sns.boxplot(x=data[col], ax=ax)
        ax.set_title(f'Box plot of {col}')

    # Remove the unused subplot when the number of columns is odd
    for ax in flat_axes[len(numeric_cols):]:
        fig.delaxes(ax)

    # Lay out after drawing so titles and labels are taken into account
    fig.tight_layout(pad=4.0)
    plt.show()
else:
    print("No numeric columns to plot.")

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
print("\nCorrelation matrix:")
corr_matrix = data.corr(numeric_only=True)

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=corr_ax)
plt.show()

### Simple baseline

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def get_simple_baseline(data, fillna_value=-1, drop_cols=None, k_fold=5, scaler='standard', model='linear', metric='mae', target_col=None, X_data_test=None):
    """Cross-validate a simple model and optionally predict on a test frame.

    Args:
        data: Training DataFrame containing the features and ``target_col``.
        fillna_value: Value substituted for any remaining missing value.
        drop_cols: Columns removed from ``data`` (and from ``X_data_test``).
        k_fold: Number of shuffled K-fold splits (random_state=42).
        scaler: 'standard', 'minmax', or anything else for no scaling.
        model: 'linear', 'logistic' or 'random_forest'.
        metric: 'mae' or 'accuracy' (predictions are rounded for accuracy).
        target_col: Name of the target column; required.
        X_data_test: Optional DataFrame to predict on. It must contain every
            training feature column; extra columns and column order are ignored.

    Returns:
        The mean cross-validation score, or the tuple
        ``(mean score, predictions for X_data_test)`` when a test frame is given.

    Raises:
        ValueError: If ``target_col`` is missing, ``model`` or ``metric`` is
            unsupported, or ``X_data_test`` lacks a training feature.

    NOTE(review): the scaler is fitted on all rows before the folds are split,
    so fold scores are computed with statistics from the held-out rows.
    """
    # Validate the arguments up front so mistakes fail fast with a clear message
    if target_col is None:
        raise ValueError("target_col must be provided")
    if model not in ('linear', 'logistic', 'random_forest'):
        raise ValueError("Unsupported model type")
    if metric not in ('mae', 'accuracy'):
        raise ValueError("Unsupported metric")

    # Handle missing values (fillna returns new frames; the inputs are untouched)
    data = data.fillna(fillna_value)
    if X_data_test is not None:
        X_data_test = X_data_test.fillna(fillna_value)

    # Drop unwanted columns
    if drop_cols:
        data = data.drop(drop_cols, axis=1)
        if X_data_test is not None:
            X_data_test = X_data_test.drop(drop_cols, axis=1)

    # Split data into features (X) and target (y)
    y = data[target_col]
    X = data.drop(target_col, axis=1)

    # Give the test frame exactly the training features, in the same order
    if X_data_test is not None:
        missing = [col for col in X.columns if col not in X_data_test.columns]
        if missing:
            raise ValueError(f"X_data_test is missing feature columns: {missing}")
        X_data_test = X_data_test[X.columns]

    # Feature scaling
    if scaler == 'standard':
        scaler_obj = StandardScaler()
    elif scaler == 'minmax':
        scaler_obj = MinMaxScaler()
    else:
        scaler_obj = None

    # Work on plain arrays from here on. Without a scaler X used to stay a
    # DataFrame, and X[train_index] then selected columns instead of rows.
    if scaler_obj is not None:
        X_values = scaler_obj.fit_transform(X)
        test_values = scaler_obj.transform(X_data_test) if X_data_test is not None else None
    else:
        X_values = X.to_numpy()
        test_values = X_data_test.to_numpy() if X_data_test is not None else None

    # Initialize the model
    if model == 'linear':
        estimator = LinearRegression()
    elif model == 'logistic':
        estimator = LogisticRegression()
    else:
        estimator = RandomForestClassifier()

    # Initialize cross-validation
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    scores = []

    # Train and evaluate using k-fold cross-validation
    for train_index, test_index in kf.split(X_values):
        X_train, X_test = X_values[train_index], X_values[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)

        if metric == 'mae':
            score = mean_absolute_error(y_test, y_pred)
        else:
            score = accuracy_score(y_test, np.round(y_pred))

        scores.append(score)

    if test_values is not None:
        # Refit on all training rows before predicting the test frame
        estimator.fit(X_values, y)
        return np.mean(scores), estimator.predict(test_values)

    # Return the average score
    return np.mean(scores)

In [None]:
# Model training: list of feature columns that will not be given to the model
# (inspect data.columns to see what is available)
supp =['mass', 'dimension_length', 'dimension_width', 'dimension_height','days_since_last_purchase', 'stock_age']
data.columns

In [None]:
# Mean 5-fold cross-validated MAE of the linear baseline, without the columns in `supp`
get_simple_baseline(data, drop_cols=supp, k_fold=5, scaler='standard', model='linear', metric='mae', target_col='quantity_sold')

# Get test data and Predict baseline

In [None]:
# Load the test store CSV, using the item_code column as the row index
df_StoreE =  pd.read_csv("Neighborhood_Market_data.csv", sep=",", index_col='item_code')
df_StoreE

In [None]:
# Add the columns of df_prices and df_course_exo to the test rows
# (left joins on the index, so every test row is kept)
df_StoreE = pd.merge(df_StoreE, df_prices, left_index=True, right_index=True, how='left')
df_StoreE = pd.merge(df_StoreE, df_course_exo, left_index=True, right_index=True, how='left')
df_StoreE

In [None]:
# Apply the same partial cleaning to the test rows
df_StoreE = dataClean(df_StoreE)

In [None]:
# Fit the linear baseline on `data` and predict quantity_sold for the test rows.
# The first returned value (mean cross-validation score) is discarded.
_, y_pred = get_simple_baseline(data,
                    fillna_value=-1,
                    drop_cols=supp,
                    k_fold=5,
                    scaler='standard',
                    model='linear',
                    metric='mae',
                    target_col='quantity_sold',
                    X_data_test=df_StoreE)

In [None]:
# Inspect the predictions for the test rows
y_pred

In [None]:
# Build the submission table: one row per test item with its predicted quantity
submission = pd.DataFrame({
    'item_code': df_StoreE.index,
    'quantity_sold': y_pred # your_prediction
})

# Write it as a comma-separated file without the positional index, then preview it
submission.to_csv('submission.csv', index=False, sep=',')
submission.head()