# Analyse ML - Prédiction des Prix Immobiliers en Tunisie

## Objectif
Ce notebook implémente une analyse complète de Machine Learning pour prédire les prix immobiliers (location et vente) en Tunisie.

**Méthodologie (TP 4.1 + TP 4.2) :**
- Prétraitement des données (OneHotEncoder, StandardScaler)
- Entraînement de 6 modèles de régression
- Validation croisée (5-fold)
- Évaluation avec métriques multiples (R², MAE, MSE, RMSE, MAPE, MedAE)
- Analyse d'importance des features

## 1. Imports et Configuration

In [None]:
# Core imports: standard library first, then the third-party data/plotting stack.
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Ignore every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Display configuration: no limit on displayed columns, at most 100 rows.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

# Plot appearance.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("‚úÖ Imports r√©ussis!")

In [None]:
# Machine-learning imports (scikit-learn).
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Regression models.
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor

# Evaluation metrics.
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
)

print("‚úÖ Imports ML r√©ussis!")

## 2. Chargement et Exploration des Données

In [None]:
# Load the dataset from the CSV file (path is relative to the notebook).
df = pd.read_csv('../data/dataset_loyers_tunisie.csv')

# Report the size and the column names of what was loaded.
n_rows, n_cols = df.shape
print(f"üìä Dataset charg√©: {n_rows} lignes, {n_cols} colonnes")
print(f"\nColonnes: {list(df.columns)}")

# Last expression of the cell: the notebook renders the first rows.
df.head()

In [None]:
# General information about the dataset.
print("=== INFORMATIONS G√âN√âRALES ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

print("\n=== STATISTIQUES DESCRIPTIVES ===")
# Last expression of the cell: the notebook renders the summary table.
df.describe()

In [None]:
# Missing-value check: count nulls per column and list only affected columns.
print("=== VALEURS MANQUANTES ===")
missing = df.isnull().sum()
if missing.sum() != 0:
    print(missing[missing > 0])
else:
    print("‚úÖ Aucune valeur manquante!")

# Duplicate check: number of rows that repeat an earlier row exactly.
duplicates = df.duplicated().sum()
print("\n=== DOUBLONS ===")
print(f"Nombre de doublons: {duplicates}")

In [None]:
# Distribution of the 'Type' column: absolute counts, then percentages.
print("=== DISTRIBUTION DES TYPES ===")
print(df['Type'].value_counts())
print("\nPourcentage:")
print(df['Type'].value_counts(normalize=True) * 100)

# Visualisation: bar chart on the left, pie chart on the right,
# both drawn with the same two colours.
type_colors = ['#3498db', '#e74c3c']
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

df['Type'].value_counts().plot(kind='bar', ax=ax[0], color=type_colors)
ax[0].set_title('Distribution des Types de Transaction')
ax[0].set_xlabel('Type')
ax[0].set_ylabel('Nombre')

df['Type'].value_counts().plot(
    kind='pie',
    ax=ax[1],
    autopct='%1.1f%%',
    colors=type_colors,
)
ax[1].set_title('R√©partition Location vs Vente')
ax[1].set_ylabel('')

plt.tight_layout()
plt.show()

## 3. Analyse Exploratoire des Données (EDA)

In [None]:
# Histograms of the numeric variables, one panel per column on a 3x3 grid.
numerical_cols = [
    'Superficie', 'Nb_pieces', 'Standing', 'Etage',
    'Distance_centre', 'Loyer', 'Prix_vente',
]
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.ravel()

for panel, col in zip(axes, numerical_cols):
    # Columns absent from the dataset are skipped; their panel stays empty.
    if col not in df.columns:
        continue
    df[col].hist(bins=30, ax=panel, edgecolor='black', alpha=0.7)
    panel.set_title(f'Distribution de {col}')
    panel.set_xlabel(col)
    panel.set_ylabel('Fr√©quence')

# Hide the grid panels that have no column assigned to them.
for spare_panel in axes[len(numerical_cols):]:
    spare_panel.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Box plots to spot outliers, one panel per column on a 2x3 grid.
cols_to_plot = ['Superficie', 'Nb_pieces', 'Standing', 'Etage', 'Distance_centre', 'Loyer']
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.ravel()

for panel, col in zip(axes, cols_to_plot):
    # Columns absent from the dataset are skipped; their panel stays empty.
    if col not in df.columns:
        continue
    df.boxplot(column=col, ax=panel)
    panel.set_title(f'Boxplot - {col}')
    panel.set_ylabel(col)

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix, computed on the numeric columns only.
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

plt.figure(figsize=(12, 10))
heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'square': True,
    'linewidths': 0.5,
    'cbar_kws': {"shrink": 0.8},
}
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title(
    'Matrice de Corr√©lation des Variables Num√©riques',
    fontsize=16,
    fontweight='bold',
)
plt.tight_layout()
plt.show()

# Correlations of every numeric column with 'Loyer', strongest first.
print("\n=== CORR√âLATIONS AVEC LE PRIX (Loyer) ===")
if 'Loyer' in correlation_matrix.columns:
    correlations = correlation_matrix['Loyer'].sort_values(ascending=False)
    print(correlations)

In [None]:
# Per-city analysis: summary statistics grouped by 'Ville'.
print("=== STATISTIQUES PAR VILLE ===")
ville_aggregations = {
    'Loyer': ['mean', 'median', 'std', 'count'],
    'Superficie': 'mean',
    'Nb_pieces': 'mean',
}
ville_stats = df.groupby('Ville').agg(ville_aggregations).round(2)
print(ville_stats)

# Visualisation: mean rent per city (left), number of listings per city (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
price_panel, count_panel = axes

# Mean rent per city, highest first.
ville_prix = df.groupby('Ville')['Loyer'].mean().sort_values(ascending=False)
ville_prix.plot(kind='barh', ax=price_panel, color='steelblue')
price_panel.set_title('Prix Moyen par Ville')
price_panel.set_xlabel('Loyer Moyen (TND)')

# Number of listings per city.
df['Ville'].value_counts().plot(kind='bar', ax=count_panel, color='coral')
count_panel.set_title("Nombre d'Annonces par Ville")
count_panel.set_ylabel('Nombre')
count_panel.set_xlabel('Ville')

plt.tight_layout()
plt.show()

In [None]:
# Analysis by transaction type (Location vs Vente).
print("=== STATISTIQUES PAR TYPE ===")
type_stats = df.groupby('Type').agg({
    'Loyer': ['mean', 'median', 'std'],
    'Prix_vente': ['mean', 'median', 'std'],
    'Superficie': 'mean',
    'Nb_pieces': 'mean'
}).round(2)
print(type_stats)

# Combined price column used by the rest of the notebook: 'Loyer' for rows
# whose Type is 'Location', 'Prix_vente' for every other row.
# Vectorised with Series.where instead of a row-wise df.apply(..., axis=1):
# same values, without calling a Python lambda once per row.
df['Prix'] = df['Loyer'].where(df['Type'] == 'Location', df['Prix_vente'])

# Visualisation.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot of the combined price, grouped by type.
df.boxplot(column='Prix', by='Type', ax=axes[0])
axes[0].set_title('Distribution des Prix par Type')
axes[0].set_xlabel('Type')
axes[0].set_ylabel('Prix (TND)')
# Make the left panel current so plt.xticks applies to it.
plt.sca(axes[0])
plt.xticks(rotation=0)

# Violin plot of the same data.
sns.violinplot(data=df, x='Type', y='Prix', ax=axes[1], palette='Set2')
axes[1].set_title('Distribution des Prix par Type (Violin Plot)')
axes[1].set_ylabel('Prix (TND)')

plt.tight_layout()
plt.show()

## 4. Prétraitement des Données

In [None]:
# Split the data into features and target.
# The target is the 'Prix' column, which combines 'Loyer' and 'Prix_vente'
# depending on 'Type' (it must already exist on df when this cell runs).
feature_columns = [
    'Ville', 'Superficie', 'Standing', 'Nb_pieces',
    'Meuble', 'Etage', 'Distance_centre', 'Type',
]
X = df[feature_columns]
y = df['Prix']

print(f"‚úÖ Features (X): {X.shape}")
print(f"‚úÖ Cible (y): {y.shape}")
print("\nPremi√®res lignes de X:")

# Last expression of the cell: the notebook renders the first rows of X.
X.head()

In [None]:
# Column groups consumed by the preprocessing step:
# categorical columns are one-hot encoded, numeric columns are scaled.
categorical_features = ['Ville', 'Type']
numerical_features = [
    'Superficie',
    'Standing',
    'Nb_pieces',
    'Meuble',
    'Etage',
    'Distance_centre',
]

print(f"Variables cat√©gorielles: {categorical_features}")
print(f"Variables num√©riques: {numerical_features}")

In [None]:
# Train/test split: 20% of the rows held out, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

n_train = X_train.shape[0]
n_test = X_test.shape[0]
n_total = n_train + n_test

print(f"‚úÖ Ensemble d'entra√Ænement: {n_train} √©chantillons")
print(f"‚úÖ Ensemble de test: {n_test} √©chantillons")
print(f"\nR√©partition train/test: {n_train/n_total*100:.1f}% / {n_test/n_total*100:.1f}%")

In [None]:
# Build the preprocessor with a ColumnTransformer:
#   - numeric columns are standardised with StandardScaler;
#   - categorical columns are one-hot encoded, dropping the first level of
#     each column.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# it did not see during fit instead of raising. Without it, a category missing
# from a training fold makes transform fail on the held-out rows.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        (
            'cat',
            OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
            categorical_features,
        ),
    ]
)

print("‚úÖ Preprocessor cr√©√© avec:")
print("   - StandardScaler pour les variables num√©riques")
print("   - OneHotEncoder pour les variables cat√©gorielles")

## 5. Entraînement des Modèles

Nous allons entraîner plusieurs modèles de régression et comparer leurs performances :

1. **Linear Regression** (baseline)
2. **Ridge Regression** (régularisation L2)
3. **Lasso Regression** (régularisation L1)
4. **Decision Tree Regressor**
5. **Random Forest Regressor**
6. **Gradient Boosting Regressor**

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Score an already-fitted model on the train and test splits.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Training features, passed to ``model.predict``.
        X_test: Test features, passed to ``model.predict``.
        y_train: Training targets (used for the train R2 only).
        y_test: Test targets (used for every other metric).
        model_name: Label stored under the ``'Model'`` key of the metrics.

    Returns:
        A ``(metrics, y_pred_test)`` tuple: ``metrics`` is a dict with the keys
        ``Model``, ``R2_train``, ``R2_test``, ``MAE``, ``MSE``, ``RMSE``,
        ``MAPE`` (expressed in percent) and ``MedAE``; ``y_pred_test`` holds
        the predictions made on ``X_test``.
    """
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # The test MSE feeds both the 'MSE' and the 'RMSE' entries.
    test_mse = mean_squared_error(y_test, test_pred)

    metrics = {
        'Model': model_name,
        'R2_train': r2_score(y_train, train_pred),
        'R2_test': r2_score(y_test, test_pred),
        'MAE': mean_absolute_error(y_test, test_pred),
        'MSE': test_mse,
        'RMSE': np.sqrt(test_mse),
        'MAPE': mean_absolute_percentage_error(y_test, test_pred) * 100,
        'MedAE': median_absolute_error(y_test, test_pred),
    }
    return metrics, test_pred


# Accumulators filled by the training cells: one metrics dict per model, and
# the test-set predictions keyed by model name.
results = []
predictions = {}

print("‚úÖ Fonction d'√©valuation cr√©√©e!")

### 5.1 Linear Regression (Baseline)

In [None]:
# Linear regression baseline: preprocessing followed by ordinary least squares.
lr_label = 'Linear Regression'
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])
lr_pipeline.fit(X_train, y_train)

# Score the fitted pipeline and record its metrics and test predictions.
metrics_lr, pred_lr = evaluate_model(
    lr_pipeline, X_train, X_test, y_train, y_test, lr_label
)
results.append(metrics_lr)
predictions[lr_label] = pred_lr

print("‚úÖ Linear Regression entra√Æn√©!")
print(f"   R¬≤ (test): {metrics_lr['R2_test']:.4f}")
print(f"   RMSE: {metrics_lr['RMSE']:.2f} TND")

### 5.2 Ridge Regression

In [None]:
# Ridge regression: preprocessing followed by an L2-regularised linear model.
#
# A single label is used for the results table and for the predictions dict.
# It is 'Ridge' because the best model is later looked up by the name stored
# in the results table, both in `predictions` and in the `models_cv` dict
# (whose key is 'Ridge'). Recording the metrics under 'Ridge Regression' made
# those lookups raise KeyError whenever Ridge ranked first.
ridge_name = 'Ridge'

ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0, random_state=42))
])
ridge_pipeline.fit(X_train, y_train)

# Score the fitted pipeline and record its metrics and test predictions.
metrics_ridge, pred_ridge = evaluate_model(
    ridge_pipeline, X_train, X_test, y_train, y_test, ridge_name
)
results.append(metrics_ridge)
predictions[ridge_name] = pred_ridge

print("‚úÖ Ridge Regression entra√Æn√©!")
print(f"   R¬≤ (test): {metrics_ridge['R2_test']:.4f}")
print(f"   RMSE: {metrics_ridge['RMSE']:.2f} TND")

### 5.3 Lasso Regression

In [None]:
# Lasso regression: preprocessing followed by an L1-regularised linear model.
#
# A single label is used for the results table and for the predictions dict.
# It is 'Lasso' because the best model is later looked up by the name stored
# in the results table, both in `predictions` and in the `models_cv` dict
# (whose key is 'Lasso'). Recording the metrics under 'Lasso Regression' made
# those lookups raise KeyError whenever Lasso ranked first.
lasso_name = 'Lasso'

lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=1.0, random_state=42))
])
lasso_pipeline.fit(X_train, y_train)

# Score the fitted pipeline and record its metrics and test predictions.
metrics_lasso, pred_lasso = evaluate_model(
    lasso_pipeline, X_train, X_test, y_train, y_test, lasso_name
)
results.append(metrics_lasso)
predictions[lasso_name] = pred_lasso

print("‚úÖ Lasso Regression entra√Æn√©!")
print(f"   R¬≤ (test): {metrics_lasso['R2_test']:.4f}")
print(f"   RMSE: {metrics_lasso['RMSE']:.2f} TND")

### 5.4 Decision Tree Regressor

In [None]:
# Decision tree: preprocessing followed by a depth-limited regression tree.
dt_label = 'Decision Tree'
dt_regressor = DecisionTreeRegressor(max_depth=10, random_state=42)
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', dt_regressor),
])
dt_pipeline.fit(X_train, y_train)

# Score the fitted pipeline and record its metrics and test predictions.
metrics_dt, pred_dt = evaluate_model(
    dt_pipeline, X_train, X_test, y_train, y_test, dt_label
)
results.append(metrics_dt)
predictions[dt_label] = pred_dt

print("‚úÖ Decision Tree entra√Æn√©!")
print(f"   R¬≤ (test): {metrics_dt['R2_test']:.4f}")
print(f"   RMSE: {metrics_dt['RMSE']:.2f} TND")

### 5.5 Random Forest Regressor

In [None]:
# Random forest: preprocessing followed by a 100-tree forest (depth <= 15)
# that uses all available cores (n_jobs=-1).
rf_label = 'Random Forest'
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    random_state=42,
    n_jobs=-1,
)
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', rf_regressor),
])
rf_pipeline.fit(X_train, y_train)

# Score the fitted pipeline and record its metrics and test predictions.
metrics_rf, pred_rf = evaluate_model(
    rf_pipeline, X_train, X_test, y_train, y_test, rf_label
)
results.append(metrics_rf)
predictions[rf_label] = pred_rf

print("‚úÖ Random Forest entra√Æn√©!")
print(f"   R¬≤ (test): {metrics_rf['R2_test']:.4f}")
print(f"   RMSE: {metrics_rf['RMSE']:.2f} TND")

### 5.6 Gradient Boosting Regressor

In [None]:
# Gradient boosting: preprocessing followed by 100 boosted trees
# (learning rate 0.1, depth <= 5).
gb_label = 'Gradient Boosting'
gb_regressor = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gb_regressor),
])
gb_pipeline.fit(X_train, y_train)

# Score the fitted pipeline and record its metrics and test predictions.
metrics_gb, pred_gb = evaluate_model(
    gb_pipeline, X_train, X_test, y_train, y_test, gb_label
)
results.append(metrics_gb)
predictions[gb_label] = pred_gb

print("‚úÖ Gradient Boosting entra√Æn√©!")
print(f"   R¬≤ (test): {metrics_gb['R2_test']:.4f}")
print(f"   RMSE: {metrics_gb['RMSE']:.2f} TND")

## 6. Validation Croisée

In [None]:
# 5-fold cross-validation (R2 scoring) of every pipeline on the training split.
# This dict is also the name -> pipeline lookup table used when the best
# model is saved.
models_cv = {
    'Linear Regression': lr_pipeline,
    'Ridge': ridge_pipeline,
    'Lasso': lasso_pipeline,
    'Decision Tree': dt_pipeline,
    'Random Forest': rf_pipeline,
    'Gradient Boosting': gb_pipeline,
}
cv_results = {}

print("=== VALIDATION CROIS√âE (5-Fold) ===\n")
for name, model in models_cv.items():
    scores = cross_val_score(
        model, X_train, y_train, cv=5, scoring='r2', n_jobs=-1
    )
    mean_score = scores.mean()
    std_score = scores.std()

    # Keep the summary and the raw per-fold scores for the plot that follows.
    cv_results[name] = {
        'mean': mean_score,
        'std': std_score,
        'scores': scores,
    }

    print(f"{name}:")
    print(f"  R¬≤ moyen: {mean_score:.4f} (+/- {std_score:.4f})")
    print(f"  Scores: {scores}\n")

In [None]:
# Bar chart of the cross-validation results: mean R2 per model, with the
# standard deviation across folds drawn as error bars.
models_names = list(cv_results)
means = [summary['mean'] for summary in cv_results.values()]
stds = [summary['std'] for summary in cv_results.values()]
x_pos = np.arange(len(models_names))

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    x_pos,
    means,
    yerr=stds,
    align='center',
    alpha=0.7,
    ecolor='black',
    capsize=10,
    color='skyblue',
)
ax.set_ylabel('R¬≤ Score')
ax.set_xticks(x_pos)
ax.set_xticklabels(models_names, rotation=45, ha='right')
ax.set_title('Validation Crois√©e - Comparaison des Mod√®les')
ax.yaxis.grid(True)

plt.tight_layout()
plt.show()

## 7. Comparaison des Modèles

In [None]:
# Comparison table: one row per model, rounded to 4 decimals and ordered by
# test R2 (best first).
results_df = (
    pd.DataFrame(results)
    .round(4)
    .sort_values('R2_test', ascending=False)
)

print("=== TABLEAU COMPARATIF DES MOD√àLES ===\n")
print(results_df.to_string(index=False))

# The best model is the first row after sorting; its name is reused by the
# prediction-analysis and model-saving cells.
best_row = results_df.iloc[0]
best_model_name = best_row['Model']

print(f"\nüèÜ MEILLEUR MOD√àLE: {best_model_name}")
print(f"   R¬≤ (test): {best_row['R2_test']:.4f}")
print(f"   RMSE: {best_row['RMSE']:.2f} TND")

In [None]:
# Comparative visualisation: a 2x2 grid of bar charts, one per metric.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top-left panel: R2 on the train split next to R2 on the test split.
ax = axes[0, 0]
results_df.plot(
    x='Model',
    y=['R2_train', 'R2_test'],
    kind='bar',
    ax=ax,
    color=['lightblue', 'darkblue'],
)
ax.set_title('R¬≤ Score - Train vs Test')
ax.set_ylabel('R¬≤ Score')
ax.set_xlabel('')
ax.legend(['Train', 'Test'])
ax.tick_params(axis='x', rotation=45)

# Remaining panels share one layout: (axis, metric column, colour, title, y label).
single_metric_panels = [
    (axes[0, 1], 'RMSE', 'coral', 'RMSE (Root Mean Squared Error)', 'RMSE (TND)'),
    (axes[1, 0], 'MAE', 'lightgreen', 'MAE (Mean Absolute Error)', 'MAE (TND)'),
    (axes[1, 1], 'MAPE', 'gold', 'MAPE (Mean Absolute Percentage Error)', 'MAPE (%)'),
]
for ax, metric, bar_color, panel_title, y_label in single_metric_panels:
    results_df.plot(x='Model', y=metric, kind='bar', ax=ax, color=bar_color, legend=False)
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 8. Analyse d'Importance des Features

In [None]:
# Feature importances of the fitted random forest.
rf_model = rf_pipeline.named_steps['regressor']
feature_importance = rf_model.feature_importances_

# Rebuild the post-transformation feature names in the order the
# ColumnTransformer was declared with: numeric columns first, then the
# one-hot encoded categorical columns.
preprocessor_fitted = rf_pipeline.named_steps['preprocessor']
cat_encoder = preprocessor_fitted.named_transformers_['cat']
cat_features = cat_encoder.get_feature_names_out(categorical_features)
feature_names = [*numerical_features, *cat_features]

# One row per feature, most important first.
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance,
})
importance_df = importance_df.sort_values('Importance', ascending=False)

print("=== IMPORTANCE DES FEATURES (Random Forest) ===\n")
print(importance_df.to_string(index=False))

In [None]:
# Horizontal bar chart of the 15 most important features.
top_features = importance_df.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_features['Importance'], color='steelblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['Feature'])
ax.set_xlabel('Importance')
ax.set_title('Top 15 Features les Plus Importantes (Random Forest)')
# Flip the y axis so the most important feature is drawn at the top.
ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 9. Analyse des Prédictions

In [None]:
# Predictions vs actual values for the best model (looked up by its name).
best_predictions = predictions[best_model_name]
# Residuals are reused by the residual-distribution cell.
residuals = y_test - best_predictions

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
scatter_panel, residual_panel = axes

# Left panel: predicted against actual, with the y = x reference line.
actual_min, actual_max = y_test.min(), y_test.max()
scatter_panel.scatter(y_test, best_predictions, alpha=0.5, s=30)
scatter_panel.plot([actual_min, actual_max], [actual_min, actual_max], 'r--', lw=2)
scatter_panel.set_xlabel('Valeurs R√©elles (TND)')
scatter_panel.set_ylabel('Pr√©dictions (TND)')
scatter_panel.set_title(f'Pr√©dictions vs R√©alit√© - {best_model_name}')
scatter_panel.grid(True, alpha=0.3)

# Right panel: residuals against predictions, with the zero line.
residual_panel.scatter(best_predictions, residuals, alpha=0.5, s=30)
residual_panel.axhline(y=0, color='r', linestyle='--', lw=2)
residual_panel.set_xlabel('Pr√©dictions (TND)')
residual_panel.set_ylabel('R√©sidus (TND)')
residual_panel.set_title(f'Analyse des R√©sidus - {best_model_name}')
residual_panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Distribution of the residuals: histogram and normal Q-Q plot.
from scipy import stats

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_panel, qq_panel = axes

# Histogram with a vertical reference line at zero.
hist_panel.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
hist_panel.set_xlabel('R√©sidus (TND)')
hist_panel.set_ylabel('Fr√©quence')
hist_panel.set_title('Distribution des R√©sidus')
hist_panel.axvline(x=0, color='r', linestyle='--', lw=2)

# Q-Q plot of the residuals against a normal distribution.
stats.probplot(residuals, dist="norm", plot=qq_panel)
qq_panel.set_title('Q-Q Plot des R√©sidus')

plt.tight_layout()
plt.show()

print(f"\nMoyenne des r√©sidus: {residuals.mean():.2f} TND")
print(f"√âcart-type des r√©sidus: {residuals.std():.2f} TND")

## 10. Sauvegarde du Meilleur Modèle

In [None]:
# Persist the best model to disk.
import os

import joblib

# Pick the fitted pipeline registered under the best model's name.
best_pipeline = models_cv[best_model_name]

# Create the output directory if it does not exist yet, then dump the pipeline.
os.makedirs('../models', exist_ok=True)
model_path = '../models/best_model.joblib'
joblib.dump(best_pipeline, model_path)

best_r2_test = results_df.iloc[0]['R2_test']
print(f"‚úÖ Meilleur mod√®le sauvegard√©: {model_path}")
print(f"   Mod√®le: {best_model_name}")
print(f"   R¬≤ (test): {best_r2_test:.4f}")
print("\nüéâ Analyse ML termin√©e avec succ√®s!")