In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.cluster import KMeans
import time

In [3]:
data = pd.read_csv('database_clean.csv')

In [6]:
# Features et cible
features = ['distance_km', 'passenger_count', 'pickup_hour', 'is_weekend','pickup_dow']
X = data[features]
y = data['fare_amount']

In [8]:
# Standardisation des features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [10]:
# Séparation train/test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [12]:

# Dictionnaire des modèles
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=200,learning_rate=0.1, max_depth=6,subsample=0.8,colsample_bytree=0.8,random_state=42),
    'KNN': KNeighborsRegressor(n_neighbors=5)
}

In [14]:
# Fonction d'évaluation
def evaluate_model(model, model_name, X_train, X_test, y_train, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Validation croisée
    cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='r2')
    cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
    
    execution_time = time.time() - start_time
    return {
        'Model': model_name,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'CV R2 Mean': cv_mean,
        'CV R2 Std': cv_std,
        'Time (s)': execution_time
    }

In [16]:
# Évaluation des modèles de régression
results = []
for name, model in models.items():
    result = evaluate_model(model, name, X_train, X_test, y_train, y_test)
    results.append(result)
    print(f"{name}: RMSE = {result['RMSE']:.2f}, MAE = {result['MAE']:.2f}, R2 = {result['R2']:.3f}, "
      f"CV R2 = {result['CV R2 Mean']:.3f} (±{result['CV R2 Std']:.3f}), Time = {result['Time (s)']:.2f}s")

Linear Regression: RMSE = 4.88, MAE = 2.45, R2 = 0.703, CV R2 = 0.737 (±0.029), Time = 0.10s
Random Forest: RMSE = 4.44, MAE = 2.60, R2 = 0.755, CV R2 = 0.758 (±0.011), Time = 77.24s
Gradient Boosting: RMSE = 4.00, MAE = 2.33, R2 = 0.801, CV R2 = 0.784 (±0.012), Time = 35.33s
XGBoost: RMSE = 4.34, MAE = 2.43, R2 = 0.765, CV R2 = 0.756 (±0.013), Time = 1.72s
KNN: RMSE = 4.38, MAE = 2.61, R2 = 0.762, CV R2 = 0.752 (±0.010), Time = 0.79s


In [18]:
# Clustering avec KMeans
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)
data['cluster'] = cluster_labels

In [20]:
# Évaluation du clustering (inertie et silhouette si besoin)
inertia = kmeans.inertia_
print(f"Clustering Inertia: {inertia:.2f}")

Clustering Inertia: 93325.59


In [22]:
# Résultats par cluster
cluster_summary = data.groupby('cluster')[features + ['fare_amount']].mean()
print("\nCluster Means:")
print(cluster_summary)


Cluster Means:
         distance_km  passenger_count  pickup_hour  is_weekend  pickup_dow  \
cluster                                                                      
0           2.618446         1.228220    13.913167    0.000000    2.080863   
1           2.904813         1.348556    12.413091    1.000000    5.556771   
2           2.962996         5.044733    13.971861    0.312169    3.193843   
3          15.738089         1.507678    13.049424    0.242322    2.984165   

         fare_amount  
cluster               
0           9.399325  
1           9.708264  
2          10.169519  
3          38.895576  


In [24]:

# Sauvegarde des résultats
results_df = pd.DataFrame(results)
results_df.to_csv('model_evaluation_results.csv', index=False)
print("\nResults saved to 'model_evaluation_results.csv'")


Results saved to 'model_evaluation_results.csv'


In [26]:
# Meilleur modèle basé sur R2
best_model = max(results, key=lambda x: x['R2'])
print(f"\nBest Model: {best_model['Model']} with R2 = {best_model['R2']:.3f}")


Best Model: Gradient Boosting with R2 = 0.801


In [28]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


In [30]:
# Scores d'évaluation non supervisée
silhouette = silhouette_score(X_scaled, cluster_labels)
calinski = calinski_harabasz_score(X_scaled, cluster_labels)
davies = davies_bouldin_score(X_scaled, cluster_labels)

print(f"Silhouette Score: {silhouette:.3f}")
print(f"Calinski-Harabasz Index: {calinski:.3f}")
print(f"Davies-Bouldin Index: {davies:.3f}")

Silhouette Score: 0.429
Calinski-Harabasz Index: 18098.251
Davies-Bouldin Index: 0.954


In [None]:
Silhouette Score = 0.429
➝ Le score est compris entre -1 et 1. Plus il est proche de 1, plus les clusters sont bien séparés.
➝ 0.429 est correct : ça montre que les clusters sont plutôt bien définis, mais pas parfaits.

Calinski-Harabasz Index = 18 098.251
➝ Plus ce score est élevé, mieux c’est.
➝ valeur est très grande, ce qui confirme que les clusters sont denses et bien séparés.

Davies-Bouldin Index = 0.954
➝ Plus il est bas, mieux c’est.
➝ En dessous de 1, c’est déjà un bon signe : les clusters ne se chevauchent pas trop.