In [39]:
import pandas as pd

### Data Collection

In [40]:
import requests

# URLs of the files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_data_url, 'module3_exercise_train.csv')
download_file(test_data_url, 'module3_exercise_test.csv')

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv


In [41]:
df_train =  pd.read_csv("module3_exercise_train.csv", sep=",", index_col='id')
df_train = df_train.dropna()

### Data Preprocessing

In [42]:
df_train

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
254,2,1,4.0,1,1977.0,two,440,0,55,0,0,165,0,0,7,2010.0,127500
1066,1,1,5.0,1,1983.0,two,612,349,40,0,0,0,0,0,9,2009.0,316600
638,4,1,10.0,1,1998.0,two,420,144,123,0,0,0,0,0,7,2006.0,258000
799,3,1,8.0,0,1916.0,one,180,0,0,0,140,0,0,0,8,2009.0,135000
380,2,1,5.0,0,2005.0,two,438,108,0,0,0,0,0,0,3,2006.0,167240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,4,1,7.0,0,-1.0,zero,0,0,192,0,0,0,0,0,11,2008.0,130000
1130,3,1,6.0,0,1964.0,two,504,0,0,0,0,0,0,0,7,2008.0,145000
1294,3,1,7.0,1,1996.0,three,889,220,0,0,0,0,0,0,7,2009.0,265000
860,3,1,6.0,1,1966.0,two,453,188,108,0,0,0,0,0,7,2006.0,155000


In [43]:
df_train.drop(df_train.columns[0],axis = 1)
df_train.fillna(-1)

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
254,2,1,4.0,1,1977.0,two,440,0,55,0,0,165,0,0,7,2010.0,127500
1066,1,1,5.0,1,1983.0,two,612,349,40,0,0,0,0,0,9,2009.0,316600
638,4,1,10.0,1,1998.0,two,420,144,123,0,0,0,0,0,7,2006.0,258000
799,3,1,8.0,0,1916.0,one,180,0,0,0,140,0,0,0,8,2009.0,135000
380,2,1,5.0,0,2005.0,two,438,108,0,0,0,0,0,0,3,2006.0,167240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,4,1,7.0,0,-1.0,zero,0,0,192,0,0,0,0,0,11,2008.0,130000
1130,3,1,6.0,0,1964.0,two,504,0,0,0,0,0,0,0,7,2008.0,145000
1294,3,1,7.0,1,1996.0,three,889,220,0,0,0,0,0,0,7,2009.0,265000
860,3,1,6.0,1,1966.0,two,453,188,108,0,0,0,0,0,7,2006.0,155000


In [44]:
def namer (x):
    if x == 'two':
        return 2
    elif x == 'one':
        return 1
    elif x == 'zero':
        return 0
    else:
        return 3

df_train['GarageCars'] = df_train['GarageCars'].map(namer)

### Model Building and Evaluation

In [45]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

### Generating Submission File

In [46]:
X_test =  pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')


In [54]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

# On suppose que X et target sont déjà définis
# X, target = df_train[df_train.columns[:-1]].values , df_train[[df_train.columns[-1]]].values

# ==============================================================================
# 1. PRÉPARATION DES DONNÉES (SIMPLE ET STANDARD)
# ==============================================================================

# a) Séparer les données en ensembles d'entraînement et de validation
X_train, X_val, y_train, y_val = train_test_split(X, target, test_size=0.2, random_state=42)

# b) Mettre les features à l'échelle (bonne pratique, même si moins critique pour XGBoost)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# ==============================================================================
# 2. ENTRAÎNEMENT DU MODÈLE XGBOOST
# ==============================================================================

# a) Initialiser le modèle avec des hyperparamètres robustes
# n_estimators : nombre d'arbres
# learning_rate : vitesse d'apprentissage
# max_depth : profondeur maximale de chaque arbre
# subsample : fraction d'échantillons utilisée pour chaque arbre (prévient le surapprentissage)
xgbr = xgb.XGBRegressor(objective='reg:squarederror',
                        n_estimators=1000,
                        learning_rate=0.05,
                        max_depth=5,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        random_state=42,
                        n_jobs=-1)

# b) Entraîner le modèle
print("Début de l'entraînement avec XGBoost...")
xgbr.fit(X_train_scaled, y_train,
         eval_set=[(X_val_scaled, y_val)])                    # Ne pas afficher les logs de chaque tour

print("✅ Entraînement terminé !")

# ==============================================================================
# 3. ÉVALUATION
# ==============================================================================

# a) Faire des prédictions sur l'ensemble de validation
y_pred_val = xgbr.predict(X_val_scaled)

# b) Calculer les métriques finales
final_mae = mean_absolute_error(y_val, y_pred_val)
final_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

print(f"\nMAE finale sur l'ensemble de validation : {final_mae:.2f}")
print(f"RMSE finale sur l'ensemble de validation : {final_rmse:.2f}")

if final_mae < 30000:
    print("\n🎉 Objectif atteint !")
else:
    print("\nObjectif non atteint. Les hyperparamètres de XGBoost peuvent être optimisés davantage.")

Début de l'entraînement avec XGBoost...
[0]	validation_0-rmse:77058.98715
[1]	validation_0-rmse:74986.38757
[2]	validation_0-rmse:72914.11988
[3]	validation_0-rmse:70815.32551
[4]	validation_0-rmse:69031.81664
[5]	validation_0-rmse:67027.04033
[6]	validation_0-rmse:65572.71887
[7]	validation_0-rmse:64033.52202
[8]	validation_0-rmse:62543.20142
[9]	validation_0-rmse:61278.26329
[10]	validation_0-rmse:60263.50464
[11]	validation_0-rmse:58811.03918
[12]	validation_0-rmse:57727.94537
[13]	validation_0-rmse:56648.57144
[14]	validation_0-rmse:55943.22035
[15]	validation_0-rmse:54957.40493
[16]	validation_0-rmse:53687.31370
[17]	validation_0-rmse:52609.74602
[18]	validation_0-rmse:51811.47709
[19]	validation_0-rmse:51286.83798
[20]	validation_0-rmse:50456.79379
[21]	validation_0-rmse:49839.46807
[22]	validation_0-rmse:49142.96882
[23]	validation_0-rmse:48680.54748
[24]	validation_0-rmse:48084.48624
[25]	validation_0-rmse:47845.89017
[26]	validation_0-rmse:47433.64065
[27]	validation_0-rmse:46

In [55]:
X_test['GarageCars'] = X_test['GarageCars'].map(namer)
def replace(x,mean):
  if pd.isna(x):
    return mean
  else:
    return x
cols = X_test.columns

for col in cols:
    m = X_test[col].mean()
    X_test[col] = X_test[col].apply(lambda x: replace(x, m))

In [56]:
# Mettre X_test à l'échelle PUIS prédire en une seule ligne
predictions = xgbr.predict(scaler.transform(X_test))



In [57]:
submission = pd.DataFrame({
    'id': X_test.index,
    'SalePrice': predictions
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,id,SalePrice
0,892,280148.09375
1,1105,232439.8125
2,413,203104.90625
3,522,234991.84375
4,1036,191170.953125


✅ df_train y df_test creados con éxito.

Aperçu de df_train:
         CRIM    ZN  INDUS  CHAS     NOX     RM   AGE     DIS  RAD    TAX  \
477  15.02340   0.0  18.10     0  0.6140  5.304  97.3  2.1007   24  666.0   
15    0.62739   0.0   8.14     0  0.5380  5.834  56.5  4.4986    4  307.0   
332   0.03466  35.0   6.06     0  0.4379  6.031  23.3  6.6407    1  304.0   
423   7.05042   0.0  18.10     0  0.6140  6.103  85.1  2.0218   24  666.0   
19    0.72580   0.0   8.14     0  0.5380  5.727  69.5  3.7965    4  307.0   

     PTRATIO       B  LSTAT  SalePrice  
477     20.2  349.48  24.91       12.0  
15      21.0  395.62   8.47       19.9  
332     16.9  362.25   7.83       19.4  
423     20.2    2.52  23.29       13.4  
19      21.0  390.95  11.28       18.2  

------------------

Aperçu de df_test (este es el que usarás para la predicción final):
        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
173  0.09178   0.0   4.05     0  0.510  6.416  84.1  2.6463    5  