In [44]:
import pandas as pd

## Data Collection

In [45]:
import requests

# Remote locations of the exercise CSV files (train first, test second)
train_data_url, test_data_url = (
    'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv',
    'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv',
)

def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the downloaded bytes are written to (binary mode).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error status.
        requests.exceptions.Timeout: If the server does not respond within ``timeout``.
    """
    # requests applies no timeout unless one is given, so a stalled server
    # would otherwise block the notebook cell indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Surface bad responses instead of saving an error page
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both datasets into the current working directory
for source_url, target_name in (
    (train_data_url, 'module3_exercise_train.csv'),
    (test_data_url, 'module3_exercise_test.csv'),
):
    download_file(source_url, target_name)

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv


### Import train & test datasets

In [72]:
# Load the training CSV (comma is already read_csv's default separator)
df_train = pd.read_csv("module3_exercise_train.csv")
df_train

Unnamed: 0,id,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,254,2,1,4.0,1,1977.0,two,440,0,55,0,0,165,0,0,7,2010.0,127500
1,1066,1,1,5.0,1,1983.0,two,612,349,40,0,0,0,0,0,9,2009.0,316600
2,638,4,1,10.0,1,1998.0,two,420,144,123,0,0,0,0,0,7,2006.0,258000
3,799,3,1,8.0,0,1916.0,one,180,0,0,0,140,0,0,0,8,2009.0,135000
4,380,2,1,5.0,0,2005.0,two,438,108,0,0,0,0,0,0,3,2006.0,167240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,1095,4,1,7.0,0,-1.0,zero,0,0,192,0,0,0,0,0,11,2008.0,130000
1164,1130,3,1,6.0,0,1964.0,two,504,0,0,0,0,0,0,0,7,2008.0,145000
1165,1294,3,1,7.0,1,1996.0,three,889,220,0,0,0,0,0,0,7,2009.0,265000
1166,860,3,1,6.0,1,1966.0,two,453,188,108,0,0,0,0,0,7,2006.0,155000


In [73]:
# Load the test CSV (comma is already read_csv's default separator)
df_test = pd.read_csv('module3_exercise_test.csv')
df_test

Unnamed: 0,id,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,892,3,1,8,1.0,2006.0,three,788,0,191,0.0,0,0,0,0,3,2007
1,1105,3,1,8,2.0,1965.0,two,513,0,0,0.0,0,0,0,0,6,2009
2,413,2,1,7,3.0,1955.0,one,303,476,0,0.0,0,142,0,0,11,2009
3,522,3,1,7,0.0,1993.0,two,457,370,70,0.0,238,0,0,0,2,2010
4,1036,3,1,6,0.0,1999.0,two,506,0,34,0.0,0,0,0,0,3,2006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,479,4,1,10,1.0,1969.0,one,336,272,0,42.0,0,116,0,0,3,2010
288,1361,3,1,8,0.0,2003.0,two,500,144,68,0.0,0,0,0,0,9,2007
289,802,3,1,7,1.0,2002.0,two,577,0,211,0.0,0,0,0,0,3,2008
290,651,4,2,8,1.0,1999.0,two,672,344,0,,0,0,0,0,5,2010


### Quick check

In [47]:
def quick_check(df, n=5):
    """Print and display a quick overview of a DataFrame.

    Shows, in order: the shape, the first ``n`` rows, ``df.info()``, the column
    dtypes, descriptive statistics (default selection, then object columns),
    the per-column count of missing values and the per-column count of
    distinct values.

    Relies on ``display`` being available in the namespace, as it is inside a
    notebook session.

    Args:
        df: DataFrame to inspect.
        n: Number of leading rows to show in the preview.

    Returns:
        None. Everything is printed or displayed.
    """
    print("📊 Dimensions :", df.shape)
    print("\n🔎 Aperçu des", n, "premières lignes :")
    display(df.head(n))

    print("\nℹ️ Infos générales :")
    df.info()

    print("\n📐 Types de colonnes :")
    print(df.dtypes)

    print("\n📉 Statistiques descriptives (numériques) :")
    display(df.describe())

    print("\n📉 Statistiques descriptives (catégorielles / objets) :")
    # describe(include="O") raises ValueError when the frame has no object
    # column (for instance once every categorical column has been encoded),
    # so only call it when there is something to describe.
    if df.select_dtypes(include="O").shape[1] > 0:
        display(df.describe(include="O"))
    else:
        print("Aucune colonne de type objet.")

    print("\n❓ Valeurs manquantes :")
    print(df.isna().sum())

    print("\n🔁 Nb de valeurs uniques par colonne :")
    print(df.nunique())


In [48]:
# Overview of the raw training frame, previewing the default five rows
quick_check(df_train, n=5)


📊 Dimensions : (1168, 18)

🔎 Aperçu des 5 premières lignes :


Unnamed: 0,id,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,254,2,1,4.0,1,1977.0,two,440,0,55,0,0,165,0,0,7,2010.0,127500
1,1066,1,1,5.0,1,1983.0,two,612,349,40,0,0,0,0,0,9,2009.0,316600
2,638,4,1,10.0,1,1998.0,two,420,144,123,0,0,0,0,0,7,2006.0,258000
3,799,3,1,8.0,0,1916.0,one,180,0,0,0,140,0,0,0,8,2009.0,135000
4,380,2,1,5.0,0,2005.0,two,438,108,0,0,0,0,0,0,3,2006.0,167240



ℹ️ Infos générales :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1168 non-null   int64  
 1   BedroomAbvGr   1168 non-null   int64  
 2   KitchenAbvGr   1168 non-null   int64  
 3   TotRmsAbvGrd   1167 non-null   float64
 4   Fireplaces     1168 non-null   int64  
 5   GarageYrBlt    1168 non-null   float64
 6   GarageCars     1167 non-null   object 
 7   GarageArea     1168 non-null   int64  
 8   WoodDeckSF     1168 non-null   int64  
 9   OpenPorchSF    1168 non-null   int64  
 10  EnclosedPorch  1168 non-null   int64  
 11  3SsnPorch      1168 non-null   int64  
 12  ScreenPorch    1168 non-null   int64  
 13  PoolArea       1168 non-null   int64  
 14  MiscVal        1168 non-null   int64  
 15  MoSold         1168 non-null   int64  
 16  YrSold         1167 non-null   float64
 17  SalePrice      1168 non-null  

Unnamed: 0,id,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1168.0,1168.0,1168.0,1167.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1167.0,1168.0
mean,729.904966,2.872432,1.043664,6.497858,0.607877,1867.833904,469.120719,91.976027,46.300514,23.607021,2.782534,15.701199,2.378425,39.72089,6.336473,2007.802057,179163.84589
std,425.369088,0.805929,0.212653,1.61833,0.630833,454.504382,210.626791,123.448275,67.180017,62.307174,23.908902,57.516696,36.47796,489.113625,2.692267,1.327656,76848.682649
min,0.0,0.0,1.0,2.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,359.75,2.0,1.0,5.0,0.0,1957.0,324.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,130000.0
50%,731.5,3.0,1.0,6.0,1.0,1977.0,478.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,161625.0
75%,1100.75,3.0,1.0,7.0,1.0,2000.0,576.0,168.0,66.25,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,212250.0
max,1459.0,8.0,3.0,14.0,3.0,2010.0,1418.0,736.0,547.0,386.0,320.0,480.0,648.0,15500.0,12.0,2010.0,745000.0



📉 Statistiques descriptives (catégorielles / objets) :


Unnamed: 0,GarageCars
count,1167
unique,5
top,two
freq,662



❓ Valeurs manquantes :
id               0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     1
Fireplaces       0
GarageYrBlt      0
GarageCars       1
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           1
SalePrice        0
dtype: int64

🔁 Nb de valeurs uniques par colonne :
id               1168
BedroomAbvGr        8
KitchenAbvGr        3
TotRmsAbvGrd       12
Fireplaces          4
GarageYrBlt        98
GarageCars          5
GarageArea        387
WoodDeckSF        242
OpenPorchSF       184
EnclosedPorch     105
3SsnPorch          15
ScreenPorch        66
PoolArea            6
MiscVal            19
MoSold             12
YrSold              5
SalePrice         571
dtype: int64


## Data Preprocessing

In [74]:
# Remove the 'id' column and replace every missing value with the -1 sentinel
df_train = df_train.drop(columns='id').fillna(value=-1)
df_test = df_test.drop(columns='id').fillna(value=-1)


In [77]:
# Frequency of each GarageCars label in the training frame
garage_car_labels = df_train['GarageCars']
garage_car_labels.value_counts()

GarageCars
two      662
one      298
three    140
zero      65
four       2
-1         1
Name: count, dtype: int64

### Encoding GarageCars values

In [79]:
# Spelled-out garage capacities and the integer each one is encoded as.
mapping = {
    "zero": 0,
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
    "five": 5,
    "six": 6,
    -1: -1  # missing-value sentinel kept as -1 (mapping it to NaN would trip the check below)
}


def _encode_garage_cars(df, mapping):
    """Return ``df`` with ``GarageCars`` replaced by the integer column ``GarageCars_num``.

    Raises:
        ValueError: If ``GarageCars`` holds a value that is not a key of ``mapping``.
    """
    # Re-running the cell: the column was already encoded and dropped, nothing to do.
    if "GarageCars" not in df.columns and "GarageCars_num" in df.columns:
        return df

    encoded = df["GarageCars"].map(mapping)
    # Series.map yields NaN for labels absent from the mapping; fail here with a
    # clear message rather than letting NaN reach the model.
    unmapped = df.loc[encoded.isna(), "GarageCars"].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped GarageCars values: {list(unmapped)}")

    # assign() appends the new column last, then the text column is dropped.
    return df.assign(GarageCars_num=encoded).drop(columns="GarageCars")


# Map and encode, then drop the categorical column
df_train = _encode_garage_cars(df_train, mapping)
df_test = _encode_garage_cars(df_test, mapping)

df_train["GarageCars_num"].value_counts()

GarageCars_num
 2    662
 1    298
 3    140
 0     65
 4      2
-1      1
Name: count, dtype: int64

### Split train dataset into train, val

In [82]:
from sklearn.model_selection import train_test_split

# Separate the target column from the features
y = df_train['SalePrice']
X = df_train.drop(columns=['SalePrice'])

# Hold out 20% of the labelled rows as a validation set (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Features of the test file, copied so df_test itself stays untouched
X_test = df_test.copy()

## Model Building and Evaluation

### Imports

In [83]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

### Model

In [87]:
# Linear regression with default settings, fitted on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

### Apply model to validation set

In [88]:
# Predictions for the held-out validation rows
y_val_pred = model.predict(X=X_val)

### Results on val set 

In [91]:
# Mean absolute error between validation targets and predictions
mae_val = mean_absolute_error(y_true=y_val, y_pred=y_val_pred)
mae_val

32712.38339012344

### Validation MAE is below 36 000

In [92]:
# Predictions for the preprocessed test features
y_test_pred = model.predict(X=X_test)

## Generating Submission File

In [93]:
# Re-read the raw test file indexed by 'id' so the ids can be attached to the predictions.
# NOTE(review): this rebinds X_test to the unprocessed frame; re-running the prediction
# cell after this one would pass raw columns to the model.
X_test = pd.read_csv("module3_exercise_test.csv", index_col='id')

In [94]:
# Pair each test id with its predicted price (row order follows the test file)
submission_columns = {'id': X_test.index, 'SalePrice': y_test_pred}
submission = pd.DataFrame(submission_columns)

# Write the submission without the positional index, then preview it
submission.to_csv('submission.csv', sep=',', index=False)
submission.head()

Unnamed: 0,id,SalePrice
0,892,284047.496867
1,1105,234570.514942
2,413,243200.437924
3,522,240488.655625
4,1036,156352.963208
