In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### Data Collection

In [25]:
import requests

# Remote locations of the exercise datasets (train and test CSV files)
train_data_url = (
    'https://www.raphaelcousin.com/modules/data-science-practice'
    '/module3/exercise/module3_exercise_train.csv'
)
test_data_url = (
    'https://www.raphaelcousin.com/modules/data-science-practice'
    '/module3/exercise/module3_exercise_test.csv'
)

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and save the response body to ``file_name``.

    Args:
        url: Address fetched with an HTTP GET request.
        file_name: Local path the response body is written to (binary mode,
            overwriting any existing file).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection and hang the notebook cell.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail loudly instead of saving an error page
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch each dataset and store it next to the notebook
for source_url, target_name in (
    (train_data_url, 'module3_exercise_train.csv'),
    (test_data_url, 'module3_exercise_test.csv'),
):
    download_file(source_url, target_name)

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv


In [38]:
# Load the training data, using the 'id' column as the row index
df_train = pd.read_csv(
    "module3_exercise_train.csv",
    sep=",",
    index_col='id',
)

### Data Preprocessing

In [39]:
# Preview 10 random rows; the fixed seed keeps the preview identical across re-runs
df_train.sample(10, random_state=42)

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
178,2,1,4.0,0,1977.0,two,440,0,55,0,0,200,0,0,10,2008.0,140000
780,3,1,7.0,0,2007.0,two,562,0,0,0,0,0,0,0,5,2008.0,205950
1196,2,1,5.0,0,1997.0,one,308,0,0,168,0,0,0,0,7,2007.0,108500
1360,3,1,5.0,0,1970.0,one,280,0,0,0,0,0,0,0,2,2010.0,122000
1259,4,1,8.0,1,1958.0,two,451,0,0,0,0,0,0,0,7,2006.0,172500
1353,3,1,6.0,1,2005.0,two,453,38,144,0,0,0,0,0,4,2006.0,176000
137,3,1,6.0,0,2003.0,two,595,0,45,0,0,0,0,0,6,2008.0,183000
468,4,1,8.0,0,1968.0,two,441,0,0,0,0,0,0,0,5,2009.0,154000
708,3,1,5.0,0,1975.0,two,440,0,0,0,0,0,0,0,1,2007.0,122000
104,4,2,8.0,0,1920.0,three,779,0,0,90,0,0,0,0,5,2006.0,87000


In [40]:
# (number of rows, number of columns) of the raw training frame
df_train.shape

(1168, 17)

In [41]:
def dataClean(data):
    """Return a cleaned copy of a housing DataFrame.

    Steps, in order:
      1. Drop the columns '3SsnPorch', 'ScreenPorch' and 'MiscVal' when present.
      2. If 'GarageCars' exists and is not numeric, convert its number words
         ('one', 'Two', ...) to numbers, then drop the rows whose value is
         missing or is not one of the known words.
      3. Replace every remaining missing value with -1. A 'GarageCars' column
         that was already numeric therefore keeps its rows, with -1 for gaps.

    The input frame is never modified, so calling the function again on the
    same frame returns the same result.

    Args:
        data: pandas DataFrame to clean. 'GarageCars' and the dropped columns
            are all optional.

    Returns:
        A new, cleaned DataFrame; rows may have been removed (see step 2).
    """
    # Columns to remove
    cols_to_drop = ['3SsnPorch', 'ScreenPorch', 'MiscVal']

    # Lookup table turning number words into digits
    word_to_num = {
        'zero': 0,
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
        'six': 6,
        'seven': 7,
        'eight': 8,
        'nine': 9,
        'ten': 10
    }

    # Explicit copy: none of the steps below may leak into the caller's frame.
    cleaned = data.drop(columns=cols_to_drop, errors='ignore').copy()

    # Convert only when the column exists and still holds text. Checking for
    # "not numeric" (rather than dtype == object) also covers string dtypes.
    if 'GarageCars' in cleaned.columns and not pd.api.types.is_numeric_dtype(cleaned['GarageCars']):
        # Non-string cells and unknown words both become NaN here ...
        cleaned['GarageCars'] = cleaned['GarageCars'].str.lower().map(word_to_num)
        # ... and those rows are removed.
        # NOTE(review): this also removes rows when a test set is passed in,
        # which shortens the prediction output - confirm that is intended.
        cleaned = cleaned.dropna(subset=['GarageCars'])

    # Replace the remaining missing values with -1
    cleaned = cleaned.fillna(-1)

    return cleaned


In [42]:
# Clean the training frame, then preview 10 random rows.
# The fixed seed keeps the preview identical across re-runs.
data_train_clean = dataClean(df_train)
data_train_clean.sample(10, random_state=42)

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,PoolArea,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1181,3,1,7.0,1,1976.0,2.0,440,87,0,0,0,7,2009.0,148500
1100,2,1,6.0,2,1957.0,1.0,319,288,258,0,0,12,2006.0,207500
467,2,1,11.0,2,2009.0,3.0,820,0,67,0,0,3,2010.0,611657
459,3,1,8.0,0,1997.0,2.0,532,509,135,0,0,10,2009.0,167500
304,4,1,9.0,1,1964.0,2.0,540,0,52,0,0,6,2007.0,165000
384,3,1,6.0,0,2004.0,2.0,576,0,102,0,0,6,2008.0,159000
920,2,1,5.0,0,1961.0,1.0,275,0,0,112,0,5,2008.0,130000
1073,3,1,6.0,0,1965.0,1.0,276,736,68,0,0,6,2008.0,147500
1063,4,2,9.0,0,1910.0,1.0,440,0,60,112,0,4,2006.0,137000
1401,1,1,3.0,0,-1.0,0.0,0,96,24,0,0,5,2010.0,75500


In [43]:
# To look for missing values (NaN) in GarageCars:
#missing_rows = data_train_clean[data_train_clean['GarageCars'].isna()]

# To look for the rows where GarageCars equals -1:
#missing_rows = data_train_clean[data_train_clean['GarageCars'] == -1]

#print(missing_rows)


### Model Building and Evaluation

In [44]:
# Import necessary libraries for model building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [45]:
# Columns used as model inputs
features = [
    'BedroomAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageCars',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'MoSold',
    'YrSold',
]

# Feature matrix and target vector taken from the cleaned training frame
X = data_train_clean.loc[:, features]
y = data_train_clean.loc[:, 'SalePrice']



In [46]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [47]:
# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [48]:
# Evaluate the model

def pred_eval(model, X_t, y_t):
    """Predict on ``X_t``, print the mean absolute error, return the predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_t: Feature rows to predict on.
        y_t: True target values matching ``X_t``.

    Returns:
        The predictions produced by ``model.predict(X_t)``.
    """
    predictions = model.predict(X_t)
    print("MAE:", mean_absolute_error(y_t, predictions))
    return predictions

In [49]:
# Predict on the training split and print its MAE (in-sample error)
y_pred = pred_eval(model, X_train, y_train)

MAE: 33407.78903187546


In [50]:
# Predict on the hold-out split produced by train_test_split and print its MAE
y_pred = pred_eval(model, X_test, y_test)

MAE: 34036.56100266357


### Generating Submission File

In [51]:
# Load the submission test set, using the 'id' column as the row index
data_test = pd.read_csv(
    "module3_exercise_test.csv",
    sep=",",
    index_col='id',
)

In [52]:
# Data preprocessing: apply the same cleaning function used on the training data.
# Note: this rebinds X_test, which until now held the hold-out split returned
# by train_test_split.
X_test = dataClean(data_test)

In [53]:
# Full training set: every cleaned row, no hold-out
X_train_full = data_train_clean.loc[:, features]
y_train_full = data_train_clean.loc[:, 'SalePrice']

In [54]:
# Refit the same model object on all cleaned training rows (no hold-out)
model.fit(X_train_full, y_train_full)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [55]:
# Predict sale prices for the cleaned test rows, using the same feature columns as training
y_pred = model.predict(X_test[features])

In [56]:
# Pair each test-set id with its predicted sale price
submission = pd.DataFrame(
    {
        'id': X_test.index,
        'SalePrice': y_pred,
    }
)

# Write without the positional index so the file holds only the two columns
submission.to_csv('submission.csv', sep=',', index=False)
submission.head()

Unnamed: 0,id,SalePrice
0,892,276214.65562
1,1105,230902.830726
2,413,248536.726587
3,522,203623.240547
4,1036,154213.020969
