In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### Data Collection

In [2]:
import requests

# Remote locations of the exercise CSVs (training set and test set)
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and save the response body to ``file_name``.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    file_name : str
        Path of the local file to write; an existing file is overwritten.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without it a stalled connection would block the notebook forever.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both CSVs into the current working directory (existing copies are overwritten)
download_file(train_data_url, 'module3_exercise_train.csv')
download_file(test_data_url, 'module3_exercise_test.csv')

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv


In [3]:
# Load the training CSV (comma-separated, the read_csv default) with 'id' as the row index
df_train = pd.read_csv("module3_exercise_train.csv", index_col='id')

### Data Preprocessing

In [4]:
# Preview 10 random rows; no random_state is passed, so the selection differs between runs
df_train.sample(10)

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
653,3,1,5.0,0,1966.0,one,264,0,0,0,0,0,0,0,10,2008.0,110000
512,3,1,8.0,1,2010.0,three,606,168,95,0,0,0,0,0,4,2010.0,395192
943,3,1,6.0,0,2005.0,two,508,264,98,0,0,0,0,0,1,2007.0,203000
1357,3,1,6.0,0,1966.0,one,386,0,0,0,0,0,0,0,1,2007.0,127000
246,4,1,7.0,2,1970.0,two,550,0,42,0,0,0,0,0,5,2008.0,237500
165,2,1,5.0,0,1938.0,one,200,0,0,96,0,0,0,0,6,2010.0,58500
171,3,1,7.0,3,1965.0,two,486,0,42,0,0,189,0,0,10,2009.0,205000
506,2,1,5.0,1,1926.0,one,180,0,128,0,0,0,0,0,5,2007.0,102000
1261,3,1,7.0,0,2007.0,two,578,144,105,0,0,0,0,0,4,2009.0,212000
386,4,2,8.0,0,1967.0,two,576,0,0,0,0,0,0,0,6,2007.0,143000


In [5]:
# (number of rows, number of columns) of the training frame
df_train.shape

(1168, 17)

In [22]:
def dataClean(data):
    """Return a cleaned copy of a housing DataFrame, leaving ``data`` untouched.

    Steps, in order:
      1. Drop the '3SsnPorch', 'ScreenPorch' and 'MiscVal' columns when present.
      2. Fill NaN in numeric columns with the mean of that column.
      3. If 'GarageCars' holds text, lower-case it and map the number words
         'zero' .. 'ten' to integers; any other value becomes NaN.
      4. Drop rows whose 'GarageCars' is NaN (skipped when the column is absent).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. It is not modified, so the cell calling this function can
        be re-run safely.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; it may contain fewer rows than ``data``.
    """
    # Columns to remove; drop() without inplace returns a new frame, so the
    # caller's object keeps its original columns and values.
    cols_to_drop = ['3SsnPorch', 'ScreenPorch', 'MiscVal']
    cleaned = data.drop(columns=cols_to_drop, errors='ignore')

    # Replace NaN with the mean of each numeric column. Text columns such as a
    # word-encoded 'GarageCars' are not affected by this step.
    cleaned = cleaned.fillna(cleaned.mean(numeric_only=True))

    # Lookup table turning number words into digits
    word_to_num = {
        'zero': 0,
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
        'six': 6,
        'seven': 7,
        'eight': 8,
        'nine': 9,
        'ten': 10
    }

    # Everything touching 'GarageCars' is guarded by the column check, so a
    # frame without that column no longer raises KeyError in dropna().
    if 'GarageCars' in cleaned.columns:
        garage_cars = cleaned['GarageCars']
        # Accept both the legacy object dtype and pandas' dedicated string
        # dtypes; numeric columns are left as they are.
        is_text = (
            pd.api.types.is_object_dtype(garage_cars)
            or pd.api.types.is_string_dtype(garage_cars)
        )
        if is_text:
            cleaned['GarageCars'] = garage_cars.str.lower().map(word_to_num)

        # Rows with a missing or unrecognised garage count are discarded
        cleaned = cleaned.dropna(subset=['GarageCars'])

    return cleaned


In [23]:
# Clean the training data; data_train_clean is the frame used for modelling below.
# NOTE(review): the saved output shows GarageYrBlt = -1.0 (id 1330), which mean
# imputation would not produce; presumably left over from an earlier run that
# filled NaN with -1. Re-run from a fresh kernel to confirm.
data_train_clean = dataClean(df_train)
data_train_clean.sample(10)

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,PoolArea,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
16,4,1,7.0,0,1969.0,1.0,299,379,36,0,0,3,2010.0,158000
724,3,1,6.0,0,2007.0,2.0,396,100,30,0,0,8,2007.0,181134
1039,3,1,5.0,1,1998.0,2.0,514,402,25,0,0,8,2007.0,215000
1330,3,1,7.0,0,-1.0,0.0,0,136,0,115,0,3,2007.0,127000
21,2,1,5.0,2,1968.0,2.0,576,0,0,240,0,7,2009.0,110000
87,4,2,8.0,0,1979.0,2.0,410,0,0,0,0,6,2009.0,118964
31,2,1,4.0,0,1956.0,1.0,252,261,0,156,0,6,2007.0,132000
534,3,1,6.0,0,1950.0,1.0,288,0,0,96,0,4,2008.0,119000
153,3,1,7.0,1,1998.0,3.0,642,0,35,272,0,2,2006.0,140000
953,6,2,10.0,0,1987.0,2.0,576,0,0,0,0,11,2008.0,142600


In [24]:
# To look for missing values (NaN):
#missing_rows = data_train_clean[data_train_clean['GarageCars'].isna()]

# To look for rows where GarageCars equals -1:
#missing_rows = data_train_clean[data_train_clean['GarageCars'] == -1]

#print(missing_rows)


### Model Building and Evaluation

In [25]:
# Import necessary libraries for model building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [26]:
# Predictor columns used by the model; the remaining columns of
# data_train_clean are not used as inputs.
features = [
    'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
    'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'MoSold', 'YrSold'
]

# X: feature matrix, y: target (sale price) taken from the cleaned training frame
X = data_train_clean[features]
y = data_train_clean['SalePrice']



In [27]:
# Hold out 20% of the cleaned training rows for validation; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# Initialize and train the Linear Regression model on the training split
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [29]:
# Evaluate the model

def pred_eval(model, X_t, y_t):
    """Predict on ``X_t``, print the mean absolute error against ``y_t``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_t : feature rows to predict on
    y_t : true target values aligned with ``X_t``

    Returns
    -------
    The predictions returned by ``model.predict(X_t)``.
    """
    predictions = model.predict(X_t)
    print("MAE:", mean_absolute_error(y_t, predictions))
    return predictions

In [30]:
# MAE on the training split, i.e. the rows the model was fitted on
y_pred = pred_eval(model, X_train, y_train)

MAE: 33923.72172371685


In [31]:
# MAE on the 20% hold-out split, which the model did not see during fitting
y_pred = pred_eval(model, X_test, y_test)

MAE: 33316.9201754125


### Generating Submission File

In [32]:
# Load the test CSV (comma-separated, the read_csv default) with 'id' as the row index
data_test = pd.read_csv("module3_exercise_test.csv", index_col='id')

In [33]:
# Clean the test set with the same function used for the training data.
# This rebinds X_test, which until now held the hold-out split created by
# train_test_split.
# NOTE(review): dataClean drops rows without a usable GarageCars value, so the
# submission may contain fewer ids than the test file; confirm this is acceptable.
X_test = dataClean(data_test)

In [34]:
# Full training set: same features and target as X and y above, with no rows held out
X_train_full= data_train_clean[features]
y_train_full = data_train_clean['SalePrice']

In [35]:
# Refit the same model object on every cleaned training row before predicting the test set
model.fit(X_train_full, y_train_full)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [36]:
# Predict SalePrice for the cleaned test rows, using only the model's feature columns
y_pred = model.predict(X_test[features])

In [37]:
# Build the submission table: one row per cleaned test id with its predicted price
submission = pd.DataFrame({
    'id': X_test.index,
    'SalePrice': y_pred # model predictions, in the same row order as X_test
})

# Write the file without the positional index, then preview the first rows
submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,id,SalePrice
0,892,276283.376871
1,1105,230907.910175
2,413,248570.396156
3,522,203697.846746
4,1036,154207.92339
