In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### Data Collection

In [2]:
import requests

# URLs of the files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_data_url, 'module3_exercise_train.csv')
download_file(test_data_url, 'module3_exercise_test.csv')

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv


In [3]:
df =  pd.read_csv("module3_exercise_train.csv", sep=",", index_col='id')
test =  pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')

### Comprendre les données

In [4]:
df.sample(10)

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
737,3,1,7.0,1,1964.0,two,440,239,42,0,0,0,0,0,7,2007.0,187000
1112,3,1,7.0,1,1916.0,one,225,0,0,330,0,0,0,0,7,2006.0,144000
232,3,1,4.0,0,1979.0,one,264,0,0,140,0,0,0,0,6,2007.0,100000
760,2,1,5.0,1,1978.0,two,588,168,180,0,0,0,0,0,6,2007.0,194000
1070,3,1,5.0,1,1950.0,one,352,0,0,248,0,0,0,0,7,2009.0,110000
1401,1,1,3.0,0,-1.0,zero,0,96,24,0,0,0,0,0,5,2010.0,75500
1348,2,1,6.0,0,2004.0,two,400,0,113,0,0,0,0,0,10,2009.0,185000
1017,4,1,8.0,1,1968.0,two,487,0,98,0,0,0,0,0,9,2006.0,170000
1343,3,1,10.0,1,2009.0,three,746,144,76,0,0,0,0,0,5,2010.0,378500
1354,4,1,8.0,0,1990.0,two,693,0,0,0,0,0,0,0,4,2006.0,250000


In [5]:
# Infos générales sur le data
print("Infos générales sur le data")
df.info()

Infos générales sur le data
<class 'pandas.core.frame.DataFrame'>
Index: 1168 entries, 254 to 1126
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   BedroomAbvGr   1168 non-null   int64  
 1   KitchenAbvGr   1168 non-null   int64  
 2   TotRmsAbvGrd   1167 non-null   float64
 3   Fireplaces     1168 non-null   int64  
 4   GarageYrBlt    1168 non-null   float64
 5   GarageCars     1167 non-null   object 
 6   GarageArea     1168 non-null   int64  
 7   WoodDeckSF     1168 non-null   int64  
 8   OpenPorchSF    1168 non-null   int64  
 9   EnclosedPorch  1168 non-null   int64  
 10  3SsnPorch      1168 non-null   int64  
 11  ScreenPorch    1168 non-null   int64  
 12  PoolArea       1168 non-null   int64  
 13  MiscVal        1168 non-null   int64  
 14  MoSold         1168 non-null   int64  
 15  YrSold         1167 non-null   float64
 16  SalePrice      1168 non-null   int64  
dtypes: float64(3), int64(13), o

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 292 entries, 892 to 722
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   BedroomAbvGr   292 non-null    int64  
 1   KitchenAbvGr   292 non-null    int64  
 2   TotRmsAbvGrd   292 non-null    int64  
 3   Fireplaces     291 non-null    float64
 4   GarageYrBlt    292 non-null    float64
 5   GarageCars     292 non-null    object 
 6   GarageArea     292 non-null    int64  
 7   WoodDeckSF     292 non-null    int64  
 8   OpenPorchSF    292 non-null    int64  
 9   EnclosedPorch  291 non-null    float64
 10  3SsnPorch      292 non-null    int64  
 11  ScreenPorch    292 non-null    int64  
 12  PoolArea       292 non-null    int64  
 13  MiscVal        292 non-null    int64  
 14  MoSold         292 non-null    int64  
 15  YrSold         292 non-null    int64  
dtypes: float64(3), int64(12), object(1)
memory usage: 38.8+ KB


In [7]:
df.describe(include=['object'])

Unnamed: 0,GarageCars
count,1167
unique,5
top,two
freq,662


In [8]:
print("Unique values in each column:")
print(f"{'GarageCars'}: {df['GarageCars'].nunique()} - {df['GarageCars'].unique()}")

Unique values in each column:
GarageCars: 5 - ['two' 'one' 'three' 'zero' nan 'four']


In [9]:
# sur tout le data
for col in df.columns:
    print(f"Colonne: {col}")
    print("Nombre d'éléments par type de données :")
    print(df[col].apply(type).value_counts())
    print("-" * 40)

Colonne: BedroomAbvGr
Nombre d'éléments par type de données :
BedroomAbvGr
<class 'int'>    1168
Name: count, dtype: int64
----------------------------------------
Colonne: KitchenAbvGr
Nombre d'éléments par type de données :
KitchenAbvGr
<class 'int'>    1168
Name: count, dtype: int64
----------------------------------------
Colonne: TotRmsAbvGrd
Nombre d'éléments par type de données :
TotRmsAbvGrd
<class 'float'>    1168
Name: count, dtype: int64
----------------------------------------
Colonne: Fireplaces
Nombre d'éléments par type de données :
Fireplaces
<class 'int'>    1168
Name: count, dtype: int64
----------------------------------------
Colonne: GarageYrBlt
Nombre d'éléments par type de données :
GarageYrBlt
<class 'float'>    1168
Name: count, dtype: int64
----------------------------------------
Colonne: GarageCars
Nombre d'éléments par type de données :
GarageCars
<class 'str'>      1167
<class 'float'>       1
Name: count, dtype: int64
-------------------------------------

In [10]:
# 1. Handle Inconsistencies
# 1. Handle Inconsistencies
def handle_inconsistencies(X_train, y_train, X_val=None):
    # Nettoyage des colonnes catégorielles
    X_tr = X_train.copy()
    X_tr['GarageCars'] = X_tr['GarageCars'].fillna(X_tr['GarageCars'].mode()[0])
    X_tr['GarageCars'] = X_tr['GarageCars'].astype(str).str.strip().str.lower()

    if X_val is not None:
        X_te = X_val.copy()
        X_te['GarageCars'] = X_te['GarageCars'].fillna(X_te['GarageCars'].mode()[0])
        X_te['GarageCars'] = X_te['GarageCars'].astype(str).str.strip().str.lower()
        return X_tr, y_train.copy(), X_te
    else:
        return X_tr, y_train.copy()
        

In [11]:
# nombre de copies uniquement
nb_doublons_copies = df.duplicated().sum()
print("Nombre de doublons (copies uniquement) :", nb_doublons_copies)

Nombre de doublons (copies uniquement) : 1


In [12]:
exact_duplicates = df[df.duplicated()]
print("Exact duplicates:")
print(exact_duplicates)

Exact duplicates:
      BedroomAbvGr  KitchenAbvGr  TotRmsAbvGrd  Fireplaces  GarageYrBlt  \
id                                                                        
1389             3             1           7.0           0       2004.0   

     GarageCars  GarageArea  WoodDeckSF  OpenPorchSF  EnclosedPorch  \
id                                                                    
1389        two         380           0           40              0   

      3SsnPorch  ScreenPorch  PoolArea  MiscVal  MoSold  YrSold  SalePrice  
id                                                                          
1389          0            0         0        0       4  2006.0     130000  


In [13]:
# 2. Handling Duplicates
def handle_duplicates(X_train, y_train, X_val=None):
    # Supprimer les doublons dans X_train
    X_train_no_duplicate = X_train.drop_duplicates()
    # Garder les mêmes indices dans y_train que dans X_train_no_duplicate
    y_train_no_duplicate = y_train.loc[X_train_no_duplicate.index]

    if X_val is not None:
        # Supprimer les doublons dans X_val
        X_val_no_duplicate = X_val.drop_duplicates()
        return X_train_no_duplicate, y_train_no_duplicate, X_val_no_duplicate
    else:
        return X_train_no_duplicate, y_train_no_duplicate


In [14]:
# Nombre de valeurs manquantes par colonne
# Summarize missing values per column
print("\nMissing values per column:")
missing_data = df.isnull().sum()
print(missing_data)


Missing values per column:
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     1
Fireplaces       0
GarageYrBlt      0
GarageCars       1
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           1
SalePrice        0
dtype: int64


In [15]:
# Nombre de lignes avec au moins une valeur manquante
print("Nombre de lignes avec au moins une valeur manquante")
df.isnull().any(axis=1).sum()

Nombre de lignes avec au moins une valeur manquante


np.int64(3)

In [16]:
def val_manquante(X_train, y_train, X_val=None):
    xt = X_train.copy()
    xt['GarageCars'] = xt['GarageCars'].fillna(xt['GarageCars'].mode()[0])
    xt['YrSold'] = xt['YrSold'].fillna(xt['YrSold'].mean())
    xt['TotRmsAbvGrd'] = xt['TotRmsAbvGrd'].fillna(xt['TotRmsAbvGrd'].mean())
    
    if X_val is not None:
        xv = X_val.copy()
        xv['GarageCars'] = xv['GarageCars'].fillna(xv['GarageCars'].mode()[0])
        xv['YrSold'] = xv['YrSold'].fillna(xv['YrSold'].mean())
        xv['TotRmsAbvGrd'] = xv['TotRmsAbvGrd'].fillna(xv['TotRmsAbvGrd'].mean())
        return xt, y_train.copy(), xv
    else:
        return xt, y_train.copy()

In [17]:
def format_cor(X_train, y_train, X_val=None):
    # Dictionnaire pour convertir les mots en chiffres
    word_to_num = {
        'zero': 0,
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
        'six': 6,
        'seven': 7,
        'eight': 8,
        'nine': 9,
        'ten': 10
    }

    def convert_column(df):
        df = df.copy()
        # Convertir en string, mettre en minuscules, puis mapper
        df['GarageCars'] = df['GarageCars'].apply(
            lambda x: word_to_num.get(str(x).strip().lower()) if pd.notnull(x) else x
        )
        return df

    X_train = convert_column(X_train)

    if X_val is not None:
        X_val = convert_column(X_val)
        return X_train, y_train.copy(), X_val
    else:
        return X_train, y_train.copy()



In [18]:
print(df['SalePrice'])

id
254     127500
1066    316600
638     258000
799     135000
380     167240
         ...  
1095    130000
1130    145000
1294    265000
860     155000
1126    239000
Name: SalePrice, Length: 1168, dtype: int64


# Data Preprocessing

In [19]:
y_train = df['SalePrice']
X_train = df.drop('SalePrice', axis=1)
X_val = test

In [20]:
X_train,y_train,X_val = handle_inconsistencies(X_train, y_train, X_val)
X_train,y_train,X_val = val_manquante(X_train, y_train, X_val)
X_train,y_train,X_val = format_cor(X_train, y_train, X_val)

# Model Building and Evaluation

In [21]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [23]:
pip install scikit-optimize

Note: you may need to restart the kernel to use updated packages.


In [24]:
# Import necessary libraries for model building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [25]:
# Preparing the data
#features = [
#    'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
 #   'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'MoSold', 'YrSold'
#]

#X = data_train_clean[features]
#y = data_train_clean['SalePrice']



In [26]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [27]:
# Initialize and train the Logistic Regression model
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [28]:
# Evaluate the model

def pred_eval(model, X_t, y_t):
    y_pred = model.predict(X_t)
    Mae = mean_absolute_error(y_t, y_pred)
    print("MAE:", Mae)
    return y_pred

In [29]:
# Predict and eval on the train data
y_pred = pred_eval(model, X_train, y_train)

MAE: 32390.918722459748


In [30]:
# Predict and eval on the test data
y_pred = pred_eval(model, X_test, y_test)

MAE: 32521.58344497896


### Generating Submission File

In [33]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [34]:
submission = pd.DataFrame({
    'id': X_test.index,
    'SalePrice': y_pred # your_prediction
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,id,SalePrice
0,1411,177506.895714
1,315,170291.607994
2,583,78054.292937
3,1342,312205.920195
4,891,226668.748866
