In [32]:
# importation des bibliotheques necessaires

import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score


In [33]:
df = pd.read_csv('data/donneesMaisons.csv')
df.head()

Unnamed: 0,Quartier,Surface,Chambres,AnneeConstruction,DistanceCentreVille,QualiteQuartier,Adresse,Prix
0,Elhouda,174.91,4,2001,3045.02,2,"20 Rue C, Béni Mellal",245023.0
1,Taqaddom,290.14,4,1991,4260.18,4,"44 Rue D, Fes",321516.0
2,Almassira1,246.4,4,1998,581.9,5,"69 Rue B, Rabat",129535.0
3,Elkasba,219.73,3,2011,4109.8,2,"78 Rue D, Agadir",309495.0
4,Elkasba,131.2,4,1994,4761.03,8,"77 Rue A, Tanger",325335.0


# EDA : Exploratory Data Analysis

In [34]:
df.shape

(5000, 8)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Quartier             4990 non-null   object 
 1   Surface              4991 non-null   float64
 2   Chambres             5000 non-null   int64  
 3   AnneeConstruction    5000 non-null   int64  
 4   DistanceCentreVille  4991 non-null   float64
 5   QualiteQuartier      5000 non-null   int64  
 6   Adresse              5000 non-null   object 
 7   Prix                 4500 non-null   float64
dtypes: float64(3), int64(3), object(2)
memory usage: 312.6+ KB


In [36]:
# suppression des colonnes non necessaires
df = df.drop(['Quartier', 'Adresse'], axis=1)
df['Prix'] = df['Prix'] + 10**5
df.head()

Unnamed: 0,Surface,Chambres,AnneeConstruction,DistanceCentreVille,QualiteQuartier,Prix
0,174.91,4,2001,3045.02,2,345023.0
1,290.14,4,1991,4260.18,4,421516.0
2,246.4,4,1998,581.9,5,229535.0
3,219.73,3,2011,4109.8,2,409495.0
4,131.2,4,1994,4761.03,8,425335.0


In [37]:
# Gestions des valeurs manquantes
val_manq = df.isnull().sum()
val_manq

Surface                  9
Chambres                 0
AnneeConstruction        0
DistanceCentreVille      9
QualiteQuartier          0
Prix                   500
dtype: int64

In [38]:
# Remplissage des valeurs manquantes
df = df.fillna(df.mean())

In [39]:
# division du dataset en features et target
X = df.drop('Prix', axis=1)
y = df['Prix']

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) 
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(4000, 5) (1000, 5) (4000,) (1000,)


In [41]:
# sauvegarde des données
df.to_csv('data/donneesMaisonsNettoye.csv', index=False)

In [42]:
# creation du modele

model = LinearRegression()
model.fit(X_train, y_train)


In [43]:
# calcul de la precision du modele
accuracy = model.score(X_test, y_test)
print(accuracy)

0.888299834586303


In [44]:
# prediction
y_pred = model.predict(X_test)
print(y_pred)

# comparaison des predictions avec les valeurs reelles
comparaison = pd.DataFrame({'Prediction': y_pred, 'Reel': y_test})
comparaison.head()

[246723.13341207 324973.71438616 374410.41288461 408015.00688456
 395165.93660213 316463.87677186 263093.9734551  304948.8376181
 316755.84919174 412920.39608038 367587.06447402 292084.74859482
 414547.10798792 289116.31624209 429522.56981343 387646.44847741
 270207.62819874 250746.37357994 247221.55170606 248240.09933971
 296903.11169777 418478.68272207 423997.16334012 297771.59232239
 398392.92784918 275531.99071942 395149.49212382 263427.62008944
 321327.21175474 366802.14055518 324304.3873382  379490.3197823
 308441.38400126 291561.63604344 293321.87769481 364392.44851467
 391242.47915018 299475.16538107 359994.93134719 282585.23507164
 322567.33514149 274542.96096817 376167.83656394 240953.46308706
 298160.51199808 262109.79074866 430930.29512864 395574.61463673
 247523.69121437 322386.78057242 368017.0227737  258373.30583585
 391366.29744309 330263.43477591 303367.55312003 321944.28453916
 289752.89390929 361230.68892034 408755.08709053 306906.40407293
 273131.77548201 268621.259

Unnamed: 0,Prediction,Reel
398,246723.133412,336256.672667
3833,324973.714386,326244.0
4836,374410.412885,373113.0
4572,408015.006885,414307.0
636,395165.936602,399886.0


In [45]:
# sauvegarde du modele
import joblib
joblib.dump(model, 'models/model.pkl')

['models/model.pkl']