In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Exploring

In [28]:
df_dvf = pd.read_csv('../raw_data/dvf.csv')

df_dvf.shape

  df_dvf = pd.read_csv('../raw_data/dvf.csv')


(271360, 41)

In [29]:
df_dvf['nature_mutation'].unique()

array(["Vente en l'état futur d'achèvement", 'Vente', 'Expropriation',
       'Echange', 'Vente terrain à bâtir', 'Adjudication',
       'nature_mutation'], dtype=object)

In [30]:
df_dvf['nature_mutation'].value_counts()

nature_mutation
Vente                                 188594
Vente en l'état futur d'achèvement     78211
Adjudication                            2066
Expropriation                           1556
Echange                                  739
Vente terrain à bâtir                    189
nature_mutation                            5
Name: count, dtype: int64

In [31]:
df_dvf.dtypes

id_mutation                     object
date_mutation                   object
numero_disposition              object
nature_mutation                 object
valeur_fonciere                 object
adresse_numero                  object
adresse_suffixe                 object
adresse_nom_voie                object
adresse_code_voie               object
code_postal                     object
code_commune                    object
nom_commune                     object
code_departement                object
ancien_code_commune             object
ancien_nom_commune              object
id_parcelle                     object
ancien_id_parcelle              object
numero_volume                   object
lot1_numero                     object
lot1_surface_carrez             object
lot2_numero                     object
lot2_surface_carrez             object
lot3_numero                     object
lot3_surface_carrez             object
lot4_numero                     object
lot4_surface_carrez      

In [32]:
df_dvf.columns

Index(['id_mutation', 'date_mutation', 'numero_disposition', 'nature_mutation',
       'valeur_fonciere', 'adresse_numero', 'adresse_suffixe',
       'adresse_nom_voie', 'adresse_code_voie', 'code_postal', 'code_commune',
       'nom_commune', 'code_departement', 'ancien_code_commune',
       'ancien_nom_commune', 'id_parcelle', 'ancien_id_parcelle',
       'numero_volume', 'lot1_numero', 'lot1_surface_carrez', 'lot2_numero',
       'lot2_surface_carrez', 'lot3_numero', 'lot3_surface_carrez',
       'lot4_numero', 'lot4_surface_carrez', 'lot5_numero',
       'lot5_surface_carrez', 'nombre_lots', 'code_type_local', 'type_local',
       'surface_reelle_bati', 'nombre_pieces_principales',
       'code_nature_culture', 'nature_culture', 'code_nature_culture_speciale',
       'nature_culture_speciale', 'surface_terrain', 'longitude', 'latitude',
       'section_prefixe'],
      dtype='object')

In [33]:
df_dvf.isnull().sum()

id_mutation                          0
date_mutation                        0
numero_disposition                   0
nature_mutation                      0
valeur_fonciere                   2591
adresse_numero                   29402
adresse_suffixe                 259219
adresse_nom_voie                 10195
adresse_code_voie                10195
code_postal                      10195
code_commune                         0
nom_commune                          0
code_departement                     0
ancien_code_commune             270288
ancien_nom_commune              270288
id_parcelle                          0
ancien_id_parcelle              271355
numero_volume                   269411
lot1_numero                      68751
lot1_surface_carrez             226795
lot2_numero                     224988
lot2_surface_carrez             259418
lot3_numero                     265455
lot3_surface_carrez             270341
lot4_numero                     269360
lot4_surface_carrez      

In [34]:
df_dvf[['code_postal', 'code_commune', 'nom_commune']].isnull().sum()/len(df_dvf)

code_postal     0.03757
code_commune    0.00000
nom_commune     0.00000
dtype: float64

In [35]:
df_dvf['nombre_lots'].astype('str').value_counts()

nombre_lots
1              156237
0               68751
2               40467
3                3905
4                1109
5                 432
6                 185
7                 114
9                  40
8                  38
10                 18
12                  8
11                  6
13                  6
15                  5
nombre_lots         5
28                  4
27                  4
20                  3
19                  3
45                  3
16                  2
34                  2
31                  2
22                  2
14                  2
198                 1
120                 1
95                  1
18                  1
21                  1
61                  1
17                  1
Name: count, dtype: int64

In [36]:
df_dvf['type_local'].unique()

array([nan, 'Appartement', 'Dépendance',
       'Local industriel. commercial ou assimilé', 'Maison', 'type_local'],
      dtype=object)

# 🔅 Preprocessings

## 🧹 Keep only useful columns and translate

### Removing columns

In [37]:
keep_col =['date_mutation',
            'nature_mutation',
            'valeur_fonciere',
            'code_postal',
            'code_commune',
            'code_departement',
            'nombre_lots',
            'type_local',
            'surface_reelle_bati',
            'nombre_pieces_principales',
            'surface_terrain',
            'longitude',
            'latitude']
df_dvf = df_dvf[keep_col]

df_dvf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 13 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   date_mutation              271360 non-null  object
 1   nature_mutation            271360 non-null  object
 2   valeur_fonciere            268769 non-null  object
 3   code_postal                261165 non-null  object
 4   code_commune               271360 non-null  object
 5   code_departement           271360 non-null  object
 6   nombre_lots                271360 non-null  object
 7   type_local                 208331 non-null  object
 8   surface_reelle_bati        121084 non-null  object
 9   nombre_pieces_principales  208146 non-null  object
 10  surface_terrain            67421 non-null   object
 11  longitude                  267372 non-null  object
 12  latitude                   267372 non-null  object
dtypes: object(13)
memory usage: 26.9+ MB


### Translate column-names

In [38]:
df_dvf.columns = ['date', 'built', 'price', 'postal code',
                'city', 'region', 'number of units', 'property type',
                'built area', 'number of rooms', 'land area',
                'longitude', 'latitude']

### Remove non-representative rows

In [39]:
df_dvf.shape

(271360, 13)

In [40]:
df_dvf['built'].unique()

array(["Vente en l'état futur d'achèvement", 'Vente', 'Expropriation',
       'Echange', 'Vente terrain à bâtir', 'Adjudication',
       'nature_mutation'], dtype=object)

We want to keep only :
- Houses and appartments
- Normal sales and off-plan sales
- Sales with only one unit

In [41]:
df_useful = df_dvf[((df_dvf['built'] == "Vente") | (df_dvf['built'] == "Vente en l'état futur d'achèvement")) &
                    df_dvf['number of units'] == (1 or '1')]
df_useful.shape

(219246, 13)

In [42]:
df_dvf['property type'].unique()

array([nan, 'Appartement', 'Dépendance',
       'Local industriel. commercial ou assimilé', 'Maison', 'type_local'],
      dtype=object)

In [None]:
### Removing rows and columns###
df_dvf = df_dvf.dropna(subset=['valeur_fonciere', 'surface_reelle_bati', 'longitude'])
del_col =['adresse_suffixe', 'code_commune', 'nom_commune', 'code_departement',
          'ancien_code_commune', 'ancien_nom_commune', 'ancien_id_parcelle', 'numero_volume',
          'lot1_numero', 'lot1_surface_carrez', 'lot2_numero', 'lot2_surface_carrez', 'lot3_numero',
          'lot3_surface_carrez', 'lot4_numero', 'lot4_surface_carrez', 'lot5_numero', 'lot5_surface_carrez',
          'code_nature_culture', 'nature_culture', 'code_nature_culture_speciale', 'nature_culture_speciale',
          'section_prefixe']
df_dvf = df_dvf.drop(columns = del_col)

df_dvf.info()

In [None]:
### Formating data ####
df_dvf[['valeur_fonciere', 'surface_reelle_bati', 'nombre_pieces_principales', 'surface_terrain', 'longitude', 'latitude', 'code_postal', 'nombre_lots']] = df_dvf[['valeur_fonciere', 'surface_reelle_bati', 'nombre_pieces_principales', 'surface_terrain', 'longitude', 'latitude', 'code_postal', 'nombre_lots']].apply(lambda x: pd.to_numeric(x, errors='coerce').astype('float64'))

df_dvf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55499 entries, 17 to 107198
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id_mutation                55499 non-null  object 
 1   date_mutation              55499 non-null  object 
 2   numero_disposition         55499 non-null  object 
 3   nature_mutation            55499 non-null  object 
 4   valeur_fonciere            55497 non-null  float64
 5   adresse_numero             55149 non-null  object 
 6   adresse_nom_voie           55499 non-null  object 
 7   adresse_code_voie          55499 non-null  object 
 8   code_postal                55497 non-null  float64
 9   id_parcelle                55499 non-null  object 
 10  nombre_lots                55497 non-null  float64
 11  code_type_local            55499 non-null  object 
 12  type_local                 55499 non-null  object 
 13  surface_reelle_bati        55497 non-null  f

In [None]:
df_dvf['prix_m_2'] = df_dvf['valeur_fonciere']/ df_dvf['surface_reelle_bati']



df_dvf.head(5)

Unnamed: 0,id_mutation,date_mutation,numero_disposition,nature_mutation,valeur_fonciere,adresse_numero,adresse_nom_voie,adresse_code_voie,code_postal,id_parcelle,nombre_lots,code_type_local,type_local,surface_reelle_bati,nombre_pieces_principales,surface_terrain,longitude,latitude,prix_m_2
17,2022-60863,2022-06-21,1,Vente,630000.0,209.0,RTE DE BELLET,745,6200.0,06088000MV0142,1.0,1.0,Maison,123.0,5.0,,7.222367,43.703745,5121.95122
61,2021-53129,2021-01-07,1,Vente,175000.0,119.0,BD GAMBETTA,2755,6000.0,06088000MH0122,2.0,2.0,Appartement,44.0,2.0,,7.255831,43.707262,3977.272727
62,2021-53130,2021-01-05,1,Vente,143600.0,166.0,BD DE LA MADELEINE,3820,6000.0,06088000ML0059,1.0,2.0,Appartement,50.0,2.0,,7.236937,43.701856,2872.0
63,2021-53132,2021-01-06,1,Vente,373500.0,144.0,RTE ST PIERRE FERIC,5910,6000.0,06088000LY0389,1.0,2.0,Appartement,66.0,3.0,,7.240552,43.714549,5659.090909
64,2021-53133,2021-01-06,1,Vente,80450.0,5.0,RUE ABBE SALVETTI,27,6300.0,06088000IY0079,1.0,2.0,Appartement,20.0,1.0,,7.292555,43.703503,4022.5


In [None]:
df_dvf['surface_terrain'].fillna(value=0, inplace=True)

df_dvf.head(5)

Unnamed: 0,id_mutation,date_mutation,numero_disposition,nature_mutation,valeur_fonciere,adresse_numero,adresse_nom_voie,adresse_code_voie,code_postal,id_parcelle,nombre_lots,code_type_local,type_local,surface_reelle_bati,nombre_pieces_principales,surface_terrain,longitude,latitude,prix_m_2
17,2022-60863,2022-06-21,1,Vente,630000.0,209.0,RTE DE BELLET,745,6200.0,06088000MV0142,1.0,1.0,Maison,123.0,5.0,0.0,7.222367,43.703745,5121.95122
61,2021-53129,2021-01-07,1,Vente,175000.0,119.0,BD GAMBETTA,2755,6000.0,06088000MH0122,2.0,2.0,Appartement,44.0,2.0,0.0,7.255831,43.707262,3977.272727
62,2021-53130,2021-01-05,1,Vente,143600.0,166.0,BD DE LA MADELEINE,3820,6000.0,06088000ML0059,1.0,2.0,Appartement,50.0,2.0,0.0,7.236937,43.701856,2872.0
63,2021-53132,2021-01-06,1,Vente,373500.0,144.0,RTE ST PIERRE FERIC,5910,6000.0,06088000LY0389,1.0,2.0,Appartement,66.0,3.0,0.0,7.240552,43.714549,5659.090909
64,2021-53133,2021-01-06,1,Vente,80450.0,5.0,RUE ABBE SALVETTI,27,6300.0,06088000IY0079,1.0,2.0,Appartement,20.0,1.0,0.0,7.292555,43.703503,4022.5


In [None]:
df_dvf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55499 entries, 17 to 107198
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id_mutation                55499 non-null  object 
 1   date_mutation              55499 non-null  object 
 2   numero_disposition         55499 non-null  object 
 3   nature_mutation            55499 non-null  object 
 4   valeur_fonciere            55497 non-null  float64
 5   adresse_numero             55149 non-null  object 
 6   adresse_nom_voie           55499 non-null  object 
 7   adresse_code_voie          55499 non-null  object 
 8   code_postal                55497 non-null  float64
 9   id_parcelle                55499 non-null  object 
 10  nombre_lots                55497 non-null  float64
 11  code_type_local            55499 non-null  object 
 12  type_local                 55499 non-null  object 
 13  surface_reelle_bati        55497 non-null  f

In [None]:
df_dvf.head(5)

Unnamed: 0,id_mutation,date_mutation,numero_disposition,nature_mutation,valeur_fonciere,adresse_numero,adresse_nom_voie,adresse_code_voie,code_postal,id_parcelle,nombre_lots,code_type_local,type_local,surface_reelle_bati,nombre_pieces_principales,surface_terrain,longitude,latitude,prix_m_2
17,2022-60863,2022-06-21,1,Vente,630000.0,209.0,RTE DE BELLET,745,6200.0,06088000MV0142,1.0,1.0,Maison,123.0,5.0,0.0,7.222367,43.703745,5121.95122
61,2021-53129,2021-01-07,1,Vente,175000.0,119.0,BD GAMBETTA,2755,6000.0,06088000MH0122,2.0,2.0,Appartement,44.0,2.0,0.0,7.255831,43.707262,3977.272727
62,2021-53130,2021-01-05,1,Vente,143600.0,166.0,BD DE LA MADELEINE,3820,6000.0,06088000ML0059,1.0,2.0,Appartement,50.0,2.0,0.0,7.236937,43.701856,2872.0
63,2021-53132,2021-01-06,1,Vente,373500.0,144.0,RTE ST PIERRE FERIC,5910,6000.0,06088000LY0389,1.0,2.0,Appartement,66.0,3.0,0.0,7.240552,43.714549,5659.090909
64,2021-53133,2021-01-06,1,Vente,80450.0,5.0,RUE ABBE SALVETTI,27,6300.0,06088000IY0079,1.0,2.0,Appartement,20.0,1.0,0.0,7.292555,43.703503,4022.5


In [None]:
import project

  df_dvf = pd.read_csv('dvf.csv')


In [None]:
df_dvf

Unnamed: 0,id_mutation,date_mutation,numero_disposition,nature_mutation,valeur_fonciere,adresse_numero,adresse_nom_voie,adresse_code_voie,code_postal,id_parcelle,nombre_lots,code_type_local,type_local,surface_reelle_bati,nombre_pieces_principales,surface_terrain,longitude,latitude,prix_m_2
17,2022-60863,2022-06-21,1,Vente,630000.0,209.0,RTE DE BELLET,0745,6200.0,06088000MV0142,1.0,1.0,Maison,123.0,5.0,0.0,7.222367,43.703745,5121.951220
61,2021-53129,2021-01-07,1,Vente,175000.0,119.0,BD GAMBETTA,2755,6000.0,06088000MH0122,2.0,2.0,Appartement,44.0,2.0,0.0,7.255831,43.707262,3977.272727
62,2021-53130,2021-01-05,1,Vente,143600.0,166.0,BD DE LA MADELEINE,3820,6000.0,06088000ML0059,1.0,2.0,Appartement,50.0,2.0,0.0,7.236937,43.701856,2872.000000
63,2021-53132,2021-01-06,1,Vente,373500.0,144.0,RTE ST PIERRE FERIC,5910,6000.0,06088000LY0389,1.0,2.0,Appartement,66.0,3.0,0.0,7.240552,43.714549,5659.090909
64,2021-53133,2021-01-06,1,Vente,80450.0,5.0,RUE ABBE SALVETTI,0027,6300.0,06088000IY0079,1.0,2.0,Appartement,20.0,1.0,0.0,7.292555,43.703503,4022.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107189,2020-55959,2020-11-04,1,Vente,280000.0,83,BD MANTEGA RIGHI,3943,6100.0,06088000LW0407,2.0,2,Appartement,66.0,3.0,0.0,7.248990,43.712026,4242.424242
107193,2020-55961,2020-10-29,1,Vente,238000.0,23,AV LOUIS CAPPATTI,3734,6200.0,06088000OI0362,1.0,2,Appartement,48.0,2.0,0.0,7.207005,43.683574,4958.333333
107195,2020-55963,2020-10-26,1,Vente,500000.0,276,AV SAINTE MARGUERITE,6035,6200.0,06088000OL0212,1.0,1,Maison,61.0,4.0,0.0,7.202353,43.689002,8196.721311
107197,2020-55965,2020-10-13,1,Vente,140000.0,24,AV SAINT SYLVESTRE,5960,6100.0,06088000EH0271,1.0,2,Appartement,33.0,1.0,0.0,7.249229,43.725592,4242.424242


# 🐝 Model

In [44]:
%load_ext autoreload
%autoreload 2

In [45]:
# Import modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Data

In [46]:
#Data is called df_dvf, to be renamed into data
data = df_dvf

X = data.drop(columns=['price'])
y = data['price']

In [43]:
col_names = ['date', 'built', 'price', 'postal code',
                'city', 'region', 'number of units', 'property type',
                'built area', 'number of rooms', 'land area',
                'longitude', 'latitude']

col_names = ['date', 'built', 'price', 'postal_code',
                'city', 'region', 'number_of_units', 'property_type',
                'living_area', 'number_of_rooms',
                'longitude', 'latitude']

## Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X =
y =
X_scaled = scaler.fit_transform(X)

# Convert X_scaled (np array) back to a DataFrame with column names
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Concatenate into one dataframe with review_score
scaled_orders = pd.concat([X_scaled, y], axis=1)


## Correlation

In [None]:
plt.figure(figsize = (10,10))

sns.heatmap(
    data.corr(),
    cmap = 'coolwarm',
    annot = True,
    annot_kws = {"size": 10}
)

## 🐧 Linear Regression

### Univariate Regression

In [None]:
import statsmodels.formula.api as smf

model_uni = smf.ols(formula='price ~ land_area', data=data).fit()
model_uni.rsquared

model_uni.summary()

### Multivariate Regression

In [None]:
import statsmodels.formula.api as smf

model_mul = smf.ols(formula='price ~ land_area + number_of_units', data=data).fit()
model_mul.rsquared

model_mul.summary()

#### Important feature identification

In [None]:
model_mul.params.drop('Intercept')

coefficients = model_mul.params
coefficients = coefficients.drop('Intercept')
plt.bar(coefficients.index, coefficients)
plt.title('Important features correlating with review_score, p = 0.04')

#### Model Performance

In [None]:
#Residual computation (should be close to 0)
n = len(X_std)
predicted_values = model_mul.predict()
residuals = X_std['price'] - predicted_values
residuals.std() / X_std.wait_time.std() / (n-2)**0.5

#Root Mean Squared Error
rmse = np.sqrt(np.mean(residuals**2))
rmse

#Plot residuals
sns.histplot(residuals, kde=True, edgecolor='w')
plt.title('Residuals')

### Linear Regression with ML

In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, cross_val_score

## 🐓 KNN Regression

## Decision Tree

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

X = df.drop(columns=['price'])
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()

model.fit(X_train, y_train)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print('Train score:', model.score(X_test, y_test))
print('Cross val score mean', cross_val_score(model, X_train, y_train, cv = 5).mean())

## 🐬 Ensemble Model: (XGBoost recomm)

In [None]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

model.fit(X_train, y_train)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print( 'Train score:', model.score(X_test, y_test))
print('Cross val score mean', cross_val_score(model, X_train, y_train, cv = 5).mean())

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()

model.fit(X_train, y_train)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print( 'Train score:', model.score(X_test, y_test))
print('Cross val score mean', cross_val_score(model, X_train, y_train, cv = 5).mean())

## 🐇 Optional: Deep Learning