# ⬇️ Imports

In [2]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import pandas as pd
import numpy as np
# `plt` must be the pyplot interface: later cells call plt.figure(...) and
# plt.show(), which live in matplotlib.pyplot, not in the top-level package.
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns


import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')


In [None]:
# Get the data from the csv file
df_cleaned = pd.read_csv('../raw_data/dvf_full_cleaned.csv')

df_cleaned.shape

# ✅ Downloading the data

Department ("region") exceptions:
- 20 does not exist -> replaced by 2A and 2B
- 57, 67, 68 -> no data
- 69 -> 69D and 69M

In [None]:
## List every department identifier we can download.
# Department numbers that must not be requested (20 does not exist as such,
# 57 / 67 / 68 have no data). Excluding by value avoids the error-prone
# "position = number - 1" arithmetic.
MISSING_DEPARTMENTS = {20, 57, 67, 68}

# Plain Python ints (not NumPy scalars) for 1..95, followed by the two string
# identifiers that stand in for department 20.
regions = [dep for dep in range(1, 96) if dep not in MISSING_DEPARTMENTS]
regions = regions + ['2A', '2B']
print(regions)

In [None]:
# Download one CSV per department, skipping files that are already on disk.

for num in regions:
    # Single-digit departments are requested with a leading zero ("01".."09").
    # Padding the value itself, rather than relying on its position in
    # `regions`, keeps the URL correct even if the list is reordered or sliced.
    dep_code = str(num).zfill(2)
    url = f'https://dvf-api.data.gouv.fr/dvf/csv/?dep={dep_code}'

    # The local file name keeps the unpadded identifier (dvf_1.csv, dvf_2A.csv).
    path = Path(f'../raw_data/dvf_{num}.csv')

    if not path.is_file():
        df = pd.read_csv(url)
        df.to_csv(path)
        print(f'Downloaded region n° {num} 💪')

    else:
        print(f'region n° {num} is already downloaded ! 🚀')


In [None]:
# Read the selected regional files, then concatenate them in a single call.
# Growing the frame with pd.concat inside the loop re-copies all previous rows
# on every iteration, and seeding it with an empty DataFrame relies on
# behaviour pandas has deprecated.
frames = []
for num in regions[1:3]:
    frames.append(pd.read_csv(f'../raw_data/dvf_{num}.csv'))
    print(f'Concatenated region n°{num} to the df 🌍')
df = pd.concat(frames)

In [None]:
# # concat all csv files in 1 big dataframe --> too big for the kernel -> VertexAI
# df = pd.DataFrame()
# for num in regions:
#     df = pd.concat([df, pd.read_csv(f'raw_data/dvf_{num}.csv')])
#     print(f'Concatenated region n°{num} to the df 🌍')

> To concatenate everything without killing the kernel → use Vertex AI

In [None]:
# df.shape
# # (19575540, 42)

In [None]:
# ### save full df raw as 1 csv
# df.to_csv(f'raw_data/dvf_full_raw.csv')

In [None]:
# ## to clean all the data

# from data import clean_data
# df_cleaned = clean_data(df)
# df_cleaned.shape #(1029674, 11)

In [None]:
# df_cleaned.to_csv(f'raw_data/dvf_full_cleaned.csv')

In [None]:
# Get the data from the csv file
dvf_full_raw = pd.read_csv('../raw_data/dvf_full_raw.csv')

dvf_full_raw.shape

In [None]:
# Get the data from the csv file
df_cleaned = pd.read_csv('../raw_data/dvf_full_cleaned.csv')

df_cleaned.shape

In [None]:
df_cleaned.head()

# 🔎 Exploring cleaned data

In [None]:
df_cleaned.shape

In [None]:
# Outlier filter: keep sales strictly under 1 million (price) and strictly
# under 500 square metres (living area).
mask_price = df_cleaned['price'].lt(1000000)
mask_area = df_cleaned['living_area'].lt(500)
df_without_outliers = df_cleaned.loc[mask_price & mask_area]
df_without_outliers.shape

In [None]:
# Map of the sales: one dot per transaction, coloured by price.
# The plotted variables are passed by keyword (seaborn >= 0.12 no longer
# accepts x / y positionally). `size` is seaborn's data-driven size mapping,
# so the small fixed marker area is requested through matplotlib's `s`.
sns.scatterplot(data=df_without_outliers,
                x='longitude',
                y='latitude',
                hue='price',
                alpha=0.01,
                s=1)

In [None]:
df_dvf = pd.read_csv('../raw_data/dvf_93.csv')

df_dvf.shape

# 🔎 Exploring 1 region

In [None]:
df_dvf.duplicated().sum()

In [None]:
df_dvf['nature_mutation'].unique()

In [None]:
df_dvf['nature_mutation'].value_counts()

In [None]:
df_dvf.dtypes

In [None]:
df_dvf.columns

In [None]:
df_dvf.isnull().sum()

In [None]:
df_dvf[['code_postal', 'code_commune', 'nom_commune']].isnull().sum()/len(df_dvf)

In [None]:
df_dvf['nombre_lots'].astype('str').value_counts()

In [None]:
df_dvf['type_local'].unique()

# 🔅 Preprocessing

## 🧹 Keep only useful columns and rows and translate

### Removing columns

In [None]:
# Columns retained for the analysis; every other column of the raw file is
# discarded by the selection below.
keep_col = [
    'date_mutation', 'nature_mutation', 'valeur_fonciere',
    'code_postal', 'code_commune', 'code_departement',
    'nombre_lots', 'type_local', 'surface_reelle_bati',
    'nombre_pieces_principales', 'longitude', 'latitude',
]
df_dvf = df_dvf.loc[:, keep_col]

df_dvf.info()

### Translate column-names

In [None]:
# Explicit French -> English mapping. Renaming by name (instead of assigning a
# positional list to `df_dvf.columns`) cannot mislabel a column if the
# selection order changes, and is a no-op when the cell is run again.
# `longitude` and `latitude` keep their names.
column_translation = {
    'date_mutation': 'date',
    'nature_mutation': 'built',
    'valeur_fonciere': 'price',
    'code_postal': 'postal_code',
    'code_commune': 'city',
    'code_departement': 'region',
    'nombre_lots': 'number_of_units',
    'type_local': 'property_type',
    'surface_reelle_bati': 'living_area',
    'nombre_pieces_principales': 'number_of_rooms',
}
df_dvf = df_dvf.rename(columns=column_translation)

### Remove non-representative rows

In [None]:
df_dvf.shape

In [None]:
df_dvf['built'].unique()

In [None]:
df_dvf.shape

In [None]:
df_dvf['built'].unique()

We want to keep only:
- Houses and apartments
- Normal sales and off-plan sales
- Sales with only one unit

In [None]:
df_dvf['built'].unique()

In [None]:
# Show how many rows each selection criterion keeps on its own.
sale_rows = df_dvf['built'].isin(["Vente", "Vente en l'état futur d'achèvement"])
# The unit count is compared against both the number 1 and the text '1'.
single_unit_rows = df_dvf['number_of_units'].eq(1) | df_dvf['number_of_units'].eq('1')
home_rows = df_dvf['property_type'].isin(['Appartement', 'Maison'])

print('Shape before the masks : ', df_dvf.shape)
print('Shape only normal sales', df_dvf[sale_rows].shape)
print('Shape only 1 unit', df_dvf[single_unit_rows].shape)
print('Shape only house and appartments', df_dvf[home_rows].shape)

In [None]:
# Keep only the rows that satisfy all three criteria at once.
is_sale = df_dvf['built'].isin(["Vente", "Vente en l'état futur d'achèvement"])
# The unit count is compared against both the number 1 and the text '1'.
is_single_unit = (df_dvf['number_of_units'] == 1) | (df_dvf['number_of_units'] == '1')
is_home = df_dvf['property_type'].isin(['Appartement', 'Maison'])

# .copy() detaches the result from df_dvf, so the column assignments made on
# df_useful in later cells write to an independent frame rather than to a
# filtered slice.
df_useful = df_dvf[is_sale & is_single_unit & is_home].copy()
df_useful.shape

In [None]:
## checking that the masks worked correctly
print(df_useful['built'].unique())
print(df_useful['number_of_units'].unique())
print(df_useful['property_type'].unique())


### Translating the values

In [None]:
# French -> English value maps for the two categorical columns.
trans_dict_built = {
    'Vente': 'built',
    "Vente en l'état futur d'achèvement": 'off-plan',
}
trans_dict_type = {
    'Appartement': 'appartment',
    'Maison': 'house',
}

# Nested mapping: outer key = column name, inner dict = old value -> new value.
column_value_maps = {'built': trans_dict_built, 'property_type': trans_dict_type}
df_useful = df_useful.replace(to_replace=column_value_maps)

In [None]:
## checking that the translation worked correctly
print(df_useful['built'].unique())
print(df_useful['property_type'].unique())

In [None]:
trans_dict_built = {'Vente' : 'built',
                    "Vente en l'état futur d'achèvement" : 'off-plan'}
trans_dict_type = {'Appartement' : 'appartment',
                   'Maison' : 'house'}

# .assign() returns a new frame instead of writing columns into df_useful in
# place, which avoids assigning onto a frame that was produced by boolean
# filtering. Values that are not keys of the dictionaries are left unchanged.
df_useful = df_useful.assign(
    built=df_useful['built'].replace(trans_dict_built),
    property_type=df_useful['property_type'].replace(trans_dict_type),
)

In [None]:
# checking we didn't lose rows
df_useful.shape

## 🚮 Dropping duplicates and NaNs

In [None]:
df_useful.columns

In [None]:
## dropping the column with number of units (only 1s)
df_useful = df_useful.drop(columns='number_of_units')

In [None]:
df_useful.columns

In [None]:
## Check for NaN values, then drop incomplete and duplicated rows.
# Only the last expression of a cell is displayed, so the intermediate
# diagnostics are printed explicitly.
print(df_useful.isna().sum())
df_useful = df_useful.dropna()
print('Shape after dropping NaNs :', df_useful.shape)
df_useful = df_useful.drop_duplicates()
df_useful.shape

## 🦖 Changing everything to the right type

In [None]:
# Target dtype of each column group.
col_float = ['price', 'longitude', 'latitude', 'postal_code', 'living_area', 'number_of_rooms']
# NOTE(review): col_string is declared but never applied below — confirm
# whether these columns were meant to be cast as well.
col_string = ['built','city', 'region','property_type']
col_date = ['date']

### Formatting data types ###
# Values that cannot be parsed become NaN / NaT (errors='coerce').
for col in col_float:
    df_useful[col] = pd.to_numeric(df_useful[col], errors='coerce').astype('float64')
for col in col_date:
    df_useful[col] = pd.to_datetime(df_useful[col], errors='coerce')

df_useful.info()

In [None]:
df_useful.shape

In [None]:
df_useful.head()

In [None]:
# (A folium map was tried here and left disabled.)
# import folium
# m = folium.Map(location = [-33.8, 151.2], tiles ='OpenStreetMap', zoom_start=11)
import seaborn as sns

# Quick sanity check of the coordinates: longitude on x, latitude on y.
sns.scatterplot(data=df_useful, x='longitude', y='latitude')

## ✨ data.py : clean_data + download_csv

# 🤓 Feature engineering

# 🐧 Modelling

## 🔎 Exploring some more

In [None]:
df_cleaned.columns

In [None]:
# df_cleaned.shape # (1029674, 12)

In [None]:
# df_cleaned[mask_price].shape # (1027158, 12)

In [None]:
import matplotlib.pyplot as plt

# Outlier filter for this section: price strictly under 10 million and living
# area strictly under 500 square metres.
mask_price = df_cleaned['price']<10000000
mask_area = df_cleaned['living_area']<500
df_without_outliers = df_cleaned[mask_price & mask_area]

# Plot living area vs sale price, coloured by property type.
# Variables are passed by keyword: seaborn >= 0.12 no longer accepts x / y
# positionally. Drawing on an explicit Axes keeps the figure size attached to
# this plot.
fig, ax = plt.subplots(figsize=(12, 5))
sns.scatterplot(data=df_without_outliers,
                x='living_area',
                y='price',
                hue='property_type',
                alpha=0.5,
                ax=ax)
plt.show()

In [None]:
df_without_outliers['property_type'].value_counts()

In [None]:
# Correlation is computed on the numeric columns only: the frame also holds
# text columns such as `property_type`, and pandas >= 2.0 raises when .corr()
# meets values it cannot convert to float.
numeric_columns = df_without_outliers.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), cmap = 'vlag', annot = True)

In [None]:
df_without_outliers.info()

In [None]:
##### we have to scale data first ######

## 🐟 Naive baseline model

In [None]:
# Imported here because the notebook's sklearn import cell only appears further
# down; without this line the cell fails on a fresh kernel.
from sklearn.model_selection import train_test_split

# Features = every column except the target; target = sale price.
X = df_without_outliers.drop(columns=['price'])
y = df_without_outliers['price']

# Fixed seed so the baseline numbers are reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Constant predictors for the naive baseline, computed from the training target.
mean_price = y_train.mean()
median_price = y_train.median()

print(f"The mean price of {round(mean_price)}")
print(f"The median price of {round(median_price)}")

In [None]:
# Baseline: predict `mean_price` for every test row and measure the RMSE.
mean_error = y_test - mean_price
mean_squared_error_of_mean = np.square(mean_error).mean()
rmse_mean = mean_squared_error_of_mean ** 0.5
print(f'The rmse when always predicting the mean : {round(rmse_mean)} €.')

In [None]:
# Baseline: predict `median_price` for every test row and measure the RMSE.
median_error = y_test - median_price
mean_squared_error_of_median = np.square(median_error).mean()
rmse_median = mean_squared_error_of_median ** 0.5
print(f'The rmse when always predicting the median : {round(rmse_median)} €.')

## ⚖️ Importing the preprocessed files Andrea made : 

In [None]:
# Load the four preprocessed splits from disk, in the order they are unpacked.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted
# source.
X_train_preproc, X_test_preproc, y_train, y_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../raw_data/X_train_preproc.pickle',
        '../raw_data/X_test_preproc.pickle',
        '../raw_data/y_train.pickle',
        '../raw_data/y_test.pickle',
    )
)

In [None]:
print(X_train_preproc.shape, X_test_preproc.shape)
print(y_train.shape, y_test.shape)

## 🐸 Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


In [None]:
from sklearn.ensemble import RandomForestRegressor
model_rfr = RandomForestRegressor()

# Fit on the training split, then evaluate the predictions on the test split.
model_rfr.fit(X_train_preproc, y_train)
y_pred = model_rfr.predict(X_test_preproc)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(1/2)
print(f'Root Mean Squared Error: {rmse}')
# The "Train score" line was previously computed on the test split; each score
# is now reported on the split its label names.
print( 'Train score:', model_rfr.score(X_train_preproc, y_train))
print( 'Test score:', model_rfr.score(X_test_preproc, y_test))
print('Cross val score mean', cross_val_score(model_rfr, X_train_preproc, y_train, cv = 5).mean())

# playing around

In [None]:
# pip install category_encoders


In [None]:
import sys

# NOTE(review): machine-specific absolute path — this cell only works on the
# machine where this directory exists; consider deriving it from the notebook
# location instead.
project_root = '/home/anouchka/code/qadnguyen/realdata'
# Re-running the cell must not keep appending the same entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
from prop_value.ml_logic.preprocessor import clean_data, preprocess_data

In [None]:
# url = 'https://dvf-api.data.gouv.fr/dvf/csv/?dep=93'
# df93 = pd.read_csv(url)

In [None]:
# df93.to_csv('../raw_data/dvf_93.csv')

In [None]:
df93 = pd.read_csv('../raw_data/dvf_93.csv')

In [None]:
df93.head()

In [None]:
df93_cleaned = clean_data(df93)

In [None]:
df93_cleaned.built

In [None]:
X_train_preproc, X_test_preproc, y_train, y_test = preprocess_data(df93_cleaned)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


In [None]:
from sklearn.ensemble import RandomForestRegressor
model_rfr = RandomForestRegressor()

# Fit on the training split, then evaluate the predictions on the test split.
model_rfr.fit(X_train_preproc, y_train)
y_pred = model_rfr.predict(X_test_preproc)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(1/2)
print(f'Root Mean Squared Error: {rmse}')
# The "Train score" line was previously computed on the test split; each score
# is now reported on the split its label names.
print( 'Train score:', model_rfr.score(X_train_preproc, y_train))
print( 'Test score:', model_rfr.score(X_test_preproc, y_test))
print('Cross val score mean', cross_val_score(model_rfr, X_train_preproc, y_train, cv = 5).mean())

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
model_xgb = XGBRegressor()

# Fit on the training split, then evaluate the predictions on the test split.
model_xgb.fit(X_train_preproc, y_train)
y_pred = model_xgb.predict(X_test_preproc)
# scikit-learn's convention is (y_true, y_pred).
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(1/2)
print(f'Mean Squared Error: {mse}')
# `rmse` was computed but never shown; report it as the random-forest cells do.
print(f'Root Mean Squared Error: {rmse}')
print( 'Train score:', model_xgb.score(X_train_preproc, y_train))
print( 'Test score:', model_xgb.score(X_test_preproc, y_test))
print('Cross val score mean', cross_val_score(model_xgb, X_train_preproc, y_train, cv = 5).mean())