In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental  import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


ohe = OneHotEncoder(handle_unknown='ignore')
categorical_processing = Pipeline(steps=[('ohe', ohe)])
preprocessing = ColumnTransformer(transformers=[('categorical', categorical_processing, ['conditions'])],
                                  remainder='passthrough')

In [3]:
data = pd.read_csv(r'C:\Users\luisp\Downloads\ml-challenge-2023-30412\training.csv')
test = pd.read_csv(r'C:\Users\luisp\Downloads\ml-challenge-2023-30412\test.csv')

In [4]:
data.head()

Unnamed: 0,id,price,balcony,conditions,construction_year,latitude,longitude,energy_efficiency,expenses,floor,garden,n_bathrooms,elevator,total_floors,n_rooms,proximity_to_center,surface
0,62191,2000000,,Buono stato / Abitabile,,45.437008,12.317841,,,,,,,,,1.0,
1,50213,97500,,,,45.4998,9.22192,,,,,,,,2.0,1.0,47.0
2,56997,525000,,,,41.962719,12.60297,,,,,,,,5.0,1.0,292.0
3,45581,25323,,,,45.4733,12.2255,,,,,,,,4.0,1.0,70.0
4,59454,78000,,,,45.4366,9.17507,,,,,,,,2.0,1.0,50.0


In [5]:
test.head()

Unnamed: 0,id,balcony,conditions,construction_year,latitude,longitude,energy_efficiency,expenses,floor,garden,n_bathrooms,elevator,total_floors,n_rooms,proximity_to_center,surface
0,18825,True,Nuovo / Nuova costruzione,2010.0,41.9175,12.6554,175.0,1.0,3.0,True,1.0,1.0,5.0,2.0,1.0,72.0
1,30956,,Buono stato / Abitabile,1900.0,45.4461,12.3267,175.0,0.0,0.0,True,1.0,,3.0,2.0,1.0,60.0
2,47193,,Buono stato / Abitabile,,45.456925,9.194406,,,-1.0,,3.0,0.0,,3.0,1.0,140.0
3,27799,True,Buono stato / Abitabile,1962.0,45.4838,12.259,198.87,,6.0,,,,6.0,3.0,1.0,98.0
4,22259,True,Buono stato / Abitabile,2006.0,41.8994,12.6595,175.0,65.0,4.0,,2.0,1.0,4.0,3.0,1.0,83.0


In [6]:
data.isnull().sum()

id                         0
price                      0
balcony                11814
conditions               836
construction_year      10421
latitude                   1
longitude                  1
energy_efficiency      16169
expenses               11774
floor                   1619
garden                 24265
n_bathrooms             1749
elevator               13516
total_floors           15371
n_rooms                  251
proximity_to_center        1
surface                   82
dtype: int64

In [7]:
data.columns

Index(['id', 'price', 'balcony', 'conditions', 'construction_year', 'latitude',
       'longitude', 'energy_efficiency', 'expenses', 'floor', 'garden',
       'n_bathrooms', 'elevator', 'total_floors', 'n_rooms',
       'proximity_to_center', 'surface'],
      dtype='object')

In [8]:
y = data["price"]

X = data.drop(["price"], axis=1)

In [9]:
y.unique()

array([2000000,   97500,  525000, ...,  440700, 1058000,  334000],
      dtype=int64)

In [10]:
print(f"X: {X.shape}")

X: (36295, 16)


In [11]:
X = X.drop(['balcony', 'energy_efficiency', 'expenses', 'floor', 'garden',
       'n_bathrooms', 'elevator', 'total_floors', 'n_rooms','construction_year', 'id'
       ], axis=1)

In [12]:
X.isnull().sum()

conditions             836
latitude                 1
longitude                1
proximity_to_center      1
surface                 82
dtype: int64

In [13]:
X = X.drop(X[X["latitude"].isna()].index)
X.isna().sum()

conditions             836
latitude                 0
longitude                0
proximity_to_center      0
surface                 82
dtype: int64

In [14]:
X.columns

Index(['conditions', 'latitude', 'longitude', 'proximity_to_center',
       'surface'],
      dtype='object')

In [15]:
X = preprocessing.fit_transform(X)

In [16]:
test = test.drop(['balcony', 'energy_efficiency', 'expenses', 'floor', 'garden',
       'n_bathrooms', 'elevator', 'total_floors', 'n_rooms','construction_year', 'id'
       ], axis=1)

In [17]:
test.isna().sum()

conditions             513
latitude                10
longitude               10
proximity_to_center     10
surface                164
dtype: int64

In [18]:
test.head()

Unnamed: 0,conditions,latitude,longitude,proximity_to_center,surface
0,Nuovo / Nuova costruzione,41.9175,12.6554,1.0,72.0
1,Buono stato / Abitabile,45.4461,12.3267,1.0,60.0
2,Buono stato / Abitabile,45.456925,9.194406,1.0,140.0
3,Buono stato / Abitabile,45.4838,12.259,1.0,98.0
4,Buono stato / Abitabile,41.8994,12.6595,1.0,83.0


In [19]:
test = preprocessing.transform(test)

In [20]:
print(f'Number of features in training set: {X.shape[1]}')
print(f'Number of features in testing set: {test.shape[1]}')

Number of features in training set: 9
Number of features in testing set: 9


In [23]:
X

array([[  1.        ,   0.        ,   0.        , ...,  12.31784058,
          1.        ,          nan],
       [  0.        ,   0.        ,   0.        , ...,   9.22192   ,
          1.        ,  47.        ],
       [  0.        ,   0.        ,   0.        , ...,  12.60297012,
          1.        , 292.        ],
       ...,
       [  0.        ,   0.        ,   1.        , ...,   9.2221    ,
          1.        ,  60.        ],
       [  0.        ,   1.        ,   0.        , ...,  12.4921    ,
          1.        , 196.        ],
       [  0.        ,   0.        ,   0.        , ...,   9.2213    ,
          1.        ,  84.        ]])

In [21]:
# Naive random forest
rf = RandomForestClassifier()

In [22]:
rf.fit(X, y)

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values