In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

## CREATE THE DATASET

* read from csv
* visualize

In [3]:
dataset = pd.read_csv("data/test_dataset.csv", index_col="id")
dataset

Unnamed: 0_level_0,prezzo,locali,superficie,bagni
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64524238,445000.0,5.0,450.0,3.0
75218140,105000.0,4.0,100.0,2.0
76068666,99000.0,5.0,150.0,2.0
74789894,270000.0,5.0,161.0,3.0
74757644,185000.0,5.0,550.0,2.0
...,...,...,...,...
73341564,100000.0,4.0,160.0,1.0
75088158,125000.0,3.0,90.0,2.0
75565044,70000.0,3.0,90.0,1.0
75311070,140000.0,,120.0,


In [4]:
#remove rows with null values
dataset = dataset.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
dataset

Unnamed: 0_level_0,prezzo,locali,superficie,bagni
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64524238,445000.0,5.0,450.0,3.0
75218140,105000.0,4.0,100.0,2.0
76068666,99000.0,5.0,150.0,2.0
74789894,270000.0,5.0,161.0,3.0
74757644,185000.0,5.0,550.0,2.0
...,...,...,...,...
76297024,130000.0,3.0,100.0,1.0
75346604,320000.0,3.0,90.0,1.0
73341564,100000.0,4.0,160.0,1.0
75088158,125000.0,3.0,90.0,2.0


In [5]:
#riordina il dataset mettendo il prezzo come ultima colonna (il prezzo sarà il target value)
cols = dataset.columns.tolist()
cols = cols[1:] + cols[:1]
dataset = dataset[cols]
dataset

Unnamed: 0_level_0,locali,superficie,bagni,prezzo
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64524238,5.0,450.0,3.0,445000.0
75218140,4.0,100.0,2.0,105000.0
76068666,5.0,150.0,2.0,99000.0
74789894,5.0,161.0,3.0,270000.0
74757644,5.0,550.0,2.0,185000.0
...,...,...,...,...
76297024,3.0,100.0,1.0,130000.0
75346604,3.0,90.0,1.0,320000.0
73341564,4.0,160.0,1.0,100000.0
75088158,3.0,90.0,2.0,125000.0


## PREPARE TRAIN AND TEST SETS

* split the dataset in 75% train and 25% test sets

In [6]:
#generate random samples from a uniform distribution of of numbers from 0 to 1
#The result will be true if the number is < 0.75 and False if > 0.75
#Being a uniform distribution we will have 75% True and 25% False
dataset['is_train'] = np.random.uniform(0, 1, len(dataset)) <= .75
dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0_level_0,locali,superficie,bagni,prezzo,is_train
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
64524238,5.0,450.0,3.0,445000.0,True
75218140,4.0,100.0,2.0,105000.0,True
76068666,5.0,150.0,2.0,99000.0,True
74789894,5.0,161.0,3.0,270000.0,False
74757644,5.0,550.0,2.0,185000.0,True
...,...,...,...,...,...
76297024,3.0,100.0,1.0,130000.0,False
75346604,3.0,90.0,1.0,320000.0,False
73341564,4.0,160.0,1.0,100000.0,True
75088158,3.0,90.0,2.0,125000.0,True


In [7]:
#split inside 2 variables for train e validation
train, test = dataset[dataset['is_train'] == True], dataset[dataset['is_train'] == False]
print(f"Number of observations in the training data: {len(train)}")
print(f"Number of observations in the test data: {len(test)}")

#select features
features = dataset.columns[:3]
print(features)

Number of observations in the training data: 430
Number of observations in the test data: 128
Index(['locali', 'superficie', 'bagni'], dtype='object')


## TRAINING

##### Random forest regressor
Init a random forest regressor instance with some parameters:
* n_estimators : number of trees in the random forest
* random_state : the random starting seed

##### Fit

* locali, superficie, bagni are the input values (X)
* prezzo is the target value (y)

In [8]:
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
X = train[features]
y = train["prezzo"]
regressor.fit(X, y)
score = {regressor.score(X, y) * 100}
print(f"This model has a training accuracy of {score}%")

This model has a training accuracy of {79.08025086100436}%


In [9]:
preds = regressor.predict(X)
#compute rmse for trained data
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())
print(rmse(preds, train["prezzo"]))

44064.44563726714


In [10]:
print(test[features].index)

Int64Index([74789894, 75218332, 75218048, 74875000, 67257439, 76030930,
            67672285, 69553290, 73009922, 75122248,
            ...
            75218106, 75218354, 75218054, 72600378, 70606778, 71376052,
            75756604, 76297024, 75346604, 75565044],
           dtype='int64', name='id', length=128)


## EVALUATION

Validate the model predicting on the testing portion of the dataset

In [11]:
preds = regressor.predict(test[features])
result = pd.DataFrame(index=test[features].index, data=preds)
result

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
74789894,298412.500000
75218332,126811.206349
75218048,109911.047980
74875000,161877.722222
67257439,76589.686147
...,...
71376052,132020.297619
75756604,124388.666667
76297024,110462.107143
75346604,93819.963958


## COMPUTE ACCURACY WITH RMSE
RMSE : root mean square error

In [12]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())
print(rmse(preds, test["prezzo"]))

100139.84361414035
