In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#Reading the data
train_data = pd.read_csv('train.csv', index_col = 'Id')
test_data = pd.read_csv('test.csv', index_col = 'Id')

In [3]:
#Extracting the target and predictors
y = train_data.SalePrice
features = ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath','BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'PoolArea']
X = train_data[features].copy()
X_test = test_data[features].copy()

In [4]:
#Splitting the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 0)

In [5]:
X_train.head()

Unnamed: 0_level_0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,PoolArea
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,11694,9,5,2007,2007,48,0,1774,1822,...,0,0,2,0,3,9,1,3,774,0
871,20,6600,5,5,1962,1962,0,0,894,894,...,0,0,1,0,2,5,0,1,308,0
93,30,13360,5,7,1921,2006,713,0,163,876,...,1,0,1,0,2,5,0,2,432,0
818,20,13265,8,5,2002,2002,1218,0,350,1568,...,1,0,2,0,3,7,2,3,857,0
303,20,13704,7,5,2001,2002,0,0,1541,1541,...,0,0,2,0,3,6,1,3,843,0


In [52]:
#Testing different types of models
from sklearn.ensemble import RandomForestRegressor

model_1 = RandomForestRegressor(n_estimators = 50, random_state = 0)

model_2 = RandomForestRegressor(n_estimators = 100, random_state = 0)

model_3 = RandomForestRegressor(n_estimators = 100, criterion = 'absolute_error', random_state = 0)

model_4 = RandomForestRegressor(n_estimators = 200, min_samples_split =20, random_state = 0)

model_5 = RandomForestRegressor(n_estimators = 100, max_depth = 7, random_state = 0)

models = [model_1, model_2, model_3, model_4, model_5]

In [53]:
#Analyzing the score of every created model
from sklearn.metrics import mean_absolute_error

#Creating a function for comparing different models
def score_model(model, X_t = X_train, X_v = X_valid, y_t = y_train, y_v = y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i+1, mae))

Model 1 MAE: 17887
Model 2 MAE: 17856
Model 3 MAE: 17881
Model 4 MAE: 18292
Model 5 MAE: 18363


In [8]:
#Choosing the best result
model_5.fit(X, y)

preds_test = model_5.predict(X_test)

output = pd.DataFrame({'Id':X_test.index, 'SalePrice': preds_test})

output.to_csv('Predictions.csv', index = False)

output

Unnamed: 0,Id,SalePrice
0,1461,126061.089555
1,1462,163472.826992
2,1463,150132.254325
3,1464,175546.708825
4,1465,206308.536381
...,...,...
1454,2915,86193.189622
1455,2916,84897.457008
1456,2917,157160.396339
1457,2918,129857.478014


In [55]:
#Testing different types of models
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

model = RandomForestRegressor(n_estimators = 100, random_state = 0)
model.fit(X_train, y_train)
preds = model.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)
print(f"Model MAE : {mae}")




Model MAE : 17856.307669275928
