In [14]:
# Import the tools of the trade used throughout the notebook
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer as si

In [3]:
# Load the training and test sets (in that order) from the working directory.
X_full, X_test_full = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [4]:
# The sale price is the response variable we want to predict.
y = X_full['SalePrice']

# Columns selected as model features.
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# Independent copies restricted to the selected features: one for
# training/validation, one for the test set.
X = X_full.loc[:, features].copy()
X_test = X_test_full.loc[:, features].copy()

In [5]:
# Split into 80% training / 20% validation rows; the fixed random_state keeps
# the split identical between runs.
X_train, X_valid, y_train, y_valid = tts(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Candidate random forest models to compare.
# Every model uses random_state=0 so results are reproducible between runs.
model_1 = rfr(n_estimators=50, random_state=0)
model_2 = rfr(n_estimators=100, random_state=0)
# 'absolute_error' is the current name of the criterion formerly spelled 'mae';
# the old spelling was deprecated in scikit-learn 1.0 and removed in 1.2.
# NOTE(review): requires scikit-learn >= 1.0.
model_3 = rfr(n_estimators=100, criterion='absolute_error', random_state=0)
model_4 = rfr(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = rfr(n_estimators=100, max_depth=7, random_state=0)

# Order matters: the scoring loop reports models by their position in this list.
models = [model_1, model_2, model_3, model_4, model_5]

In [32]:
# To pick the best of the five candidates we score each one with
# score_model(), which returns the mean absolute error (MAE) on validation data.
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    """Fit ``model`` and return its mean absolute error on validation data.

    The default arguments are bound to the train/validation split that exists
    when this cell is executed; re-run the cell after re-splitting the data.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place by this call.
    X_t, y_t : training features and targets.
    X_v, y_v : validation features and targets.

    Returns
    -------
    The value returned by ``mean_absolute_error(y_v, predictions)``.
    """
    model.fit(X_t, y_t)
    return mean_absolute_error(y_v, model.predict(X_v))

# Score every candidate; models are numbered from 1 in list order.
# Note that "%d" truncates the MAE to an integer for display.
for model_number, candidate in enumerate(models, start=1):
    mae = score_model(candidate)
    print("Model %d MAE: %d" % (model_number, mae))

Model 1 MAE: 24015
Model 2 MAE: 23740
Model 3 MAE: 23528
Model 4 MAE: 23996
Model 5 MAE: 23706


In [22]:
# Predict on the test features with model_3, which has the lowest MAE in the
# comparison printed above. This relies on model_3 having already been fitted
# by the score_model() loop. `predictions` is not referenced again in the
# visible cells.
predictions=model_3.predict(X_test)

In [26]:
# Helper for comparing different data-preparation approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation MAE of a small, fixed random forest.

    A fresh 10-tree forest (random_state=0) is fitted on the training data on
    every call, so differences in the returned score come from the data passed
    in rather than from the model configuration.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_valid, y_valid : validation features and targets.

    Returns
    -------
    The value returned by ``mean_absolute_error(y_valid, predictions)``.
    """
    forest = rfr(n_estimators=10, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [12]:
# Names of the training columns that contain at least one missing value.
cols_missing = X_train.columns[X_train.isnull().any()].tolist()

# Drop those columns from both the training and the validation data, so the
# two frames keep the same set of columns.
reduced_X_train = X_train.drop(columns=cols_missing)
reduced_X_valid = X_valid.drop(columns=cols_missing)

In [29]:
# Imputation as a way of dealing with missing values
my_imputer = si()

# The imputer is fitted on the training data only and then reused on the
# validation data. Its output carries no column names, so they are re-attached
# while building the DataFrames.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train), columns=X_train.columns)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid), columns=X_valid.columns)

# Check the effect on the MAE:
print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE from Approach 2 (Imputation):
25474.17279843444


In [33]:
# Reuse the imputer that was fitted on the training data. Calling
# fit_transform() here would refit it on the test set, so the fill values
# would no longer match the ones learned from the training data.
# Column names and index are restored so the frame lines up with the feature
# names the model was fitted with.
final_X_test = pd.DataFrame(
    my_imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Get test predictions from model_3 (lowest validation MAE among the candidates)
preds_test = model_3.predict(final_X_test)

In [36]:
# The submission must carry the house Ids from the test file. Because the CSV
# is read without index_col, X_test.index is only the positional 0..n-1 index,
# so prefer the real 'Id' column when the test frame has one.
# NOTE(review): assumes test.csv provides an 'Id' column - confirm; otherwise
# this falls back to the previous behaviour (the frame's index).
if 'Id' in X_test_full.columns:
    test_ids = X_test_full['Id'].to_numpy()
else:
    test_ids = X_test.index

output = pd.DataFrame({'Id': test_ids,
                       'SalePrice': preds_test})
# index=False keeps the DataFrame's own index out of the CSV.
output.to_csv('submission.csv', index=False)