In [1787]:
import torch
import torch.nn as nn
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import pandas_datareader as pdr
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler

In [1788]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [1789]:
initial_labels = train.columns

## Lets take care of the categorical variables

In [1790]:
Categorical = ['MasVnrType','MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrArea','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','PavedDrive','PoolQC','Fence','MiscFeature','SaleType','SaleCondition']


In [1791]:
for cat in Categorical:
    train[cat] = train[cat].map(str)
    test[cat] = test[cat].map(str)

In [1792]:
train = train.set_index('Id')
test = test.set_index('Id')
print("index reset")

index reset


In [1793]:
ohe = OneHotEncoder(handle_unknown = 'ignore')
train_ohe = ohe.fit_transform(train[Categorical]).toarray()
test_ohe = ohe.transform(test[Categorical]).toarray()

In [1794]:
feature_labels = ohe.categories_

In [1795]:
labels = []
for names in feature_labels:
    labels += names.tolist()


In [1796]:
train_cat_index = train.index
test_cat_index = test.index

In [1797]:
train_cat = pd.DataFrame(train_ohe,columns=labels)
test_cat = pd.DataFrame(test_ohe,columns=labels)

In [1798]:
train_cat['Id'] = train_cat_index
train_cat = train_cat.set_index(['Id'])
test_cat['Id'] = test_cat_index
test_cat = test_cat.set_index(['Id'])

In [1799]:
#recombine with numerical data now
numerical = []

for name in initial_labels:
    if name not in Categorical and name != 'SalePrice' and name !='Id':
        numerical.append(name)

train_num = train[numerical]
test_num = test[numerical]

In [1800]:
#grab targets
train_target = train['SalePrice']
traing_target = train_target.fillna(train_target.mean())

In [1801]:
#concatenate the numerical and categorical features
train_features= train_num.join(train_cat)
test_features= test_num.join(test_cat)

In [1802]:
train_features = train_features.apply(lambda x: x.fillna(x.mean()),axis=0)
test_features = test_features.apply(lambda x: x.fillna(x.mean()),axis=0)

## Let's scale our values to achieve numerical stability

In [1803]:
mms = MinMaxScaler()

train_features = mms.fit_transform(train_features)
test_features = mms.transform(test_features)

In [1804]:
train_features.shape,test_features.shape

((1460, 645), (1459, 645))

In [1805]:
mms_targets = MinMaxScaler()
train_targets = mms_targets.fit_transform(pd.DataFrame(train_target))

## I want a validation set too...

In [1806]:
x_train,x_val,y_train,y_val = train_test_split(train_features,train_targets,test_size = .1)

# Time to build a model

In [1807]:
from sklearn.ensemble import RandomForestRegressor

In [1808]:
rfr = RandomForestRegressor()

In [1813]:
rfr.fit(x_train,y_train.reshape(-1))

RandomForestRegressor()

In [1823]:
score = rfr.score(x_val,y_val)
print(f"The R Squared value is {np.round(score,4)}")

The R Squared value is 0.8869


In [1827]:
rfr.predict(test_features)

array([0.12973754, 0.17301055, 0.20077875, ..., 0.16163311, 0.1097358 ,
       0.27291485])

In [1828]:
y_hat_test = mms_targets.inverse_transform(rfr.predict(test_features).reshape(-1,1))

In [1829]:
output = pd.DataFrame(y_hat_test,columns=['SalePrice'])
output['Id'] = test.index
output.set_index('Id',inplace=True)

In [1832]:
output.to_csv('Estimate.csv')