In [63]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns

In [64]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [65]:
numerical_features = list(train._get_numeric_data().drop(['Id', 'PID', 'SalePrice'], axis=1))
features = numerical_features

In [66]:
for feature in numerical_features:
    train[feature] = train[feature].fillna(-999)
    test[feature] = test[feature].fillna(-999)

In [67]:
categorical_features = list(train.select_dtypes(include='object').columns)

In [68]:
for col in categorical_features:
    train[col] = train[col].fillna('N/A')
    test[col] = test[col].fillna('N/A')
    
    train_values = sorted(list(train[col].unique()))
    test_values = sorted(list(test[col].unique()))
    
    categories = set(train_values + test_values)
    
    train[col] = pd.Categorical(train[col], categories=categories)
    test[col] = pd.Categorical(test[col], categories=categories)

In [69]:
train_dummies = pd.get_dummies(train[categorical_features], drop_first=True)
test_dummies = pd.get_dummies(test[categorical_features], drop_first=True)

In [70]:
X_train = pd.concat([train_dummies, train[numerical_features]], axis=1)
X_test = pd.concat([test_dummies, test[numerical_features]], axis=1)

In [81]:
lr = LinearRegression()
X = X_train
y = train['SalePrice']
lr_fitted = lr.fit(X, y)
lr_predict = lr.predict(X)

print(lr.score(X, y))
lr_rmse = mean_squared_error(y, lr_predict) ** .5
print(lr_rmse)

0.943679606800921
18805.014016357436


In [77]:
rf = RandomForestRegressor()
rf_fitted = rf.fit(X, y)

In [78]:
test['price'] = rf.predict(X_test)

ids = test['Id']
price = test['price']
df = pd.DataFrame({'Id': ids, 'SalePrice':price}) 
df.sort_values('Id', inplace=True)
df.to_csv('./test_submission.csv', index=False)

In [86]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MS Zoning_C (all),2051.0,0.009264,0.095825,0.0,0.0,0.0,0.0,1.0
MS Zoning_FV,2051.0,0.049244,0.216430,0.0,0.0,0.0,0.0,1.0
MS Zoning_RL,2051.0,0.779132,0.414933,0.0,1.0,1.0,1.0,1.0
MS Zoning_RM,2051.0,0.154071,0.361105,0.0,0.0,0.0,0.0,1.0
MS Zoning_A (agr),2051.0,0.000975,0.031220,0.0,0.0,0.0,0.0,1.0
MS Zoning_RH,2051.0,0.006826,0.082357,0.0,0.0,0.0,0.0,1.0
Street_Grvl,2051.0,0.003413,0.058335,0.0,0.0,0.0,0.0,1.0
Alley_Pave,2051.0,0.026816,0.161585,0.0,0.0,0.0,0.0,1.0
Alley_Grvl,2051.0,0.041443,0.199362,0.0,0.0,0.0,0.0,1.0
Lot Shape_IR2,2051.0,0.026816,0.161585,0.0,0.0,0.0,0.0,1.0
