In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import seaborn as sns

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
numerical_features = list(train._get_numeric_data().drop(['Id', 'PID', 'SalePrice'], axis=1))
features = numerical_features

In [4]:
for feature in numerical_features:
    train[feature] = train[feature].fillna(-999)
    test[feature] = test[feature].fillna(-999)

In [5]:
categorical_features = list(train.select_dtypes(include='object').columns)

In [6]:
for col in categorical_features:
    train[col] = train[col].fillna('N/A')
    test[col] = test[col].fillna('N/A')
    
    train_values = sorted(list(train[col].unique()))
    test_values = sorted(list(test[col].unique()))
    
    categories = set(train_values + test_values)
    
    train[col] = pd.Categorical(train[col], categories=categories)
    test[col] = pd.Categorical(test[col], categories=categories)

In [7]:
train_dummies = pd.get_dummies(train[categorical_features], drop_first=True)
test_dummies = pd.get_dummies(test[categorical_features], drop_first=True)

In [8]:
X_train = pd.concat([train_dummies, train[numerical_features]], axis=1)
X_test = pd.concat([test_dummies, test[numerical_features]], axis=1)

In [9]:
lr = LinearRegression()
lr_fitted = lr.fit(X_train, train['SalePrice'])
lr_predict = lr.predict(X_train)

print(lr.score(X_train, train['SalePrice']))
print(r2_score(train['SalePrice'], lr_predict))
lr_rmse = mean_squared_error(train['SalePrice'], lr_predict)
print(lr_rmse ** 0.5)


0.943679606800921
0.943679606800921
18805.014016357447


In [10]:
# Make Polynomial Features
pf = PolynomialFeatures(include_bias=False)
X_poly = pf.fit_transform(X_train)

In [11]:
## Put these in a DataFrame 
features = pf.get_feature_names(X_train.columns)

poly_df = pd.DataFrame(X_poly, columns=features)

poly_df.head()

Unnamed: 0,MS Zoning_RH,MS Zoning_FV,MS Zoning_A (agr),MS Zoning_RL,MS Zoning_I (all),MS Zoning_C (all),Street_Grvl,Alley_N/A,Alley_Grvl,Lot Shape_Reg,...,Pool Area^2,Pool Area Misc Val,Pool Area Mo Sold,Pool Area Yr Sold,Misc Val^2,Misc Val Mo Sold,Misc Val Yr Sold,Mo Sold^2,Mo Sold Yr Sold,Yr Sold^2
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,6030.0,4040100.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,8036.0,4036081.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2010.0,4040100.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,8040.0,4040100.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,6030.0,4040100.0


In [12]:
df_good_poly_features = X_train.loc[:, poly_df.corrwith(train['SalePrice']).abs() > .4]
X_test = X_test.loc[:, poly_df.corrwith(train['SalePrice']).abs() > .4]
df_good_poly_features.head()

Unnamed: 0,Neighborhood_NridgHt,Mas Vnr Type_None,Exter Qual_Gd,Exter Qual_Ex,Exter Qual_TA,Bsmt Qual_Ex,Bsmt Qual_TA,Heating QC_Ex,Kitchen Qual_Ex,Kitchen Qual_TA,...,Year Built,Year Remod/Add,BsmtFin SF 1,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,TotRms AbvGrd,Fireplaces,Garage Area
0,0,0,1,0,0,0,1,1,0,0,...,1976,2005,533.0,725.0,725,1479,2,6,0,475.0
1,0,0,1,0,0,0,0,1,0,0,...,1996,1997,637.0,913.0,913,2122,2,8,1,559.0
2,0,1,0,0,1,0,1,0,0,0,...,1953,2007,731.0,1057.0,1057,1057,1,5,0,246.0
3,0,1,0,0,1,0,0,0,0,1,...,2006,2007,0.0,384.0,744,1444,2,7,0,400.0
4,0,1,0,0,1,0,0,0,0,1,...,1900,1993,0.0,676.0,831,1445,2,6,0,484.0


In [13]:
lr = LinearRegression()
lr_fitted = lr.fit(df_good_poly_features, train['SalePrice'])
lr_predict = lr.predict(df_good_poly_features)

print(lr.score(df_good_poly_features, train['SalePrice']))
print(r2_score(train['SalePrice'], lr_predict))
lr_rmse = mean_squared_error(train['SalePrice'], lr_predict)
print(lr_rmse ** 0.5)

0.8449165701700071
0.844916570170007
31204.952155171555


In [14]:
rf = RandomForestRegressor()
rf_fitted = rf.fit(df_good_poly_features, train['SalePrice'])
rf_predict = rf.predict(df_good_poly_features)
print(rf.score(df_good_poly_features, train['SalePrice']))
print(r2_score(train['SalePrice'], rf_predict))
rf_rmse = mean_squared_error(train['SalePrice'], rf_predict)
print(lr_rmse ** 0.5)

0.9789958803675504
0.9789958803675504
31204.952155171555


In [125]:
test['price'] = rf.predict(X_test)

In [126]:
ids = test['Id']
price = test['price']
df = pd.DataFrame({'Id': ids, 'SalePrice':price}) 
df.sort_values('Id', inplace=True)
df.to_csv('./test_submission.csv', index=False)

In [127]:
X_train.head()

Unnamed: 0,MS Zoning_RM,MS Zoning_I (all),MS Zoning_A (agr),MS Zoning_RL,MS Zoning_RH,MS Zoning_FV,Street_Grvl,Alley_N/A,Alley_Grvl,Lot Shape_IR3,...,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold
0,0,0,0,1,0,0,0,1,0,0,...,475.0,0,44,0,0,0,0,0,3,2010
1,0,0,0,1,0,0,0,1,0,0,...,559.0,0,74,0,0,0,0,0,4,2009
2,0,0,0,1,0,0,0,1,0,0,...,246.0,0,52,0,0,0,0,0,1,2010
3,0,0,0,1,0,0,0,1,0,0,...,400.0,100,0,0,0,0,0,0,4,2010
4,0,0,0,1,0,0,0,1,0,0,...,484.0,0,59,0,0,0,0,0,3,2010
