In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import statsmodels.api as sm
import sklearn.metrics as metrics

In [2]:
# Read the training and test splits from the local datasets/ folder.
train_path = 'datasets/train.csv'
test_path = 'datasets/test.csv'
s_train = pd.read_csv(train_path)
s_test = pd.read_csv(test_path)

In [3]:
def clean_header(df):
    '''Normalise the column labels of ``df`` in place.

    Each label is stripped of surrounding whitespace, lower-cased, has its
    spaces replaced by underscores and has parentheses removed, e.g.
    ``' Misc (Val) '`` becomes ``'misc_val'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is rewritten; the data is untouched.

    Returns
    -------
    None
        The frame is modified in place.
    '''
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        # regex=False: '(' and ')' are regex metacharacters, so match every
        # pattern literally instead of relying on the pandas default for `regex`.
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )

In [4]:
# Preview the first rows of the training frame as loaded (original column names).
s_train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [5]:
# Normalise the column names of both frames in place, then preview the training frame.
for frame in (s_train, s_test):
    clean_header(frame)
s_train.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [6]:
# Encode the shared Ex/Gd/TA/Fa/Po quality scale as numbers for easier analysis.
quality_cols = ['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond',
                'kitchen_qual', 'garage_qual', 'garage_cond']
# The Ames housing data dictionary (which these columns appear to follow) codes
# typical/average as 'TA'. The earlier 'Ta' key would not match that spelling and
# would leave those cells as strings; 'Ta' is kept so that spelling still maps to 3.
quality_codes = {'Ex': 1, 'Gd': 2, 'TA': 3, 'Ta': 3, 'Fa': 4, 'Po': 5, 'NA': 0}
# NOTE(review): a lower code means better quality, yet missing/NA is coded 0
# (below 'Ex') — confirm this ordering before using these as ordinal features.
for frame in (s_train, s_test):
    # Cells that are missing after the mapping are filled with 0.
    frame[quality_cols] = frame[quality_cols].replace(quality_codes).fillna(0)

In [7]:
# Lower-case the lot_config labels so they match the lower-case keys used when the
# column is encoded numerically. .str.lower() passes missing values through,
# whereas calling x.lower() on a NaN cell raises AttributeError.
s_train['lot_config'] = s_train['lot_config'].str.lower()
s_train['lot_config']

0       culdsac
1       culdsac
2        inside
3        inside
4        inside
         ...   
2046     inside
2047     inside
2048     inside
2049     corner
2050     inside
Name: lot_config, Length: 2051, dtype: object

In [8]:
# Lower-case the test-set lot_config labels; .str.lower() leaves missing values
# as NaN instead of raising the way x.lower() does on a NaN cell.
s_test['lot_config'] = s_test['lot_config'].str.lower()

In [9]:
# Numeric code for each lot configuration label; cells still missing become 0.
lot_config_codes = {'inside':1, 'corner':2, 'culdsac':3, 'fr2':4, 'fr3':5}
for frame in (s_train, s_test):
    frame[['lot_config']] = frame[['lot_config']].replace(lot_config_codes).fillna(0)

In [10]:
# List the distinct lot_config values left in the training frame after encoding.
s_train['lot_config'].unique()

array([3, 1, 2, 4, 5])

In [11]:
# List the distinct lot_config values left in the test frame after encoding.
s_test['lot_config'].unique()

array([1, 3, 2, 4, 5])

In [12]:
# Lower-case the lot_shape labels in both frames so they match the lower-case keys
# used for the numeric encoding. .str.lower() keeps missing values as NaN,
# whereas calling x.lower() on a NaN cell raises AttributeError.
for frame in (s_train, s_test):
    frame['lot_shape'] = frame['lot_shape'].str.lower()

In [13]:
# Numeric code for each lot shape label; cells still missing become 0.
lot_shape_codes = {'reg':1, 'ir1':2, 'ir2':3, 'ir3':4}
for frame in (s_train, s_test):
    frame[['lot_shape']] = frame[['lot_shape']].replace(lot_shape_codes).fillna(0)

In [14]:
# Count missing values remaining in the training lot_config column.
s_train[['lot_config']].isnull().sum()

lot_config    0
dtype: int64

In [15]:
# Keep only the lot-related columns from each frame and fill any gaps with 0.
# NOTE(review): lot_frontage has missing values in the raw data, so those rows end
# up with a frontage of 0 here — confirm that is the intended imputation.
lot_cols = ['lot_frontage', 'lot_area', 'lot_config', 'lot_shape']
lots_train_filtered = s_train.filter(items=lot_cols).fillna(0)
lots_test_filtered = s_test.filter(items=lot_cols).fillna(0)

In [16]:
# Display the filtered training lot features.
lots_train_filtered

Unnamed: 0,lot_frontage,lot_area,lot_config,lot_shape
0,0.0,13517,3,2
1,43.0,11492,3,2
2,68.0,7922,1,1
3,73.0,9802,1,1
4,82.0,14235,1,2
...,...,...,...,...
2046,79.0,11449,1,2
2047,0.0,12342,1,2
2048,57.0,7558,1,1
2049,80.0,10400,2,1


In [29]:
# Start with a deliberately small model that uses only the filtered lot columns,
# and print summary statistics for the training and test feature frames.
train_feat, test_feat = lots_train_filtered, lots_test_filtered
for feat in (train_feat, test_feat):
    print(feat.describe())

       lot_frontage       lot_area   lot_config    lot_shape
count   2051.000000    2051.000000  2051.000000  2051.000000
mean      57.944417   10065.208191     1.402730     1.404193
std       33.137332    6742.488909     0.772498     0.566514
min        0.000000    1300.000000     1.000000     1.000000
25%       43.500000    7500.000000     1.000000     1.000000
50%       63.000000    9430.000000     1.000000     1.000000
75%       78.000000   11513.500000     2.000000     2.000000
max      313.000000  159000.000000     5.000000     4.000000
       lot_frontage       lot_area  lot_config   lot_shape
count    878.000000     878.000000  878.000000  878.000000
mean      56.872437   10307.033030    1.405467    1.397494
std       34.269502   10002.674602    0.769763    0.579264
min        0.000000    1477.000000    1.000000    1.000000
25%       40.000000    7297.250000    1.000000    1.000000
50%       60.000000    9446.000000    1.000000    1.000000
75%       78.000000   11589.000000    

In [30]:
# Give every test row the mean training sale price as its saleprice value.
# NOTE(review): this looks like a placeholder target (all test rows share one
# value), so any score computed against it would not measure real accuracy — confirm.
s_test['saleprice'] = s_train['saleprice'].mean()
s_test['saleprice']

0      181469.701609
1      181469.701609
2      181469.701609
3      181469.701609
4      181469.701609
           ...      
873    181469.701609
874    181469.701609
875    181469.701609
876    181469.701609
877    181469.701609
Name: saleprice, Length: 878, dtype: float64

In [31]:
# train_feat = s_train[['']]
# test_feat = s_test[['']]
# print(train_feat)
# print(test_feat)

In [32]:
# Feature matrices for the model; print each shape as (rows, columns).
X_train, X_test = train_feat, test_feat
for features in (X_train, X_test):
    print(features.shape)

(2051, 4)
(878, 4)


In [33]:
# Target vectors taken from the saleprice column of each frame; print each shape.
y_train, y_test = s_train['saleprice'], s_test['saleprice']
for target in (y_train, y_test):
    print(target.shape)

(2051,)
(878,)


In [34]:
# Display the test target series (the saleprice column of s_test).
y_test

0      181469.701609
1      181469.701609
2      181469.701609
3      181469.701609
4      181469.701609
           ...      
873    181469.701609
874    181469.701609
875    181469.701609
876    181469.701609
877    181469.701609
Name: saleprice, Length: 878, dtype: float64

In [35]:
# Instantiate a linear regression estimator with its default settings.
lr = LinearRegression()

In [36]:
# Fit the linear regression on the training features (X_train) against the
# training sale prices (y_train).
lr.fit(X_train, y_train)

LinearRegression()

In [37]:
# Predict a sale price for every row of the test feature matrix.
y_pred = lr.predict(X_test)

In [38]:
# Display the array of predicted sale prices.
y_pred

array([169010.70720279, 174268.59476673, 218837.62995018, 163251.04273293,
       173917.03674658, 129753.8323556 , 159322.49092677, 176125.29598425,
       142099.97367604, 174083.8300682 , 155432.40391966, 161722.8274231 ,
       176655.41188227, 132671.46701169, 212580.4660709 , 164422.90280011,
       168198.89634988, 169451.70100267, 177998.57475293, 214335.08129935,
       171195.71725726, 173081.08692088, 157248.78702378, 202529.03938664,
       176741.62334754, 198281.41388934, 184464.57646226, 160729.18844549,
       173658.35228192, 147947.22904933, 169229.59647789, 154383.31775477,
       211876.56779561, 176262.48576452, 176342.91586436, 210374.19613171,
       158077.04054746, 134893.66091685, 172254.43051571, 204957.19053552,
       144872.45761591, 178972.93892527, 169480.56713766, 183706.10328643,
       184637.87128502, 128979.10264453, 211033.63797906, 163024.89416441,
       164355.6826185 , 169723.61959604, 161325.50359715, 159392.85692445,
       205429.62431496, 1

In [39]:
# Cross-validate the linear model on the training data; one score is returned per fold.
cross_val_score(lr, X_train, y_train)

array([0.17842145, 0.0535952 , 0.15745381, 0.15500569, 0.17520191])