In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import statsmodels.api as sm
import sklearn.metrics as metrics

In [2]:
s_train = pd.read_csv('datasets/train.csv')
s_test = pd.read_csv('datasets/test.csv')

In [3]:
def clean_header(df):
    
    '''This function removes wierd characters and spaces from columns and keeps everything lowercase'''
    
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')','')

In [4]:
# clean header script
clean_header(s_train)
clean_header(s_test)
s_train.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [27]:
s_train[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']] = s_train[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']].apply(lambda x: x.astype(str).str.lower().fillna(0))
s_test[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']] = s_test[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']].apply(lambda x: x.astype(str).str.lower().fillna(0))

In [6]:
s_test['exter_qual'].head()

0    ta
1    ta
2    gd
3    gd
4    ta
Name: exter_qual, dtype: object

In [7]:
# Changing like qualitative data to numeric for easier analysis 
s_train[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']] = s_train[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']].replace({'ex':1, 'gd':2, 'ta':3, 'fa':4, 'po':5, 'na':0, 'nan':0})
s_test[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']] = s_test[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']].replace({'ex':1, 'gd':2, 'ta':3, 'fa':4, 'po':5, 'na':0, 'nan':0})

In [8]:
s_train

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,20,RL,79.0,11449,Pave,,IR1,HLS,...,0,0,,,,0,1,2008,WD,298751
2047,785,905377130,30,RL,,12342,Pave,,IR1,Lvl,...,0,0,,,,0,3,2009,WD,82500
2048,916,909253010,50,RL,57.0,7558,Pave,,Reg,Bnk,...,0,0,,,,0,3,2009,WD,177000
2049,639,535179160,20,RL,80.0,10400,Pave,,Reg,Lvl,...,0,0,,,,0,11,2009,WD,144000


In [38]:
qual_train_filtered = pd.DataFrame(s_train[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']])
qual_test_filtered = pd.DataFrame(s_test[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']])

In [39]:
qual_train_filtered

['exter_qual',
 'exter_cond',
 'bsmt_qual',
 'bsmt_cond',
 'kitchen_qual',
 'garage_qual',
 'garage_cond']

In [40]:
type(qual_train_filtered)

list

In [41]:
#Trying out a very small model first. 
train_feat = qual_train_filtered
test_feat = qual_test_filtered
print(train_feat.describe())
print(test_feat.describe())
print(train_feat.info())

AttributeError: 'list' object has no attribute 'describe'

In [42]:
s_test['saleprice'] = np.mean(s_train['saleprice'])
s_test['saleprice']

0      181469.701609
1      181469.701609
2      181469.701609
3      181469.701609
4      181469.701609
           ...      
873    181469.701609
874    181469.701609
875    181469.701609
876    181469.701609
877    181469.701609
Name: saleprice, Length: 878, dtype: float64

In [43]:
X_train = train_feat
X_test = test_feat
print(X_train)
print(X_test)

['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']
['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']


In [16]:
y_train = s_train['saleprice']
y_test = s_test['saleprice']
print(y_train.shape)
print(y_test.shape)

(2051,)
(878,)


In [17]:
lr = LinearRegression()

In [18]:
# Fit the model on training data
lr.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [19]:
y_pred = lr.predict(X_test)

NameError: name 'X_test' is not defined

In [20]:
y_pred.shape

NameError: name 'y_pred' is not defined

In [21]:
# baseline R2
cross_val_score(lr, X_train, y_train).mean()

NameError: name 'X_train' is not defined

In [22]:
poly = PolynomialFeatures(include_bias=False)

In [None]:
poly

In [None]:
X_poly = poly.fit_transform(X_train) # This is the equivalent of these two:

In [None]:
poly.get_feature_names(qual_train_filtered)

In [None]:
# View X_poly in a DataFrame
pd.DataFrame(X_poly, columns=poly.get_feature_names(qual_train_filtered)).head()

In [None]:
cross_val_score(lr, X_poly, y_train)

In [None]:
sc = StandardScaler()
sc.fit(X_train) # Learning the means and the standard deviations for each column
print("Means:", sc.mean_)

print("Standard Deviations:", sc.scale_)

In [None]:
X_scaled = sc.transform(X_train)
X_scaled[0:5]

In [None]:
np.mean(X_scaled)

In [None]:
cross_val_score(lr, X_scaled, y_train).mean()

In [None]:
X_train.hist(figsize=(15, 15));

In [None]:
sns.pairplot(qual_train_filtered, y_vars=y_train, x_vars=X_train)