In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import statsmodels.api as sm
import sklearn.metrics as metrics

In [2]:
s_train = pd.read_csv('datasets/train.csv')
s_test = pd.read_csv('datasets/test.csv')

In [3]:
def clean_header(df):
    
    '''This function removes wierd characters and spaces from columns and keeps everything lowercase'''
    
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')','')

In [4]:
s_train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [5]:
# clean header script
clean_header(s_train)
clean_header(s_test)
s_train.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [6]:
# Changing like qualitative data to numeric for easier analysis 
s_train[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']] = s_train[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']].replace({'Ex':1, 'Gd':2, 'Ta':3, 'Fa':4, 'Po':5, 'NA':0})
s_test[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']] = s_test[['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'garage_qual', 'garage_cond']].replace({'Ex':1, 'Gd':2, 'Ta':3, 'Fa':4, 'Po':5, 'NA':0})
# df['outcome_dummy'] = df['outcome'].replace({"Alive":0, "Dead":1})

In [7]:
s_train['lot_config'] =  map(lambda x: x.lower(), s_train['lot_config'])
# s_test['lot_config'] = map(str.lower, s_test.lot_config)
# df['lower_desc'] = map(lambda x: x.lower(), df['Description']) print df lower() 
s_train['lot_config']

0       <map object at 0x7f4da57c2460>
1       <map object at 0x7f4da57c2460>
2       <map object at 0x7f4da57c2460>
3       <map object at 0x7f4da57c2460>
4       <map object at 0x7f4da57c2460>
                     ...              
2046    <map object at 0x7f4da57c2460>
2047    <map object at 0x7f4da57c2460>
2048    <map object at 0x7f4da57c2460>
2049    <map object at 0x7f4da57c2460>
2050    <map object at 0x7f4da57c2460>
Name: lot_config, Length: 2051, dtype: object

In [8]:
s_train[['lot_config']].replace({'Inside':1, 'Corner':2, 'CulDSac':3, 'Fr2':4, 'Fr3':5})
s_test[['lot_config']].replace({'Inside':1, 'Corner':2, 'CulDSac':3, 'Fr2':4, 'Fr3':5})

Unnamed: 0,lot_config
0,1
1,1
2,1
3,1
4,1
...,...
873,1
874,1
875,1
876,FR2


In [9]:
s_train['lot_config'].unique()

array([<map object at 0x7f4da57c2460>], dtype=object)

In [10]:
s_test['lot_config'].unique()

array(['Inside', 'CulDSac', 'Corner', 'FR2', 'FR3'], dtype=object)

In [11]:
s_train[['lot_shape']].replace({'Reg':1, 'IR1':2, 'IR2':3, 'IR3':4}).fillna(0)
s_test[['lot_shape']].replace({'Reg':1, 'IR1':2, 'IR2':3, 'IR3':4}).fillna(0)

Unnamed: 0,lot_shape
0,1
1,2
2,2
3,1
4,2
...,...
873,1
874,1
875,1
876,1


In [12]:
lots_train = s_train[['lot_frontage', 'lot_area', 'lot_config', 'lot_shape']]
lots_test = s_test[['lot_frontage', 'lot_area', 'lot_config', 'lot_shape']]

In [13]:
lots_train_grouped = lots_train.groupby(['lot_shape'])
lots_test_grouped = lots_test.groupby(['lot_shape'])

In [14]:
lots_train_grouped.mean()

Unnamed: 0_level_0,lot_frontage,lot_area
lot_shape,Unnamed: 1_level_1,Unnamed: 2_level_1
IR1,74.58209,11470.709538
IR2,59.416667,19995.454545
IR3,115.666667,21102.222222
Reg,66.968595,8815.707336


In [15]:
lots_test_grouped.mean()

Unnamed: 0_level_0,lot_frontage,lot_area
lot_shape,Unnamed: 1_level_1,Unnamed: 2_level_1
IR1,75.108696,11622.22028
IR2,91.5,16078.238095
IR3,120.0,46158.142857
Reg,66.568665,8980.265957


In [16]:
#Trying out a very small model first. 
train_feat = lots_train
test_feat = lots_test
print(train_feat.describe())
print(test_feat.describe())

       lot_frontage       lot_area
count   1721.000000    2051.000000
mean      69.055200   10065.208191
std       23.260653    6742.488909
min       21.000000    1300.000000
25%       58.000000    7500.000000
50%       68.000000    9430.000000
75%       80.000000   11513.500000
max      313.000000  159000.000000
       lot_frontage       lot_area
count    718.000000     878.000000
mean      69.545961   10307.033030
std       23.533945   10002.674602
min       21.000000    1477.000000
25%       59.000000    7297.250000
50%       68.000000    9446.000000
75%       80.000000   11589.000000
max      182.000000  215245.000000


In [17]:
s_test['saleprice'] = np.mean(s_train['saleprice'])
s_test['saleprice']

0      181469.701609
1      181469.701609
2      181469.701609
3      181469.701609
4      181469.701609
           ...      
873    181469.701609
874    181469.701609
875    181469.701609
876    181469.701609
877    181469.701609
Name: saleprice, Length: 878, dtype: float64

In [18]:
# train_feat = s_train[['']]
# test_feat = s_test[['']]
# print(train_feat)
# print(test_feat)

In [19]:
X_train = train_feat
X_test = test_feat
print(X_train.shape)
print(X_test.shape)

(2051, 4)
(878, 4)


In [20]:
y_train = s_train['saleprice']
y_test = s_test['saleprice']
print(y_train.shape)
print(y_test.shape)

(2051,)
(878,)


In [21]:
y_test

0      181469.701609
1      181469.701609
2      181469.701609
3      181469.701609
4      181469.701609
           ...      
873    181469.701609
874    181469.701609
875    181469.701609
876    181469.701609
877    181469.701609
Name: saleprice, Length: 878, dtype: float64

In [22]:
lr = LinearRegression()

In [23]:
# Fit the model on training data
lr.fit(X_train, y_train)

TypeError: float() argument must be a string or a number, not 'map'

In [None]:
y_pred = lr.predict(X_test)