In [959]:
## Predicting house sale prices in Ames, Iowa
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold


In [960]:
housing = pd.read_csv('AmesHousing.tsv', delimiter = '\t')

In [961]:
housing.head(5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [962]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
Order              2930 non-null int64
PID                2930 non-null int64
MS SubClass        2930 non-null int64
MS Zoning          2930 non-null object
Lot Frontage       2440 non-null float64
Lot Area           2930 non-null int64
Street             2930 non-null object
Alley              198 non-null object
Lot Shape          2930 non-null object
Land Contour       2930 non-null object
Utilities          2930 non-null object
Lot Config         2930 non-null object
Land Slope         2930 non-null object
Neighborhood       2930 non-null object
Condition 1        2930 non-null object
Condition 2        2930 non-null object
Bldg Type          2930 non-null object
House Style        2930 non-null object
Overall Qual       2930 non-null int64
Overall Cond       2930 non-null int64
Year Built         2930 non-null int64
Year Remod/Add     2930 non-null int64
Roof Style         29

In [963]:
housing.isnull().sum()

Order                0
PID                  0
MS SubClass          0
MS Zoning            0
Lot Frontage       490
Lot Area             0
Street               0
Alley             2732
Lot Shape            0
Land Contour         0
Utilities            0
Lot Config           0
Land Slope           0
Neighborhood         0
Condition 1          0
Condition 2          0
Bldg Type            0
House Style          0
Overall Qual         0
Overall Cond         0
Year Built           0
Year Remod/Add       0
Roof Style           0
Roof Matl            0
Exterior 1st         0
Exterior 2nd         0
Mas Vnr Type        23
Mas Vnr Area        23
Exter Qual           0
Exter Cond           0
                  ... 
Bedroom AbvGr        0
Kitchen AbvGr        0
Kitchen Qual         0
TotRms AbvGrd        0
Functional           0
Fireplaces           0
Fireplace Qu      1422
Garage Type        157
Garage Yr Blt      159
Garage Finish      159
Garage Cars          1
Garage Area          1
Garage Qual

In [964]:
housing.shape

(2930, 82)

In [965]:
def transform_features(df):
    return df

def select_features(df):
    return df[['Gr Liv Area', 'SalePrice']]
    
def train_and_test(df):
    train = df.iloc[:1460]
    test = df.iloc[1460:]
    
    model = LinearRegression()
    train = select_features(train).select_dtypes(include=['integer', 'float'])
    test = select_features(test).select_dtypes(include=['integer', 'float'])
    features = train.columns.drop('SalePrice')
    model.fit(train[features], train['SalePrice'])
    predictions = model.predict(test[features])
    
    mse = mean_squared_error(predictions, test['SalePrice'])
    rmse = np.sqrt(mse)
    return rmse

In [966]:
rmse = train_and_test(housing)

In [967]:
print(rmse)

57088.25161263909


In [968]:
categorical_columns = ['MS SubClass', 'MS Zoning', 'Street', 
                       'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 
                       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 
                       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 
                       'Overall Cond', 'Roof Style', 'Roof Matl', 'Exterior 1st', 
                       'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond',
                      'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1',
                      'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual',
                      'Functional', 'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual', 
                      'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature']


In [969]:
def transform_features(df):
    # remove columns that leak information about the sale
    df.drop(columns=['Mo Sold', 'Yr Sold', 'Sale Type', 'Sale Condition'], inplace=True) 

    # transform appropriate columns into categorical
    for col in categorical_columns:
        df[col] = df[col].astype('category')

    # get rid of the categorical columns that dont have enough variance
    # a value that takes up more than 50% of the total values
    not_enuf_variance = []
    len_df = len(df)
    for col in df.select_dtypes(include=['category']).columns:
        max_count = df[col].value_counts().head(1)[0]
        if(max_count > (0.5 * len_df)):
            not_enuf_variance.append(col)
    
    df.drop(columns=not_enuf_variance, inplace=True)
    
    # drop columns with 5% null values
    threshold=len(df) - (int(len(df) * 0.05))
    df.dropna(how='all', inplace=True, axis=1, thresh=threshold)
    
    # for null values in categorical columns replace with mode()
    # get dummies of all categorical columns
    cat_col = df.select_dtypes(include=['category']).columns
    for col in cat_col:
        df[col] = df[col].fillna(df[col].mode()[0])
        
    # for null values in numerical columns replace with mean()
    num_col = df.select_dtypes(include=['int', 'float']).columns
    for col in num_col:
        df[col] = df[col].fillna(df[col].mean())
    
    # create new features by combining features
    df['years_until_remod'] = df['Year Remod/Add'] - df['Year Built']
    # remove years related columns
    df.drop(columns=['Year Remod/Add', 'Year Built'], inplace=True)
    # remove Order and PID columns
    df.drop(columns=['Order', 'PID'], inplace=True)

    # remove columns with quite a few zeros
    df.drop(columns=['Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val'], inplace=True)

    return df

def select_features(df):
    # remove columns with correlation of < 0.35
    corr_ = df.corr()['SalePrice'].abs() < 0.35
    df.drop(columns=corr_[corr_].index, inplace=True)
    
    df.drop(columns=['Exterior 1st', 'Exterior 2nd'], inplace=True)
    
    # convert categorical columns to get_dummies()
    cat_col = df.select_dtypes(include=['category']).columns
    
    df = pd.concat([df, pd.get_dummies(df.select_dtypes(include=['category']))], axis=1).drop(cat_col,axis=1)
    
    return df
    
def train_and_test(df, k):
    
    features = df.columns.drop("SalePrice")
    lr = LinearRegression()
    
    if k == 0:
        train = df[:1460]
        test = df[1460:]

        lr.fit(train[features], train["SalePrice"])
        predictions = lr.predict(test[features])
        mse = mean_squared_error(test["SalePrice"], predictions)
        rmse = np.sqrt(mse)

        return rmse
    
    if k == 1:
        # Randomize *all* rows (frac=1) from `df` and return
        shuffled_df = df.sample(frac=1, )
        train = df[:1460]
        test = df[1460:]
        
        lr.fit(train[features], train["SalePrice"])
        predictions_one = lr.predict(test[features])        
        
        mse_one = mean_squared_error(test["SalePrice"], predictions_one)
        rmse_one = np.sqrt(mse_one)
        
        lr.fit(test[features], test["SalePrice"])
        predictions_two = lr.predict(train[features])        
       
        mse_two = mean_squared_error(train["SalePrice"], predictions_two)
        rmse_two = np.sqrt(mse_two)
        
        avg_rmse = np.mean([rmse_one, rmse_two])
        print(rmse_one)
        print(rmse_two)
        return avg_rmse
    else:
        kf = KFold(n_splits=k, shuffle=True)
        rmse_values = []
        for train_index, test_index, in kf.split(df):
            train = df.iloc[train_index]
            test = df.iloc[test_index]
            lr.fit(train[features], train["SalePrice"])
            predictions = lr.predict(test[features])
            mse = mean_squared_error(test["SalePrice"], predictions)
            rmse = np.sqrt(mse)
            rmse_values.append(rmse)
        print(rmse_values)
        avg_rmse = np.mean(rmse_values)
        return avg_rmse

In [970]:
housing_tr = transform_features(housing)
housing_sel = select_features(housing_tr)
for k in range(1, 10):
    print("k is " + str(k))
    rmse = train_and_test(housing_sel, k)
    print('rmse is ' + str(rmse))
    print('----------------------------')

k is 1
35467.912915133136
28206.894859728072
rmse is 31837.403887430606
----------------------------
k is 2
[28206.413087809182, 31113.06775379122]
rmse is 29659.7404208002
----------------------------
k is 3
[30039.886188657012, 29128.899051833, 29266.278482901787]
rmse is 29478.35457446393
----------------------------
k is 4
[25468.8740167843, 34974.86601290404, 31172.421686624977, 26775.9085645328]
rmse is 29598.01757021153
----------------------------
k is 5
[29452.562093297543, 33890.560821080384, 30522.51365862367, 26082.878166336093, 29633.664428127595]
rmse is 29916.435833493062
----------------------------
k is 6
[27227.54478165712, 28478.810573543997, 28074.130855662803, 28591.141266312916, 34060.56063172048, 33185.00150523392]
rmse is 29936.198269021872
----------------------------
k is 7
[32355.726568822643, 27728.831692134318, 30164.522881442637, 27271.719163033973, 37713.76436288353, 24593.69827278736, 26656.117317496326]
rmse is 29497.76860837154
------------------------

In [971]:
print(rmse)

29442.664227785383
