In [45]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [46]:
# Load the raw table from the tab-separated file AmesHousing.tsv (path is relative
# to the notebook's working directory). `data` is the input to transform_features.
data = pd.read_csv('AmesHousing.tsv', sep = '\t')

In [47]:
def transform_features(df):
    """Clean the raw housing frame and derive the two age features.

    Steps, in order:
      1. Drop every column whose share of missing values is >= 5 %.
      2. Drop every remaining text (object) column that still has a missing value.
      3. Fill missing cells of the remaining numeric columns with that column's
         mode (the smallest one when there are ties).
      4. Add ``Years Before Sale`` (``Yr Sold - Year Built``) and
         ``Years Since Remod`` (``Yr Sold - Year Remod/Add``).
      5. Drop the rows for which either derived age is negative.
      6. Drop ``PID``, ``Order``, ``Mo Sold``, ``Sale Condition``, ``Sale Type``
         and the two year columns already consumed by the derived features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Must contain ``Yr Sold``, ``Year Built``, ``Year Remod/Add``
        and every column listed in step 6.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing (including a required text
        column that was removed in step 1 or 2 because of missing values).
    """
    ## Drop columns with 5 percent or more missing data
    missing_share = df.isnull().sum() / len(df)
    df = df.drop(missing_share[missing_share >= 0.05].index, axis=1)

    ## Drop text columns with missing data
    text_missing = df.select_dtypes(include=['object']).isnull().sum()
    df = df.drop(text_missing[text_missing > 0].index, axis=1)

    ## Fill the null numeric cells with the mode of that column
    numeric_missing = df.select_dtypes(include=['integer', 'float']).isnull().sum()
    cols_to_fill = numeric_missing[numeric_missing > 0].index
    # Guard: with nothing to fill, mode() on an empty column selection yields no
    # rows and taking the first record would raise IndexError.
    if len(cols_to_fill) > 0:
        mode_value = df[cols_to_fill].mode().to_dict(orient='records')[0]
        df = df.fillna(mode_value)

    ## Derive the age features (assign returns a new frame)
    df = df.assign(**{
        'Years Before Sale': df['Yr Sold'] - df['Year Built'],
        'Years Since Remod': df['Yr Sold'] - df['Year Remod/Add'],
    })

    ## Drop rows with an impossible (negative) age
    # The rows used to be removed by the hard-coded labels 1702, 2180 and 2181,
    # which raises KeyError on any frame that lacks those labels.
    # NOTE(review): those labels are presumably the negative-age rows of
    # AmesHousing.tsv -- confirm this rule removes exactly the same rows there.
    negative_age = (df['Years Before Sale'] < 0) | (df['Years Since Remod'] < 0)
    df = df[~negative_age]

    df = df.drop(["PID", "Order", "Mo Sold", "Sale Condition", "Sale Type", "Year Built", "Year Remod/Add"], axis=1)

    return df

In [48]:
def select_features(df, coeff_threshold=0.4, uniq_threshold=10):
    """Keep the columns worth modelling and dummy-code the categorical ones.

    Steps, in order:
      1. Drop every numeric column whose absolute correlation with
         ``SalePrice`` is below ``coeff_threshold``.
      2. Among the known nominal columns still present, drop those with more
         than ``uniq_threshold`` distinct values.
      3. Convert the remaining text columns to ``category`` dtype and append one
         indicator column per category. The categorical columns themselves are
         kept in the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``SalePrice`` column.
    coeff_threshold : float, default 0.4
        Minimum absolute correlation with ``SalePrice`` for a numeric column to
        be kept.
    uniq_threshold : int, default 10
        Maximum number of distinct values a nominal column may have.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If ``SalePrice`` is not among the numeric columns.
    """
    numerical_df = df.select_dtypes(include=['integer', 'float'])
    corr_with_price = numerical_df.corr()['SalePrice'].abs()

    ## Drop columns with low correlation with SalePrice column
    # A NaN correlation (e.g. a constant column) compares False and is kept.
    weak_cols = corr_with_price[corr_with_price < coeff_threshold].index
    df = df.drop(weak_cols, axis=1)

    ## Drop categorical columns with too many unique values
    nominals = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood", 
                "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st", 
                "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type", 
                "Misc Feature", "Sale Type", "Sale Condition"]
    present_nominals = [col for col in nominals if col in df.columns]
    unique_cat_counts = df[present_nominals].nunique()
    too_many_levels = unique_cat_counts[unique_cat_counts > uniq_threshold].index
    df = df.drop(too_many_levels, axis=1)

    ## Dummy code the categorical columns
    text_cols = df.select_dtypes(include=['object']).columns
    df = df.astype({col: 'category' for col in text_cols})
    categorical = df.select_dtypes(include=['category'])
    # Skip the encoding entirely when there is nothing to encode.
    if categorical.shape[1] > 0:
        # Ask for an integer dtype explicitly: pandas >= 2.0 defaults to bool
        # indicators, which an ['integer', 'float'] dtype selection (as used by
        # train_and_test) leaves out. uint8 was the default before pandas 2.0.
        dummies = pd.get_dummies(categorical, dtype=np.uint8)
        df = pd.concat([df, dummies], axis=1)

    return df

In [49]:
def _holdout_rmse(model, train, test, features):
    """Fit ``model`` on ``train`` and return its RMSE for ``SalePrice`` on ``test``."""
    model.fit(train[features], train["SalePrice"])
    predictions = model.predict(test[features])
    return np.sqrt(mean_squared_error(test["SalePrice"], predictions))


def train_and_test(df, k=0, random_state=None):
    """Fit a linear regression on the numeric columns and return its RMSE.

    The features are all integer/float columns of ``df`` except ``SalePrice``,
    which is the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``SalePrice`` column.
    k : int, default 0
        Validation scheme:

        * ``0`` -- holdout: train on the first 1460 rows, test on the rest.
        * ``1`` -- shuffle the rows, split at row 1460, train/test in both
          directions, print both RMSE values and return their mean.
        * anything else -- shuffled ``KFold`` with ``n_splits=k``; prints the
          list of per-fold RMSE values and returns their mean.
    random_state : int or None, default None
        Seed passed to the row shuffle (``k == 1``) and to ``KFold``
        (other non-zero ``k``) so results can be reproduced. ``None`` keeps the
        unseeded behaviour.

    Returns
    -------
    float
        The RMSE (``k == 0``) or the mean RMSE over the splits.
    """
    numeric_df = df.select_dtypes(include=['integer', 'float'])
    features = numeric_df.columns.drop("SalePrice")
    lr = linear_model.LinearRegression()

    if k == 0:
        return _holdout_rmse(lr, df[:1460], df[1460:], features)

    if k == 1:
        # Randomize *all* rows (frac=1), then split the SHUFFLED frame; slicing
        # the unshuffled `df` here would make this identical to the k == 0 split.
        shuffled_df = df.sample(frac=1, random_state=random_state)
        train = shuffled_df[:1460]
        test = shuffled_df[1460:]

        rmse_one = _holdout_rmse(lr, train, test, features)
        # Swap the roles of the two halves for the second estimate.
        rmse_two = _holdout_rmse(lr, test, train, features)

        avg_rmse = np.mean([rmse_one, rmse_two])
        print(rmse_one)
        print(rmse_two)
        return avg_rmse

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    rmse_values = []
    for train_index, test_index in kf.split(df):
        fold_rmse = _holdout_rmse(lr, df.iloc[train_index], df.iloc[test_index], features)
        rmse_values.append(fold_rmse)
    print(rmse_values)
    avg_rmse = np.mean(rmse_values)
    return avg_rmse

In [55]:
# Run the whole pipeline on the raw frame: clean it, select features, evaluate.
transformed_data = transform_features(data)
filtered_data = select_features(transformed_data)
# k=5 takes the KFold branch of train_and_test: it prints the list of per-fold
# RMSE values and returns their mean, which is the value this cell displays.
train_and_test(filtered_data, k=5)

[25298.114110108094, 40669.40751608915, 25826.213061933526, 26082.601894184918, 26784.39812513104]


28932.146941489344