# Predicting House Sale Prices

## Imports

In [1]:
import pandas as pd
pd.options.display.max_columns = 999
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import KFold

## Functions

In [48]:
def drop_above_cutoff_null_columns(df, cutoff):
    """Return a copy of ``df`` without its sparsely populated columns.

    A column is removed when its number of missing values is at least
    ``cutoff`` times the number of rows in ``df``.

    Args:
        df: Input DataFrame; it is not modified.
        cutoff: Fraction of the row count used as the null-count threshold.

    Returns:
        A new DataFrame with the offending columns dropped.
    """
    threshold = len(df) * cutoff
    missing_per_column = df.isna().sum()
    too_sparse = [
        column
        for column, missing in missing_per_column.items()
        if missing >= threshold
    ]
    return df.drop(columns=too_sparse)

def drop_above_cutoff_nulls_in_text_columns(df, cutoff=1):
    """Return a copy of ``df`` without text columns that have too many nulls.

    Only object-dtype columns are inspected. A text column is removed when it
    contains at least ``cutoff`` missing values. With the default of 1 this
    drops every text column that has any missing value at all.

    Args:
        df: Input DataFrame; it is not modified.
        cutoff: Minimum number of missing values (a count, not a fraction)
            that causes a text column to be dropped.

    Returns:
        A new DataFrame with the offending text columns dropped.
    """
    null_counts = df.select_dtypes(include=["object"]).isnull().sum()
    # Honour the caller's cutoff instead of a hard-coded "> 0"; for the
    # default cutoff=1 the two conditions select exactly the same columns.
    above_cutoff_text_cols = null_counts[null_counts >= cutoff].index
    return df.drop(above_cutoff_text_cols, axis=1)

def fill_numerical_values(df, cutoff=0.25):
    """Fill missing values in numeric columns with the column mean.

    Only int/float columns whose number of missing values is below
    ``cutoff`` times the number of rows are imputed; columns with more
    missing values are left untouched.

    Args:
        df: DataFrame to impute. It is modified in place.
        cutoff: Fraction of the row count; a column is imputed only when its
            null count is strictly below this share of the rows.

    Returns:
        The same DataFrame object that was passed in, after imputation.
    """
    null_counts = df.select_dtypes(include=["int", "float"]).isnull().sum()
    # The threshold is a share of the ROWS (shape[0]). Using the column count
    # here made the limit tiny and left nearly every column un-imputed.
    max_nulls = df.shape[0] * cutoff
    cols = null_counts[null_counts < max_nulls].index
    for col in cols:
        df[col] = df[col].fillna(df[col].mean())
    return df

def transform_features(df, cutoff=0.25):
    """Clean ``df`` by dropping sparse columns and imputing numeric nulls.

    Steps, in order:
      1. Drop columns whose share of missing values reaches ``cutoff``.
      2. Drop text columns that contain at least one missing value.
      3. Fill remaining numeric nulls via ``fill_numerical_values``.

    Args:
        df: Raw input DataFrame.
        cutoff: Fraction of rows used as the null threshold in step 1.

    Returns:
        The cleaned DataFrame.
    """
    # The drop helpers return new frames rather than mutating their input, so
    # each result must be captured; otherwise the drops are silently lost.
    df = drop_above_cutoff_null_columns(df, cutoff)
    df = drop_above_cutoff_nulls_in_text_columns(df, 1)
    return fill_numerical_values(df)

def select_features(df):
    """Return only the columns used for modelling.

    Args:
        df: DataFrame containing at least 'Gr Liv Area' and 'SalePrice'.

    Returns:
        A DataFrame with the 'Gr Liv Area' and 'SalePrice' columns, in
        that order.
    """
    kept_columns = ['Gr Liv Area', 'SalePrice']
    return df[kept_columns]

def train_and_test(df, train_rows=1460):
    """Fit a linear regression on a holdout split and return the test RMSE.

    The first ``train_rows`` rows of ``df`` form the training set and the
    remaining rows form the test set. All integer/float columns except
    'SalePrice' are used as features; 'SalePrice' is the target.

    Args:
        df: DataFrame containing a 'SalePrice' column plus feature columns.
        train_rows: Number of leading rows used for training.

    Returns:
        Root mean squared error of the predictions on the test rows.

    Raises:
        ValueError: If either the training or the test partition is empty.
    """
    train = df[:train_rows]
    test = df[train_rows:]
    if train.empty or test.empty:
        raise ValueError(
            "train_and_test needs rows on both sides of the split "
            f"(got {len(train)} train / {len(test)} test rows)"
        )

    # Feature names come from the training rows; the same column names are
    # then used to index the test rows.
    numeric_train = train.select_dtypes(include=['integer', 'float'])
    features = numeric_train.columns.drop("SalePrice")

    lr = linear_model.LinearRegression()
    lr.fit(train[features], train["SalePrice"])

    predictions = lr.predict(test[features])
    mse = mean_squared_error(test["SalePrice"], predictions)
    return np.sqrt(mse)


In [49]:
# Load the Ames housing data; the file is tab-separated.
df = pd.read_csv("AmesHousing.tsv", sep="\t")

In [50]:
# Clean the raw frame, then keep only the columns returned by select_features.
transformed_df = transform_features(df)
filtered_df = select_features(transformed_df)

In [51]:
# Fit on the leading rows, score on the remainder, and print the holdout RMSE.
rmse = train_and_test(filtered_df)
print(rmse)

57088.25161263909
