In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

# Load the tab-separated Ames housing file, then split it by row position:
# the first 1460 rows form the training set and the remainder the test set.
data = pd.read_csv("AmesHousing.txt", sep="\t")
train, test = data.iloc[:1460], data.iloc[1460:]

In [20]:
def transform_features(df, reference_df=None):
    """Clean and numerically encode a frame of Ames housing rows.

    The input is never modified: all work happens on a copy, and every step
    re-assigns its result instead of relying on ``inplace=True`` (chained
    in-place calls such as ``frame[col].fillna(..., inplace=True)`` are
    deprecated and silently do nothing under pandas copy-on-write).

    Steps, in order:
      1. Drop identifier / sale-related columns.
      2. Drop every column that has more than 25% missing values in
         ``reference_df``.
      3. Drop a fixed list of nominal columns.
      4. Fill missing values in the remaining numeric columns (except the
         year columns and ``SalePrice``) with the column mean, then divide
         each of those columns by its maximum.
      5. Replace the year columns by age / remodel-gap features measured
         against the year 2010.
      6. Encode selected ordinal columns as floats.
      7. Drop whatever non-numeric columns are left.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to transform. Must contain all columns named in this function;
        a missing one raises ``KeyError``.
    reference_df : pandas.DataFrame, optional
        Frame whose missing-value rates decide which columns are dropped in
        step 2. When omitted, the notebook-level ``train`` frame is used,
        which is what the single-argument call has always done.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``int64`` / ``float64`` columns.
        ``Functional``, ``Utilities`` and ``Central Air`` may still contain
        NaN for labels that are not in their maps.
    """
    if reference_df is None:
        # Backward-compatible default: the training split defined at
        # notebook level.
        reference_df = train

    # Independent copy so the caller's frame (often a slice of `data`) is
    # left untouched.
    trans_df = df.copy()

    # Dropping columns which should be deleted
    trans_df = trans_df.drop(
        columns=['PID', 'Order', 'Yr Sold', 'Mo Sold', 'Sale Type', 'Sale Condition'])

    # Finding columns with more than 25% missing values in the reference frame
    mth = 0.25
    null_counts = reference_df.isnull().sum()
    missing_cols = null_counts[null_counts > mth * reference_df.shape[0]].index.tolist()
    trans_df = trans_df.drop(columns=missing_cols)

    # Dropping nominal columns which seem to have too many categories
    tm_nominal = ['MS SubClass', 'Land Contour', 'Lot Config', 'Neighborhood',
                  'Condition 1', 'Condition 2', 'Exterior 1st', 'Exterior 2nd',
                  'Electrical', 'Heating', 'BsmtFin Type 2', 'BsmtFin SF 2',
                  'Bsmt Exposure', 'Garage Type', 'BsmtFin Type 1', 'Mas Vnr Type',
                  'Street', 'House Style']
    trans_df = trans_df.drop(columns=tm_nominal)

    # Normalizing numerical columns. The year columns are turned into ages
    # below and SalePrice is the target, so those four are left out.
    numerical_cols = (
        trans_df.select_dtypes(include=['int64', 'float64'])
        .columns
        .drop(['Year Built', 'Year Remod/Add', 'Garage Yr Blt', 'SalePrice'])
    )
    # Filling with a Series only touches the columns named in its index.
    trans_df = trans_df.fillna(trans_df[numerical_cols].mean())
    trans_df[numerical_cols] = trans_df[numerical_cols] / trans_df[numerical_cols].max()

    # Year manipulation (ages are measured against 2010)
    reference_year = 2010
    trans_df['Building Age'] = reference_year - trans_df['Year Built']
    trans_df['Garage Age'] = reference_year - trans_df['Garage Yr Blt']
    trans_df['years_until_remod'] = trans_df['Year Remod/Add'] - trans_df['Year Built']
    trans_df['years_from_remod'] = reference_year - trans_df['Year Remod/Add']
    trans_df = trans_df.drop(columns=['Year Built', 'Garage Yr Blt', 'Year Remod/Add'])
    # 'Garage Yr Blt' was not mean-filled above, so its gaps surface here.
    trans_df['Garage Age'] = trans_df['Garage Age'].fillna(trans_df['Garage Age'].mean())

    # Mapping some ordinal columns. Any label that is not a key of the map
    # (missing values included) ends up as 0.0.
    cols_to_map = ['Exter Qual', 'Exter Cond', 'Bsmt Qual', 'Bsmt Cond',
                   'Heating QC', 'Kitchen Qual', 'Garage Qual', 'Garage Cond']
    maps = {'Ex': 1.0, 'Gd': 0.75, 'TA': 0.5, 'Fa': 0.3, 'NA': 0.0}
    for col in cols_to_map:
        trans_df[col] = trans_df[col].map(maps).fillna(0.0)

    maps = {'Fin': 1.0, 'RFn': 0.5, 'Unf': 0.25, 'NA': 0.0}
    trans_df['Garage Finish'] = trans_df['Garage Finish'].map(maps).fillna(0.0)

    # The next three encodings are not followed by a fill: unmapped labels
    # stay NaN.
    maps = {'Typ': 1.0, 'Min1': 0.8, 'Min2': 0.6, 'Mod': 0.5, 'Maj1': 0.35, 'Maj2':0.2, 'Sev': 0.1, 'Sal': 0.0}
    trans_df['Functional'] = trans_df['Functional'].map(maps)
    maps = {'AllPub':1.0, 'NoSewr':0.8, 'NoSeWa':0.5, 'ELO':0.25}
    trans_df['Utilities'] = trans_df['Utilities'].map(maps)
    trans_df['Central Air'] = trans_df['Central Air'].map({'N':0.0, 'Y':1.0})

    # Temp (dropping the remaining nominal columns, which seem to be not that important)
    leftover_cols = trans_df.select_dtypes(exclude=['float64', 'int64']).columns
    return trans_df.drop(columns=leftover_cols)

In [21]:
def select_features(df, threshold=0.5):
    """Keep the columns whose correlation with ``SalePrice`` is strong enough.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame that contains a ``SalePrice`` column.
    threshold : float, default 0.5
        A column is kept only when the absolute value of its correlation
        with ``SalePrice`` is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``df``, ordered from strongest to weakest
        absolute correlation. ``SalePrice`` itself is never included, and
        neither are ``Full Bath`` or ``Garage Area``.
    """
    abs_corrs = df.corr()['SalePrice'].abs().sort_values(ascending=False)
    # 'Full Bath' and 'Garage Area' are always excluded (NOTE(review): the
    # reason is not recorded here - presumably overlap with other retained
    # features; confirm). errors='ignore' keeps the function usable on
    # frames that do not carry those two columns.
    abs_corrs = abs_corrs.drop(['SalePrice', 'Full Bath', 'Garage Area'], errors='ignore')
    features = abs_corrs[abs_corrs > threshold].index
    return df[features]

In [22]:
from sklearn.model_selection import KFold, cross_val_score
from  sklearn.linear_model import LinearRegression

def train_and_test(cv_type):
    """Fit a linear regression on the housing data and report RMSE.

    Parameters
    ----------
    cv_type : str
        ``'kfold'``  - 10-fold shuffled cross-validation over the whole
        ``data`` frame.
        ``'simple'`` - two hold-out runs on the ``train`` / ``test`` split:
        fit on train and score on test, then fit on test and score on train.

    Returns
    -------
    tuple, list or str
        ``'kfold'``: ``(scores, avg_score)``, an array with one RMSE per fold
        and their mean.
        ``'simple'``: ``[rmse_train_to_test, rmse_test_to_train]``.
        Any other value: the string ``"Wrong CV type!"``.
    """
    target = 'SalePrice'

    if cv_type == 'kfold':
        k = 10
        df = transform_features(data)
        features = select_features(df).columns
        # Pass the seed straight to KFold. np.random.seed() returns None, so
        # storing its result and handing it over as random_state only worked
        # by mutating NumPy's global RNG as a side effect.
        kf = KFold(n_splits=k, shuffle=True, random_state=1)
        # cross_val_score fits its own clones of the estimator, so no
        # separate fit on the full frame is needed beforehand.
        scores = cross_val_score(LinearRegression(), df[features], df[target],
                                 cv=kf, scoring='neg_mean_squared_error')
        # The scorer returns negated MSE values; convert them to RMSE.
        scores = np.sqrt(np.absolute(scores))
        return scores, scores.mean()

    if cv_type == 'simple':
        train_df = transform_features(train)
        test_df = transform_features(test)
        # Features are chosen from the training split only and reused for
        # both directions.
        features = select_features(train_df).columns

        rmses = []
        for fit_df, eval_df in ((train_df, test_df), (test_df, train_df)):
            lr = LinearRegression()
            lr.fit(fit_df[features], fit_df[target])
            errors = lr.predict(eval_df[features]) - eval_df[target]
            mse = np.sum(errors ** 2) / len(eval_df)
            # np.sqrt rather than **(1/2), which is **0 under integer division.
            rmses.append(np.sqrt(mse))
        return rmses

    return "Wrong CV type!"

In [23]:

# Hold-out evaluation: RMSE when fitting on train and scoring on test, then the reverse.
train_and_test('simple')

[64682.52121124532, 68990.66182710539]

In [24]:
# 10-fold cross-validation on the full dataset: per-fold RMSEs and their mean.
train_and_test('kfold')

(array([ 29584.40481516,  29892.42984898,  31677.66845456,  41216.56158272,
         44426.55895744,  29087.62388888,  26633.60588588,  32453.47835665,
         43243.32282185,  32821.34673806]), 34103.700135019011)