In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

def showNanPercentage(df):
    """Plot the percentage of missing values per column as a bar chart.

    Only columns containing at least one missing value are shown, ordered
    from the highest to the lowest percentage. The ``SalePrice`` column is
    excluded from the chart whenever it appears among them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The chart is rendered through matplotlib (``plt.show()``). When no
        column other than ``SalePrice`` has missing values, a short message
        is printed instead of drawing an empty chart.
    """
    # One pass over the frame; replaces the removed DataFrame.iteritems() loop.
    null_counts = df.isnull().sum()
    col_with_null = null_counts[null_counts > 0].index

    df_na = ((null_counts[col_with_null] / len(df)) * 100).sort_values(ascending=False)

    # errors="ignore": SalePrice is only present here when it actually has
    # missing values, so an unconditional drop would raise KeyError otherwise.
    df_na = df_na.drop("SalePrice", errors="ignore")

    if df_na.empty:
        print("No missing values to plot.")
        return

    ax = df_na.plot(kind='bar')
    ax.set_title("NaN percentage")
    plt.show()

In [2]:
def handle_null(df):
    """Fill the missing values of the house-price frame, column group by column group.

    The frame is modified in place (columns are reassigned on ``df``) and the
    same object is returned, so both ``handle_null(df)`` and
    ``df = handle_null(df)`` work.

    Filling rules
    -------------
    * Categorical columns in ``na_to_none_cols``: the string ``"None"``.
    * ``LotFrontage``: the median ``LotFrontage`` of the row's ``Neighborhood``.
      Rows whose neighborhood has no known frontage stay missing.
    * Numeric columns in ``na_to_0``: ``0.0``.
    * Columns in ``na_to_most_frequent``: the column's most frequent value
      (left untouched when the column has no non-missing value at all).
    * ``Functional`` -> ``'Typ'``, ``Utilities`` -> ``'AllPub'``.
    * ``GarageYrBlt`` and ``GarageArea``: the column median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns named above plus ``Neighborhood``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the missing values filled.
    """
    # Columns are reassigned (df[col] = df[col].fillna(...)) instead of using
    # df[col].fillna(..., inplace=True): the chained in-place form acts on a
    # temporary object under pandas Copy-on-Write and leaves df unchanged.

    na_to_none_cols = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
                       "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual",
                       "GarageCond", "PoolQC", "Fence", "MasVnrType", "MiscFeature"]
    for col in na_to_none_cols:
        df[col] = df[col].fillna("None")

    # Per-row neighborhood median, aligned on df's index, so the fill is a
    # single vectorised operation instead of a Python loop over every row.
    neighborhood_median = df.groupby('Neighborhood')['LotFrontage'].transform('median')
    df['LotFrontage'] = df['LotFrontage'].fillna(neighborhood_median)

    na_to_0 = ['BsmtHalfBath', 'BsmtFullBath', 'GarageCars', 'TotalBsmtSF', 'BsmtUnfSF',
               'BsmtFinSF1', 'BsmtFinSF2', 'MasVnrArea']
    for col in na_to_0:
        df[col] = df[col].fillna(0.0)

    na_to_most_frequent = ['SaleType', 'KitchenQual', 'Electrical', 'Exterior1st', 'Exterior2nd', 'MSZoning']
    for col in na_to_most_frequent:
        modes = df[col].mode()
        # mode() is empty when the whole column is missing; nothing to fill with.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    df['Functional'] = df['Functional'].fillna('Typ')
    df['Utilities'] = df['Utilities'].fillna('AllPub')

    # NOTE(review): the original author was unsure about median imputation for
    # these two garage columns; a missing value may mean "no garage", in which
    # case another fill could be more appropriate -- TODO confirm.
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['GarageYrBlt'].median())
    df['GarageArea'] = df['GarageArea'].fillna(df['GarageArea'].median())

    return df

In [3]:
from sklearn.model_selection import cross_val_score
import numpy as np

def cv_rmse(model, X, Y, kf):
    """Return the root-mean-squared error of ``model`` averaged over the CV folds.

    Each fold is scored with ``neg_mean_squared_error``; the score is negated
    and square-rooted to obtain that fold's RMSE, and the per-fold values are
    then averaged.

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``.
    X, Y : features and target passed to ``cross_val_score``.
    kf : cross-validation splitter (or fold count) used as ``cv``.
    """
    neg_mse_per_fold = cross_val_score(
        model, X, Y, scoring="neg_mean_squared_error", cv=kf
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)

    fold_count = len(rmse_per_fold)
    return sum(rmse_per_fold) / fold_count

def cv_abs(model, X, Y, kf):
    """Return the mean absolute error of ``model`` averaged over the CV folds.

    Each fold is scored with ``neg_mean_absolute_error``; the score is negated
    to obtain that fold's MAE and the per-fold values are averaged.

    Unlike the RMSE, the MAE is already expressed in the units of the target,
    so no square root is applied (an earlier version took ``np.sqrt`` of the
    MAE, which returned the square root of the error instead of the error).

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``.
    X, Y : features and target passed to ``cross_val_score``.
    kf : cross-validation splitter (or fold count) used as ``cv``.
    """
    mae_per_fold = -cross_val_score(model, X, Y, scoring="neg_mean_absolute_error", cv=kf)
    return sum(mae_per_fold) / len(mae_per_fold)

In [4]:
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

def fix_skew(df, threshold=0.5):
    """Apply a Box-Cox transform (on ``1 + x``) to the skewed numeric columns of ``df``.

    For every numeric column whose sample skewness (``scipy.stats.skew``)
    exceeds ``threshold``, the Box-Cox lambda is estimated with
    ``boxcox_normmax`` on ``column + 1`` and the column is replaced by
    ``boxcox1p(column, lambda)``. The frame is modified in place and returned.

    Columns are left untouched when:

    * their skewness is not above ``threshold`` (this includes a ``nan``
      skewness, e.g. when the column contains missing values);
    * ``column + 1`` is not strictly positive everywhere, because the
      Box-Cox transform is undefined there and ``boxcox_normmax`` would raise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform.
    threshold : float, default 0.5
        Skewness above which a column is transformed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric = [col for col in df.columns if df[col].dtype in numeric_dtypes]

    for col in numeric:
        # A nan skewness compares False and therefore skips the column.
        if not skew(df[col]) > threshold:
            continue

        shifted = df[col] + 1
        if not (shifted > 0).all():
            continue

        df[col] = boxcox1p(df[col], boxcox_normmax(shifted))

    return df

In [5]:
import math

def graph_numeric(df):
    """Scatter-plot numeric features of ``df`` against ``SalePrice`` on a two-column grid.

    Numeric columns are visited in frame order. Plotting stops at the first
    numeric column named ``'MiscVal'`` (that column and everything after it
    are not drawn), and ``SalePrice`` itself is never used as a feature.

    One axes is created per plotted feature; a leftover cell in the last row
    is hidden. Nothing is drawn when there is no feature to plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and a ``SalePrice`` column.

    Returns
    -------
    None
    """
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    features = []
    for column in df.columns:
        if df[column].dtype not in numeric_dtypes:
            continue
        if column == 'MiscVal':
            # Kept from the original implementation: MiscVal and the numeric
            # columns that follow it are deliberately left out.
            break
        if column != 'SalePrice':
            features.append(column)

    if not features:
        return

    n_cols = 2
    n_rows = math.ceil(len(features) / n_cols)
    # squeeze=False keeps axs two-dimensional even for a single row.
    fig, axs = plt.subplots(ncols=n_cols, nrows=n_rows,
                            figsize=(12, 5 * n_rows), squeeze=False)
    axes = axs.ravel()

    # Draw on the axes created above; calling plt.subplot() with another grid
    # shape here would replace them with overlapping axes.
    for ax, feature in zip(axes, features):
        sns.scatterplot(x=feature, y='SalePrice', hue='SalePrice', palette='Blues',
                        data=df, ax=ax)
        ax.set_xlabel('{}'.format(feature), size=15, labelpad=12.5)
        ax.set_ylabel('SalePrice', size=15, labelpad=12.5)

    for unused in axes[len(features):]:
        unused.set_visible(False)

    fig.tight_layout()
    

In [6]:
def show_corr(frame, features):
    """Draw a horizontal bar chart of Spearman correlations with ``SalePrice``.

    For each name in ``features`` the Spearman correlation between that
    column of ``frame`` and ``frame['SalePrice']`` is computed; bars are
    ordered by ascending correlation. The figure is 6 wide and 0.25 high per
    feature.

    Parameters
    ----------
    frame : pandas.DataFrame holding the features and ``SalePrice``.
    features : sequence of column names to rank.
    """
    ranking = pd.DataFrame()
    ranking['feature'] = features

    correlations = []
    for name in features:
        correlations.append(frame[name].corr(frame['SalePrice'], 'spearman'))
    ranking['spearman'] = correlations

    ranking = ranking.sort_values('spearman')

    chart_height = 0.25 * len(features)
    plt.figure(figsize=(6, chart_height))
    sns.barplot(data=ranking, y='feature', x='spearman', orient='h')

In [7]:
def feature_engineer(df):
    """Add the aggregate columns ``TotalArea`` and ``TotalBath`` to ``df``.

    * ``TotalArea`` is the row-wise sum of the living, basement, garage,
      deck, porch and pool area columns (missing values are skipped by the
      sum).
    * ``TotalBath`` counts full baths as 1 and half baths as 0.5, above
      ground and in the basement (a missing input yields a missing total).

    The frame is modified in place and returned.
    """
    area_columns = [
        'GrLivArea', 'TotalBsmtSF', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    ]
    areas = df[area_columns]
    df['TotalArea'] = areas.sum(axis=1)

    above_ground_baths = df['FullBath'] + 0.5 * df['HalfBath']
    with_bsmt_full = above_ground_baths + df['BsmtFullBath']
    df['TotalBath'] = with_bsmt_full + 0.5 * df['BsmtHalfBath']

    return df

In [None]:
def feature_engineer_ridge(all_features):
    """Build the engineered feature set used for the ridge model.

    The input frame is copied first, so the caller's frame is never modified;
    use the returned frame.

    Steps, in order:

    1. Indicator columns: ``BsmtFinType1_Unf`` and one ``Has...`` flag per
       deck/porch area column (1 when the area is greater than 0).
    2. Derived columns: ``YearsSinceRemodel``, ``Total_Home_Quality``,
       ``TotalSF``, ``YrBltAndRemod``, ``Total_sqr_footage``,
       ``Total_Bathrooms``, ``Total_porch_sf``. These use the raw areas,
       before step 4.
    3. ``Utilities``, ``Street`` and ``PoolQC`` are dropped.
    4. Non-positive values of several area columns are replaced by fixed
       positive placeholders (``GarageCars`` is floored at 0).
    5. Presence flags ``haspool``, ``has2ndfloor``, ``hasgarage``,
       ``hasbsmt``, ``hasfireplace`` (1 when the raw value is greater than 0).

    Two defects of the earlier version are corrected here: the ``Has...``
    deck/porch flags were 1 when the area was *zero*, and the flags of step 5
    were derived after step 4, which made ``has2ndfloor``, ``hasgarage`` and
    ``hasbsmt`` equal to 1 on every row.

    Parameters
    ----------
    all_features : pandas.DataFrame
        Frame with the raw house-price columns referenced below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns.
    """
    features = all_features.copy()

    # Presence flags are taken from the raw values, before any zero is
    # replaced by a positive placeholder further down.
    presence_sources = {
        'haspool': 'PoolArea',
        'has2ndfloor': '2ndFlrSF',
        'hasgarage': 'GarageArea',
        'hasbsmt': 'TotalBsmtSF',
        'hasfireplace': 'Fireplaces',
    }
    presence_flags = {
        flag: (features[source] > 0).astype('int64')
        for flag, source in presence_sources.items()
    }

    features['BsmtFinType1_Unf'] = (features['BsmtFinType1'] == 'Unf').astype('int64')
    outdoor_flags = (
        ('HasWoodDeck', 'WoodDeckSF'),
        ('HasOpenPorch', 'OpenPorchSF'),
        ('HasEnclosedPorch', 'EnclosedPorch'),
        ('Has3SsnPorch', '3SsnPorch'),
        ('HasScreenPorch', 'ScreenPorch'),
    )
    for flag, area in outdoor_flags:
        features[flag] = (features[area] > 0).astype('int64')

    features['YearsSinceRemodel'] = features['YrSold'].astype(int) - features['YearRemodAdd'].astype(int)
    features['Total_Home_Quality'] = features['OverallQual'] + features['OverallCond']
    features = features.drop(columns=['Utilities', 'Street', 'PoolQC'])
    features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
    features['YrBltAndRemod'] = features['YearBuilt'] + features['YearRemodAdd']

    features['Total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] +
                                     features['1stFlrSF'] + features['2ndFlrSF'])
    features['Total_Bathrooms'] = (features['FullBath'] + (0.5 * features['HalfBath']) +
                                   features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath']))
    features['Total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] +
                                  features['EnclosedPorch'] + features['ScreenPorch'] +
                                  features['WoodDeckSF'])

    # Placeholder substituted for values <= 0 in each column.
    # NOTE(review): the exp(...) constants are presumably chosen so that a
    # later log transform maps them to round numbers -- TODO confirm.
    non_positive_placeholders = {
        'TotalBsmtSF': np.exp(6),
        '2ndFlrSF': np.exp(6.5),
        'GarageArea': np.exp(6),
        'GarageCars': 0,
        'LotFrontage': np.exp(4.2),
        'MasVnrArea': np.exp(4),
        'BsmtFinSF1': np.exp(6.5),
    }
    for column, placeholder in non_positive_placeholders.items():
        values = features[column]
        # Missing values compare False and are therefore left as they are.
        features[column] = values.mask(values <= 0.0, placeholder)

    for flag, values in presence_flags.items():
        features[flag] = values

    return features