In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Let pandas display up to 100 columns before truncating a wide DataFrame.
pd.set_option('display.max_columns', 100)
# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()

In [48]:
def load_test_train_data(train_path="../data/raw/train.csv", test_path="../data/raw/test.csv"):
    """Read the raw train and test CSV files and stack them into one frame.

    The first CSV column is used as the index of each file. Every row is
    tagged with a ``data_type`` column ("train" or "test") so the combined
    frame can be split again later, and the test rows get a placeholder
    ``SalePrice`` of 0 so both frames share the same columns.

    Parameters
    ----------
    train_path, test_path : str or path-like
        Locations of the raw CSV files. The defaults are the paths the
        notebook has always used, so calling with no arguments is unchanged.

    Returns
    -------
    tuple
        ``(df_train, df_test, df)`` where ``df`` is the train rows followed by
        the test rows with a fresh 0..n-1 index. ``df_train`` and ``df_test``
        include the added ``data_type`` (and, for test, ``SalePrice``) columns.
    """
    print("\n\n*****\nLoading test and train data....")

    df_train = pd.read_csv(train_path, index_col=0)
    df_test = pd.read_csv(test_path, index_col=0)

    df_train["data_type"] = "train"
    df_test["data_type"] = "test"
    # Placeholder only: the test file carries no target values.
    df_test["SalePrice"] = 0

    # sort=False keeps the original column order instead of sorting names.
    df = pd.concat([df_train, df_test], sort=False).reset_index(drop=True)

    print("Train Data Shape :", df_train.shape)
    print("Test Data Shape : ", df_test.shape)
    print("Combined Data Shape : ", df.shape)

    return (df_train, df_test, df)


In [49]:
def write_processed_files(df, output_dir="../data/processed_data"):
    """Write the processed frame, and its train/test halves, to CSV files.

    ``df`` must contain the indicator columns ``data_type_train`` and
    ``data_type_test`` (as produced by one-hot encoding ``data_type``). Rows
    are routed to the train or test file by those indicators, and both
    indicator columns are removed from the two split files. The full file
    keeps them.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully processed, combined data. It is not modified.
    output_dir : str or path-like
        Directory receiving ``full_processed.csv``, ``train_processed.csv``
        and ``test_processed.csv``. It is created if it does not exist.

    Returns
    -------
    None
    """
    # Imported locally so the function works regardless of cell run order.
    import os

    print("\n\n*****\nGenerating processed data files...")

    split_flags = ["data_type_train", "data_type_test"]
    train = df[df.data_type_train == 1].drop(columns=split_flags).reset_index(drop=True)
    test = df[df.data_type_test == 1].drop(columns=split_flags).reset_index(drop=True)

    # Without this, to_csv raises on a fresh checkout where the folder is absent.
    os.makedirs(output_dir, exist_ok=True)

    df.to_csv(os.path.join(output_dir, "full_processed.csv"))
    train.to_csv(os.path.join(output_dir, "train_processed.csv"))
    test.to_csv(os.path.join(output_dir, "test_processed.csv"))

    return None



In [50]:
def get_column_names_by_type(df):
    """Split the column names of ``df`` into numeric and non-numeric groups.

    Columns are classified by dtype: anything with a numeric dtype is
    "numeric", everything else (text, boolean, datetime, ...) is
    "non-numeric". Classifying by dtype avoids ``DataFrame.describe()``,
    which falls back to summarising the text columns when a frame has no
    numeric column and would therefore report them as numeric.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    tuple
        ``(numeric_columns, non_numeric_columns)`` where the first item is a
        pandas ``Index`` and the second a plain list, both in the original
        column order.
    """
    numeric_columns = df.select_dtypes(include="number").columns
    non_numeric_columns = [name for name in df.columns if name not in numeric_columns]
    return (numeric_columns, non_numeric_columns)
    

In [51]:
# Missing-value counter.
def na_counter(data):
    """Summarise the missing values of ``data``, one row per affected column.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Indexed by column name (index name "Column"), in the column order of
        ``data``, restricted to columns that contain at least one null.
        ``NA Count`` is the number of nulls; ``PERCENT`` is that count divided
        by the number of rows, i.e. a fraction between 0 and 1 rather than a
        value out of 100. The frame is empty, with the same columns and index
        name, when nothing is missing.
    """
    na_count = data.isnull().sum()
    na_count = na_count[na_count > 0]

    df_na = pd.DataFrame({
        'NA Count': na_count,
        "PERCENT": na_count / len(data),
    })
    df_na.index.name = "Column"
    return df_na

In [52]:
def impute_null_fields(df):
    """Fill the missing values of the combined housing frame.

    ``df`` is modified in place and also returned, so the function can be
    used both as a statement and as a ``DataFrame.pipe`` stage. A null-count
    summary is printed before and after imputation.

    Columns that are not listed below (for example ``Alley`` or ``PoolQC``)
    are left untouched and will still appear in the "after" summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined train and test data containing the columns referenced below.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    print("\n\n*****\nImputing Null values....")
    print("Null counts before imputing:")
    print(na_counter(df))

    # --- Garage rows that need row-specific handling (order matters) -------

    # A row without a garage area is treated as having no garage at all:
    # blank its type first (the mask depends on the area still being null),
    # then set the area to zero.
    no_garage_area = df["GarageArea"].isnull()
    df.loc[no_garage_area, "GarageType"] = np.nan
    df.loc[no_garage_area, "GarageArea"] = 0

    # The record with garage area 360 has a garage but lacks its
    # descriptors: give it average values and use the remodel year as the
    # garage build year.
    area_360 = df["GarageArea"] == 360
    for column, value in (("GarageFinish", "Unf"), ("GarageQual", "TA"), ("GarageCond", "TA")):
        df.loc[area_360 & df[column].isnull(), column] = value
    needs_year = area_360 & df["GarageYrBlt"].isnull()
    df.loc[needs_year, "GarageYrBlt"] = df.loc[needs_year, "YearRemodAdd"]

    # --- Plain column-wide fills ------------------------------------------
    fill_values = {
        # TODO(review): rationale for these two defaults is not recorded.
        "MSZoning": "RM",
        "LotFrontage": 0,
        # Masonry veneer and exterior.
        "MasVnrType": "None",
        "MasVnrArea": 0.0,
        "Exterior1st": "None",
        "Exterior2nd": "None",
        # Basement descriptors and areas.
        "BsmtQual": "NoBsmt",
        "BsmtCond": "NoBsmt",
        "BsmtExposure": "NoBsmt",
        "BsmtFinType1": "NoBsmt",
        "BsmtFinType2": "NoBsmt",
        "BsmtFinSF1": 0,
        "BsmtFinSF2": 0,
        "BsmtUnfSF": 0,
        "TotalBsmtSF": 0,
        # Basement bath nulls belong to houses without a basement.
        "BsmtFullBath": 0,
        "BsmtHalfBath": 0,
        # Only one row is missing; "SBrkr" is the most used value.
        "Electrical": "SBrkr",
        # Average grade, matching the house's overall quality. The previous
        # code filled "None" here, contradicting this stated intent.
        "KitchenQual": "TA",
        "FireplaceQu": "None",
        # Remaining garage nulls.
        "GarageFinish": "None",
        "GarageQual": "None",
        "GarageCond": "None",
        "GarageType": "None",
        "GarageCars": 0,
        # NOTE(review): 0 is far outside the usual year range and will
        # distort any scaling applied to this column.
        "GarageYrBlt": 0,
        # "Typ" as mentioned in the data documentation; there is no clear
        # relation with the overall quality fields.
        "Functional": "Typ",
        # TODO(review): rationale for "Oth" is not recorded.
        "SaleType": "Oth",
    }
    for column, value in fill_values.items():
        df[column] = df[column].fillna(value)

    print("\n\n\nNull counts after imputing:")
    print(na_counter(df))

    return df

In [53]:
def correct_data(df):
    """Overwrite known-bad values in ``df`` and hand the same frame back.

    Currently one correction is applied: a ``GarageYrBlt`` of 2207, taken to
    be a typo, is replaced with 2007. The frame is modified in place.
    """
    print("\n\n*****\nCorrecting incorrect data values...")

    # Locate the mistyped year first, then patch just those cells.
    typo_rows = df.GarageYrBlt.eq(2207)
    df.loc[typo_rows, "GarageYrBlt"] = 2007

    return df

In [54]:

def transform_ordinal(df):
    """Replace ordered category labels with integer ranks.

    The quality-style columns share one Ex/Gd/TA/Fa/Po/None scale (5 down
    to 0); ``Functional`` and ``GarageFinish`` each have their own scale.
    Values that are not part of a scale, including nulls, are left as they
    are. ``df`` is modified in place and returned.

    Each converted column is assigned back to the frame explicitly.
    Calling ``replace(..., inplace=True)`` on ``df[col]`` acts on a
    temporary selection and does not update ``df`` when pandas
    Copy-on-Write is active.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named below.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    typical_grade_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}
    typical_graded_cols = ["HeatingQC", "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond"]

    ordinal_maps = {column: typical_grade_map for column in typical_graded_cols}
    ordinal_maps['Functional'] = {
        'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4, 'Maj1': 3, 'Maj2': 2, 'Sev': 1, 'Sal': 0,
    }
    ordinal_maps['GarageFinish'] = {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0}

    for column, grade_map in ordinal_maps.items():
        # Look up each value, falling back to the value itself when it is
        # not on the scale, so unknown labels pass through unchanged.
        df[column] = df[column].map(lambda value, scale=grade_map: scale.get(value, value))

    return df

In [55]:
def drop_columns(df):
    """Remove a fixed list of columns from ``df`` in place and return it.

    The column count is printed before and after, along with the number of
    columns being removed. Every listed column must be present in ``df``.
    """
    print("\n\n*****\nDropping selective columns...")

    unwanted = [
        "Street",
        "Alley",
        "Utilities",
        "Condition2",
        "RoofMatl",
        "LowQualFinSF",
        "GarageYrBlt",
        "GarageQual",
        "PoolQC",
        "Fence",
        "MiscFeature",
        "GarageCond",
        "MiscVal",
    ]

    count_before = len(df.columns)
    print("\n\n*****\nNo of columns before dropping : ", count_before)
    print("No of columns to drop : ", len(unwanted))

    # In place on purpose: callers may hold a reference to this same frame.
    df.drop(columns=unwanted, inplace=True)

    count_after = len(df.columns)
    print("No of columns after dropping : ", count_after)

    return df

In [80]:
def print_data_stats(df):
    """Print row/column counts and the numeric vs. non-numeric column split.

    The frame is returned untouched so the function can sit inside a
    ``DataFrame.pipe`` chain as a logging step.
    """
    row_count, column_count = df.shape

    print("\n\n*****\nPrinting stats...")
    print("No of rows : ", row_count)
    print("No of columns : ", column_count)

    numeric_names, other_names = get_column_names_by_type(df)
    print("Numeric columns : ", len(numeric_names))
    print("Non Numeric columns : ", len(other_names))

    return df

In [99]:
import os
def process_raw_data():
    """Run the whole raw-to-processed pipeline and write the output files.

    Steps, in order: load and combine the raw files, print stats, impute
    nulls, correct known bad values, drop unused columns, one-hot encode the
    remaining non-numeric columns, print stats again, and write the CSV files.

    Returns
    -------
    tuple
        ``(train_df, test_df, full_df)``: the train and test frames as
        loaded, and the fully processed combined frame.
    """
    print("\n\n*****\n Processing raw files to generate processed files... hold tight...")
    train_df, test_df, full_df = load_test_train_data()

    full_df = (
        full_df
        .pipe(print_data_stats)
        .pipe(impute_null_fields)
        .pipe(correct_data)
        .pipe(drop_columns)
        .pipe(pd.get_dummies)
        .pipe(print_data_stats)
    )

    # The writer returns None, so it is called outside the pipe chain.
    # Chaining it last would overwrite full_df with None.
    write_processed_files(full_df)

    return (train_df, test_df, full_df)
    

In [100]:
# Run the full pipeline. The returned tuple is bound to `_` so the cell shows only the printed log.
_ = process_raw_data()



*****
 Processing raw files to generate processed files... hold tight...


*****
Loading test and train data....
Train Data Shape : (1460, 81)
Test Data Shape :  (1459, 81)
Combined Data Shape :  (2919, 81)


*****
Printing stats...
No of rows :  2919
No of columns :  81
Numeric columns :  37
Non Numeric columns :  44


*****
Imputing Null values....
Null counts before imputing:
              NA Count   PERCENT
Column                          
MSZoning           4.0  0.001370
LotFrontage      486.0  0.166495
Alley           2721.0  0.932169
Utilities          2.0  0.000685
Exterior1st        1.0  0.000343
Exterior2nd        1.0  0.000343
MasVnrType        24.0  0.008222
MasVnrArea        23.0  0.007879
BsmtQual          81.0  0.027749
BsmtCond          82.0  0.028092
BsmtExposure      82.0  0.028092
BsmtFinType1      79.0  0.027064
BsmtFinSF1         1.0  0.000343
BsmtFinType2      80.0  0.027407
BsmtFinSF2         1.0  0.000343
BsmtUnfSF          1.0  0.000343
TotalBsmtSF        1.0

# Skewness between test and train data

In [None]:
# Count the distinct values (NaN counts as one value) of every column, separately per data_type group.
# NOTE(review): `df` is not assigned at notebook level in any cell shown here; presumably it is the
# combined frame returned by load_test_train_data(). Confirm before running on a fresh kernel.
xx = df.groupby("data_type").agg(lambda x: len(x.unique()))

In [None]:
# Restrict the distinct-value table to the numeric columns and mask counts that are not above 15.
# NOTE(review): `numeric_columns` is not assigned at notebook level in any cell shown here;
# presumably it comes from get_column_names_by_type(df). Confirm.
xx[numeric_columns][xx>15]

# Skewness check for columns

In [None]:
# Display the numeric column names that the following cells iterate over.
# NOTE(review): `numeric_columns` is not assigned at notebook level in any cell shown here. Confirm its origin.
numeric_columns

In [None]:
# Draw one histogram per numeric column, labelling the x-axis with the column name.
for i in numeric_columns:
    df[i].hist()
    plt.xlabel(i)
    # Render now so each column gets its own figure instead of sharing one set of axes.
    plt.show()

In [None]:
from scipy.stats import skew
from scipy.stats import skewtest

# Skewness of a single column. Not displayed, because it is not the last expression in the cell.
skew(df["1stFlrSF"])

numeric_df = df[numeric_columns]

# Skewness of every numeric column, largest value first.
numeric_df.apply(skew).sort_values(ascending=False)



In [None]:
# Run scipy's skewness test on the first-floor area column and display its result.
skewtest(df["1stFlrSF"])

# Extreme value check

In [None]:
# Column-wise minimum of df2.
# NOTE(review): df2 is only assigned in a later cell (df.iloc[:,38:64]), so this cell fails on a fresh top-to-bottom run.
df2.min()

In [None]:
# Column-wise maximum of df2.
# NOTE(review): df2 is only assigned in a later cell (df.iloc[:,38:64]), so this cell fails on a fresh top-to-bottom run.
df2.max()

In [None]:
""" 2207 for Garage built is odd value"""
# Show the rows whose GarageYrBlt equals 2207.
df[df.GarageYrBlt==2207]

In [None]:

# Apply the known-value fixes, then keep the columns at positions 38 to 63 for the min/max inspection.
# NOTE(review): the positional slice depends on the current column order of `df`. Confirm it still selects the intended columns.
df = correct_data(df)
df2 = df.iloc[:,38:64]

In [None]:
""" Checking the extreme value of 15 for Total rooms"""
# Show the rows whose TotRmsAbvGrd equals 15.
df[df.TotRmsAbvGrd == 15]

In [None]:
""" Does not see totally odd, given other values abouve 10"""
# Frequency of each TotRmsAbvGrd value, to judge whether 15 is an outlier.
df.TotRmsAbvGrd.value_counts()

# Dummy variable generation
