# This notebook contains the analysis to find impactful features
It also explores normalization; the outcome is used in the data loader to generate normalized data.

In [1]:
from data_loader import DataLoader
from neuralnetwork import create_nn_regressor
from configurations import args
from base_regressor_plot import BaseRegressorPlot
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd


In [2]:
# Build the project's DataLoader from the shared `args` configuration object.
data_loader = DataLoader(args)

In [3]:
# Features that the analysis found to be impactful.
# Categorical columns: these are one-hot encoded further down.
categorical_features = [
    'MSSubClass',
    'MSZoning',
    'LotShape',
    'LandContour',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'Heating',
    'HeatingQC',
]

# Columns that are kept as-is (not one-hot encoded) and later normalized.
nominal_features = [
    '2ndFlrSF',
    '1stFlrSF',
    'PoolArea',
    'YearRemodAdd',
    'MasVnrArea',
    'YearBuilt',
    'KitchenAbvGr',
    'GrLivArea',
    'OverallQual',
    'BedroomAbvGr',
    'TotalBsmtSF',
    'LotArea',
    'OverallCond',
]

# Every column to extract: the features followed by the 'SalePrice' target.
full_list = [*nominal_features, *categorical_features, 'SalePrice']


In [4]:
# Ask the loader for the columns named in `full_list` (features + 'SalePrice').
# NOTE(review): presumably returns a pandas DataFrame restricted to those
# columns (the NA step below reports shape (1460, 28)) - confirm in DataLoader.
df = data_loader.extract_features(full_list)


## Check whether there are any missing values

In [5]:
def print_feature_with_na_vals(data=None):
    '''Print each column that contains NA values together with its NA count.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used, so
        the original zero-argument call keeps working.

    Returns
    -------
    None
        The per-column NA counts (restricted to columns with at least one NA)
        are printed, not returned.
    '''
    if data is None:
        # Backward-compatible fallback to the notebook's working frame.
        data = df
    number_of_na = data.isna().sum()
    print(number_of_na[number_of_na > 0])
print_feature_with_na_vals()

MasVnrArea    8
dtype: int64


Counting the NA values in the dataset showed that there are only 8, all in MasVnrArea, so we can drop those rows.

In [6]:
def drop_na_from_df(data):
    '''Return ``data`` without the rows that contain at least one NA value.

    The shape before and after, and the number of rows removed, are printed.
    The frame passed in is not modified; a new frame is returned.
    '''
    rows_before = len(data)
    print(f'Before dropping NA {data.shape}')
    data = data.dropna()
    rows_removed = rows_before - len(data)
    print(f'After dropping NA {data.shape}, dropped {rows_removed}')
    return data
df = drop_na_from_df(df)


Before dropping NA (1460, 28)
After dropping NA (1452, 28), dropped 8


Categorical features that have values occurring only once in the dataset will cause problems, so the rows containing them need to be deleted: such a value yields a standard deviation of zero in either the training set or the test set, which breaks the normalization process.

Note: We may need to run the code multiple times, since dropping a row might leave another value with only a single occurrence.

In [7]:
def is_unique_value_in_cat_features(data, features=None):
    '''Report whether any categorical feature has a value that occurs exactly once.

    Every such value is printed as ``unique value: <value> <count>``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the categorical columns to inspect.
    features : iterable of str, optional
        Column names to check. Defaults to the notebook-level
        ``categorical_features`` list, which keeps the original
        single-argument call working.

    Returns
    -------
    bool
        True when at least one checked column contains a value with a single
        occurrence, False otherwise.
    '''
    if features is None:
        features = categorical_features
    is_any_unique_value = False
    for f in features:
        # Count once per feature and keep only the values seen a single time.
        counts = data[f].value_counts()
        singletons = counts[counts == 1]
        for value, count in singletons.items():
            print('unique value:', value, count)
        if not singletons.empty:
            is_any_unique_value = True
    return is_any_unique_value

def remove_unique_value_of_cat_features(data, features=None):
    '''Drop the rows whose categorical value occurs only once in its column.

    Removing rows for one feature can reduce a value of another feature to a
    single occurrence, so the features are scanned repeatedly until a full
    pass removes nothing. When one pass is enough, the returned frame is the
    same as a single scan would give.

    Each removed value is printed as ``removing: <value> <count>``; the frame
    shape is printed before and after, together with the number of rows
    dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the categorical columns. It is not modified.
    features : iterable of str, optional
        Column names to clean. Defaults to the notebook-level
        ``categorical_features`` list, which keeps the original
        single-argument call working.

    Returns
    -------
    pandas.DataFrame
        A filtered frame in which no checked column has a value that occurs
        exactly once.
    '''
    if features is None:
        features = categorical_features
    before = data.shape[0]
    print(f'Before dropping unique values {data.shape}')
    removed_any = True
    # Every productive pass drops at least one row, so the loop terminates.
    while removed_any:
        removed_any = False
        for f in features:
            counts = data[f].value_counts()
            singletons = counts[counts == 1]
            if singletons.empty:
                continue
            for value, count in singletons.items():
                print('removing:', value, count)
            data = data[~data[f].isin(list(singletons.index))]
            removed_any = True
    print(f'After dropping unique values {data.shape}, dropped {before - data.shape[0]}')
    return data


print('is_unique_value_in_cat_features', is_unique_value_in_cat_features(df))
df = remove_unique_value_of_cat_features(df)


unique value: PosA 1
unique value: RRAn 1
unique value: RRAe 1
unique value: Floor 1
unique value: Po 1
is_unique_value_in_cat_features True
Before dropping NA (1452, 28)
removing: PosA 1
removing: RRAn 1
removing: RRAe 1
removing: Shed 1
removing: Floor 1
removing: Po 1
Before dropping NA (1446, 28), dropped 6


# One-hot encoding is used for the categorical features

In [8]:
def encode_onehot(data, column_name):
    '''One-hot encode one categorical column of ``data`` in place.

    The original column is removed and replaced by one float indicator column
    (1.0 where the row has that value, 0.0 otherwise) per distinct non-missing
    value. New columns are named ``column_name + str(value)`` and are appended
    in order of first appearance.

    Missing values get no indicator column: an equality comparison against a
    missing value is never true, so such a column would be all zeros and have
    a standard deviation of zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    column_name : str
        Name of the categorical column to encode.

    Returns
    -------
    None
    '''
    data_to_encode = data.pop(column_name)
    # Skip NaN/None so no constant all-zero indicator column is created.
    categorical_values = data_to_encode.dropna().unique()

    for cat_value in categorical_values:
        col_name = column_name + str(cat_value)
        data[col_name] = (data_to_encode == cat_value) * 1.0


def encode_cat_features(data, features):
    '''One-hot encode each column named in ``features``, modifying ``data`` in place.'''
    for feature_name in features:
        encode_onehot(data, feature_name)


encode_cat_features(df, categorical_features)

  data[col_name] = (data_to_encode == cat_value) * 1.0


In [9]:
df.shape

(1446, 116)

In [10]:
# Split data but since we need further analysis just combine back the Sales Price
# Split the encoded frame into train and test partitions. `combine_back=True`
# is passed because further analysis is needed: 'SalePrice' stays in X_train /
# X_test and is popped out again in drop_highly_corrlated_features below.
# NOTE(review): the exact contents of y_train / y_test at this point depend on
# DataLoader.split_data_df - confirm there.
X_train, X_test, y_train, y_test = data_loader.split_data_df(
    df, combine_back=True)


# Analyze the correlation of nominal features and drop columns that are highly correlated

In [11]:
def drop_highly_corrlated_features(X_train, X_test,
                                   correlated_columns=('1stFlrSF', '2ndFlrSF'),
                                   target='SalePrice'):
    '''Drop highly correlated columns and separate the target from the features.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Train and test frames that still contain the target column. Neither
        frame is modified; new frames are returned.
    correlated_columns : iterable of str, optional
        Columns to drop from both frames. Defaults to the two columns this
        notebook drops: '1stFlrSF' and '2ndFlrSF'.
    target : str, optional
        Name of the target column to split off. Defaults to 'SalePrice'.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``: the feature frames without the
        dropped columns and the target, plus the target column of each frame.
    '''
    columns_to_drop = list(correlated_columns)
    # drop() returns new frames, so the pops below never touch the caller's data.
    X_train = X_train.drop(columns=columns_to_drop)
    X_test = X_test.drop(columns=columns_to_drop)
    # This is assumed to be the final analysis step, so take out the target.
    y_train = X_train.pop(target)
    y_test = X_test.pop(target)
    return X_train, X_test, y_train, y_test
    
X_train, X_test, y_train, y_test = drop_highly_corrlated_features(X_train, X_test)


# Normalize Data

First checking out the stats

In [12]:
X_train.describe()


Unnamed: 0,PoolArea,YearRemodAdd,MasVnrArea,YearBuilt,KitchenAbvGr,GrLivArea,OverallQual,BedroomAbvGr,TotalBsmtSF,LotArea,...,RoofStyleFlat,HeatingGasA,HeatingGasW,HeatingGrav,HeatingWall,HeatingOthW,HeatingQCEx,HeatingQCGd,HeatingQCTA,HeatingQCFa
count,1156.0,1156.0,1156.0,1156.0,1156.0,1156.0,1156.0,1156.0,1156.0,1156.0,...,1156.0,1156.0,1156.0,1156.0,1156.0,1156.0,1156.0,1156.0,1156.0,1156.0
mean,2.555363,1985.16782,106.194637,1971.225779,1.046713,1513.214533,6.07526,2.865917,1055.370242,10354.016436,...,0.006055,0.982699,0.008651,0.00346,0.00346,0.00173,0.515571,0.165225,0.286332,0.032872
std,39.271719,20.677768,183.807864,30.258255,0.211114,525.223273,1.382154,0.815551,436.827395,9478.61451,...,0.077614,0.130447,0.092645,0.058747,0.058747,0.041577,0.499974,0.371544,0.452242,0.178379
min,0.0,1950.0,0.0,1872.0,1.0,334.0,1.0,0.0,0.0,1300.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1967.0,0.0,1953.0,1.0,1126.0,5.0,2.0,795.75,7500.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1994.5,0.0,1972.0,1.0,1467.0,6.0,3.0,990.0,9391.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.0,2004.0,169.25,2000.25,1.0,1768.0,7.0,3.0,1300.5,11475.75,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
max,738.0,2010.0,1600.0,2010.0,2.0,5642.0,10.0,8.0,6110.0,215245.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Normalize data 

In [13]:
def get_stats(data=None):
    '''Return the ``describe()`` summary statistics with one row per column.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to summarize. When omitted, the notebook-level ``X_train`` is
        used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        The transposed ``describe()`` output: indexed by column name, with the
        statistics ('count', 'mean', 'std', ...) as columns.
    '''
    if data is None:
        # Backward-compatible fallback to the training features.
        data = X_train
    stats = data.describe()
    return stats.transpose()

def normalize(data, stats):
    '''Standardize ``data`` column-wise using precomputed statistics.

    ``stats`` must provide 'mean' and 'std' entries indexed by the column names
    of ``data``. Each column has its mean subtracted and is then divided by
    its std. No guard is applied against a std of zero.
    '''
    centered = data - stats['mean']
    scaled = centered / stats['std']
    return scaled

def get_normilze_data():
    '''Normalize the notebook-level ``X_train`` and ``X_test`` frames.

    The statistics come from a single ``get_stats()`` call and are applied to
    both partitions, so train and test are scaled with the same values.

    Returns
    -------
    tuple
        ``(norm_X_train, norm_X_test)``.
    '''
    shared_stats = get_stats()
    return normalize(X_train, shared_stats), normalize(X_test, shared_stats)


norm_X_train, norm_X_test = get_normilze_data()


In [14]:
norm_X_train.shape, norm_X_test.shape, len(norm_X_train.keys())

((1156, 113), (290, 113), 113)


If there were a problem, some values would be NaN (because of division by zero), so we verify that there are no missing values after normalization.


In [47]:

# Report, for each partition, whether any normalized value is NaN.
# NOTE(review): isna() does not flag +/-inf, which dividing a non-zero value by
# a zero std would produce - consider checking for infinite values as well.
norm_X_train.isna().values.any(), norm_X_test.isna().values.any()

(False, False)

There are no NA values, so normalization was successful.