# This notebook contains the analysis to find impactful features
It also explores normalization; the outcome is used in the data loader to generate normalized data.

In [16]:
from data_loader import DataLoader
from configurations import args

In [17]:
# Build the project data loader from the shared configuration object.
data_loader = DataLoader(args)

## An earlier analysis discovered the impactful features; in this analysis we evaluate them closely

In [18]:

# Columns holding category labels.
categorical_features = [
    'MSSubClass',
    'MSZoning',
    'LotShape',
    'LandContour',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'Heating',
    'HeatingQC',
]

# Numeric columns.
nominal_features = [
    '2ndFlrSF',
    '1stFlrSF',
    'PoolArea',
    'YearRemodAdd',
    'MasVnrArea',
    'YearBuilt',
    'KitchenAbvGr',
    'GrLivArea',
    'OverallQual',
    'BedroomAbvGr',
    'TotalBsmtSF',
    'LotArea',
    'OverallCond',
]

# Every column requested from the loader: numeric, then categorical, then the target.
full_list = [*nominal_features, *categorical_features, 'SalePrice']


In [19]:
# Load only the selected columns (features plus the SalePrice target).
df = data_loader.extract_features(full_list)


## Check whether there are any missing values

In [20]:
def print_feature_with_na_vals(data=None):
    '''Print every column that contains missing values and how many it has.

    Args:
        data: DataFrame to inspect. When omitted, the notebook-level ``df``
            is used, so the original zero-argument call keeps working.

    Prints a Series indexed by column name that holds the NA count of each
    column with at least one NA. Returns nothing.
    '''
    if data is None:
        # Fall back to the notebook-level frame, looked up at call time.
        data = df
    number_of_na = data.isna().sum()
    print(number_of_na[number_of_na > 0])
# Report which columns of df contain missing values.
print_feature_with_na_vals()

MasVnrArea    8
dtype: int64


Counting the NA values in the dataset showed that there are only 8, all in MasVnrArea, so we can drop those rows.

In [21]:
def drop_na_from_df(data):
    '''Return ``data`` without the rows that contain any NA value.

    The shape before and after the drop, plus the number of rows removed,
    is printed so the effect is visible in the notebook output.
    '''
    rows_before = len(data)
    print(f'Before dropping NA {data.shape}')
    cleaned = data.dropna()
    rows_dropped = rows_before - len(cleaned)
    print(f'After dropping NA {cleaned.shape}, dropped {rows_dropped}')
    return cleaned
# Replace df with its NA-free version.
df = drop_na_from_df(df)


Before dropping NA (1456, 28)
After dropping NA (1448, 28), dropped 8


Categorical features containing a value that occurs only once in the dataset cause a problem. The rows holding such values need to be deleted, since they create a standard deviation of zero in either the training set or the test set, which breaks the normalization process.

Note: We need to run the code multiple times, since dropping a row might leave another value with only a single occurrence.

In [22]:
def is_unique_value_in_cat_features(data, features=None):
    '''Report whether any categorical column holds a value that occurs once.

    Every such value is printed together with its count.

    Args:
        data: DataFrame containing the categorical columns.
        features: Column names to inspect. Defaults to the notebook-level
            ``categorical_features`` list.

    Returns:
        True if at least one inspected column has a value with exactly one
        occurrence, otherwise False.
    '''
    if features is None:
        features = categorical_features
    is_any_unique_value = False
    for f in features:
        # Count once per column and reuse it for both the check and the report.
        counts = data[f].value_counts()
        for value, count in counts[counts == 1].items():
            print('unique value:', value, count)
            is_any_unique_value = True
    return is_any_unique_value

def remove_unique_value_of_cat_features(data, features=None):
    '''Drop the rows whose categorical value occurs only once in its column.

    Removing a row can leave another value with a single occurrence in a
    column that was already processed, so the sweep over the columns is
    repeated until no single-occurrence value remains.

    Args:
        data: DataFrame containing the categorical columns. It is not
            modified; a filtered frame is returned.
        features: Column names to clean. Defaults to the notebook-level
            ``categorical_features`` list.

    Returns:
        The filtered DataFrame.
    '''
    if features is None:
        features = categorical_features
    before = data.shape[0]
    print(f'Before dropping unique values {data.shape}')
    removed_any = True
    # Each productive sweep removes at least one row, so this loop terminates.
    while removed_any:
        removed_any = False
        for f in features:
            counts = data[f].value_counts()
            singles = counts[counts == 1]
            if singles.empty:
                continue
            for value, count in singles.items():
                print('removing:', value, count)
            data = data[~data[f].isin(singles.index)]
            removed_any = True
    print(f'After dropping unique values {data.shape}, dropped {before - data.shape[0]}')
    return data


# Report the single-occurrence values first, then drop the rows that carry them.
print('is_unique_value_in_cat_features', is_unique_value_in_cat_features(df))
df = remove_unique_value_of_cat_features(df)


unique value: PosA 1
unique value: PosN 1
unique value: RRAn 1
unique value: RRAe 1
unique value: Floor 1
unique value: Po 1
is_unique_value_in_cat_features True
Before dropping NA (1448, 28)
removing: PosA 1
removing: PosN 1
removing: RRAn 1
removing: RRAe 1
removing: Shed 1
removing: Floor 1
removing: Po 1
Before dropping NA (1441, 28), dropped 7


# One-hot encoding is used for the categorical features

In [23]:
def encode_onehot(data, column_name):
    '''One-hot encode a single categorical column of ``data`` in place.

    The original column is removed and one 0.0/1.0 indicator column is
    added per distinct value, named ``<column_name><value>``. Nothing is
    returned; ``data`` itself is modified.
    '''
    source_column = data.pop(column_name)

    for level in source_column.unique():
        # Multiplying the boolean mask by 1.0 yields a float indicator.
        indicator = (source_column == level) * 1.0
        data[column_name + str(level)] = indicator


def encode_cat_features(data, features):
    '''One-hot encode every column named in ``features``, modifying ``data`` in place.'''
    for feature_name in features:
        encode_onehot(data, feature_name)


# Encode every categorical column; df is modified in place.
encode_cat_features(df, categorical_features)

  data[col_name] = (data_to_encode == cat_value) * 1.0


In [24]:
# Rows and columns after one-hot encoding.
df.shape

(1441, 115)

In [25]:
# Split into train and test sets. combine_back=True is used so that SalePrice
# stays inside the returned frames, because the target is still needed for the
# analysis that follows.
X_train, X_test, y_train, y_test = data_loader.split_data_df(
    df, combine_back=True)


# Analyze the correlation of the nominal features and drop columns that are highly correlated

In [26]:
def drop_highly_corrlated_features(X_train, X_test,
                                   correlated_columns=('1stFlrSF', '2ndFlrSF'),
                                   target='SalePrice'):
    '''Drop the highly correlated columns and split off the target column.

    Args:
        X_train: Training frame that still contains the target column.
        X_test: Test frame that still contains the target column.
        correlated_columns: Column names removed from both frames. The
            default is the pair this notebook drops.
        target: Name of the column separated out as the label.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The frames passed in
        are left untouched because ``drop`` returns new frames.
    '''
    columns = list(correlated_columns)
    X_train = X_train.drop(columns=columns)
    X_test = X_test.drop(columns=columns)
    # This is treated as the final analysis step, so the target is taken out here.
    y_train = X_train.pop(target)
    y_test = X_test.pop(target)
    return X_train, X_test, y_train, y_test
    
# Drop the correlated columns and separate SalePrice into y_train / y_test.
X_train, X_test, y_train, y_test = drop_highly_corrlated_features(X_train, X_test)


# Normalize Data

First checking out the stats

In [27]:
# Summary statistics of the training features, including the mean and std rows.
X_train.describe()


Unnamed: 0,PoolArea,YearRemodAdd,MasVnrArea,YearBuilt,KitchenAbvGr,GrLivArea,OverallQual,BedroomAbvGr,TotalBsmtSF,LotArea,...,RoofStyleFlat,HeatingGasA,HeatingGasW,HeatingGrav,HeatingWall,HeatingOthW,HeatingQCEx,HeatingQCGd,HeatingQCTA,HeatingQCFa
count,1152.0,1152.0,1152.0,1152.0,1152.0,1152.0,1152.0,1152.0,1152.0,1152.0,...,1152.0,1152.0,1152.0,1152.0,1152.0,1152.0,1152.0,1152.0,1152.0,1152.0
mean,2.59809,1984.900174,99.959201,1970.802083,1.048611,1510.681424,6.085938,2.876736,1042.581597,10342.03125,...,0.007812,0.977431,0.013889,0.00434,0.002604,0.001736,0.499132,0.171875,0.296875,0.032118
std,39.765308,20.585005,172.030292,30.366955,0.223077,502.691564,1.365611,0.826629,412.395673,9743.305174,...,0.088081,0.148591,0.117081,0.065766,0.050987,0.041649,0.500216,0.377436,0.457079,0.17639
min,0.0,1950.0,0.0,1872.0,0.0,438.0,1.0,0.0,0.0,1300.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1967.0,0.0,1954.0,1.0,1125.75,5.0,2.0,791.75,7500.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1993.0,0.0,1972.0,1.0,1457.0,6.0,3.0,984.0,9423.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,2004.0,164.25,2000.0,1.0,1779.0,7.0,3.0,1278.0,11475.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
max,738.0,2010.0,1600.0,2010.0,3.0,3627.0,10.0,8.0,3200.0,215245.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Normalize data 

In [28]:
def get_stats():
    '''Return the describe() summary of X_train, transposed to one row per feature.'''
    return X_train.describe().T

def normalize(data, stats):
    '''Standardize ``data`` using the 'mean' and 'std' columns of ``stats``.

    ``stats`` is indexed by feature name, so its entries are aligned with
    the matching columns of ``data`` before subtracting and dividing.
    '''
    feature_mean = stats['mean']
    feature_std = stats['std']
    centered = data - feature_mean
    return centered / feature_std

def get_normilze_data():
    '''Normalize X_train and X_test with statistics taken from X_train only.'''
    # The training statistics are reused for the test set on purpose.
    train_stats = get_stats()
    return normalize(X_train, train_stats), normalize(X_test, train_stats)


# Produce the normalized train and test feature frames.
norm_X_train, norm_X_test = get_normilze_data()


In [29]:
# Shapes of both normalized frames and the number of feature columns.
norm_X_train.shape, norm_X_test.shape, len(norm_X_train.keys())

((1152, 112), (289, 112), 112)


If there were a problem, some values would be NaN (because of division by zero), so we verify that there are no missing values after normalization.


In [30]:

# True would mean that at least one normalized value is NaN.
norm_X_train.isna().values.any(), norm_X_test.isna().values.any()

(False, False)

There are no NA values, so normalization was successful.