In [643]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sbn

In [644]:
# Load the CSV that describes the dataset's columns.
description_csv_path = '../Data/Dataset_Description.csv'
dataset_description = pd.read_csv(description_csv_path)

In [645]:
# dataset_description

In [646]:
# Read the raw population table; later cells derive `data` from it.
population_csv_path = '../Data/population.csv'
population_data_raw = pd.read_csv(population_csv_path)
# population_data_raw

In [647]:
def preprocessing(df, max_missing_pct=40.0, missing_token=' ?'):
    """Turn placeholder values into NaN and drop columns that are mostly missing.

    NOTE: this function's name shadows the ``preprocessing`` module imported
    from ``sklearn`` in the notebook's import cell; after this cell runs, the
    name ``preprocessing`` refers to this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; a new frame is returned.
    max_missing_pct : float, default 40.0
        Columns whose share of NaN values is strictly greater than this
        percentage are dropped.
    missing_token : str, default ' ?'
        Cell value that marks a missing entry (note the leading space in the
        default); every exact occurrence is replaced by NaN.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with placeholders replaced by NaN and the sparse
        columns removed. The list of dropped columns is printed.
    """
    # Replace the missing-value placeholder with NaN
    df = df.replace({missing_token: np.nan})

    # Drop columns with more than `max_missing_pct` % NaN values
    missing_percentages = (df.isna().sum() / len(df)) * 100
    cols_to_drop = missing_percentages[missing_percentages > max_missing_pct].index.tolist()
    print(f'Dropping = {cols_to_drop}')
    return df.drop(cols_to_drop, axis=1)

In [648]:
# Clean the raw population table, then report the shape of the result.
data = preprocessing(df=population_data_raw)
print(data.shape)

Dropping = ['MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'MIGSUN']
(199523, 36)


In [649]:
def plot_histogram(df, numeric_columns, categorical_columns):
    """Draw, save and show one count histogram per column of ``df``.

    Every column is cast to ``str`` before plotting, and the number of bins is
    set to the column's number of distinct values. Each figure is written to
    ``../Plots/Feature_Analysis/<column>_hist`` and then displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are plotted.
    numeric_columns, categorical_columns : list
        Currently unused by the body; kept so existing call sites keep working.

    Notes
    -----
    The output directory is not created here — assumes
    ``../Plots/Feature_Analysis/`` already exists (TODO confirm).
    """
    for feat in df.columns:
        fig = plt.figure(figsize=(25, 5))
        unique_cat_values = len(df[feat].unique())
        print(f'No. of unique Categories : {unique_cat_values}')
        # No `ax` is passed, so the histogram is drawn on the current figure created above.
        df[feat].astype(str).hist(bins=unique_cat_values, edgecolor='black', linewidth=1.5)
        plt.xlabel(feat)
        plt.ylabel('Count')
        plt.title(f'Histogram : {feat}')
        plt.savefig(f'../Plots/Feature_Analysis/{feat}_hist', facecolor='w', bbox_inches='tight')
        plt.show()
        # Release the figure explicitly so a wide frame does not leave one open
        # figure per column when the backend does not close them on show().
        plt.close(fig)

In [650]:
def feature_analysis(df, numeric_columns=None, bins=6, max_dominant_pct=70.0):
    """Bin the numeric features and drop columns dominated by a single value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; all work happens on a copy.
    numeric_columns : list of str, optional
        Columns to discretise with ``pd.cut``. Defaults to
        ``['AAGE', 'AHRSPAY', 'CAPGAIN', 'CAPLOSS', 'DIVVAL', 'WKSWORK']``.
        Every listed column must be present in ``df``.
    bins : int, default 6
        Number of equal-width bins passed to ``pd.cut``.
    max_dominant_pct : float, default 70.0
        A column is dropped when its most frequent value accounts for strictly
        more than this percentage of all rows.

    Returns
    -------
    pandas.DataFrame
        Binned copy of ``df`` without the dominated columns. The number and
        names of the dropped columns are printed.
    """
    if numeric_columns is None:
        numeric_columns = ['AAGE', 'AHRSPAY', 'CAPGAIN', 'CAPLOSS', 'DIVVAL', 'WKSWORK']

    # Work on a copy so the binned columns are not written back into the
    # caller's frame.
    df = df.copy()
    for col in numeric_columns:
        df[col] = pd.cut(x=df[col], bins=bins)
    # plot_histogram(df, numeric_columns, categorical_cols)

    # The denominator is the full row count, while value_counts() skips NaN by
    # default, so missing entries count against the dominant value's share.
    total_count = df.shape[0]
    cols_to_drop = [
        col
        for col in df.columns
        if (df[col].value_counts().max() / total_count) * 100 > max_dominant_pct
    ]
    print(f'Dropping {len(cols_to_drop)} cols \n {cols_to_drop}')
    return df.drop(cols_to_drop, axis=1)

    # print(df)


    # print(categorical_cols)


In [651]:
# Bin the numeric features and discard columns dominated by one value.
data = feature_analysis(df=data)

Dropping 19 cols 
 ['AHRSPAY', 'AHSCOL', 'ARACE', 'AREORGN', 'AUNMEM', 'AUNTYPE', 'CAPGAIN', 'CAPLOSS', 'DIVVAL', 'GRINREG', 'GRINST', 'PARENT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP', 'SEOTR', 'VETQVA', 'VETYN']


In [652]:
# Count the missing entries remaining in each column.
data.isnull().sum()

AAGE        0
ACLSWKR     0
ADTIND      0
ADTOCC      0
AHGA        0
AMARITL     0
AMJIND      0
AMJOCC      0
ASEX        0
AWKSTAT     0
FILESTAT    0
HHDFMX      0
HHDREL      0
MIGSAME     0
NOEMP       0
WKSWORK     0
YEAR        0
dtype: int64

In [653]:
def column_modifications(df):
    """Impute missing values with each column's mode, then one-hot encode every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; imputation happens on a copy.

    Returns
    -------
    pandas.DataFrame
        Frame of indicator columns produced by ``pd.get_dummies`` over all
        columns of the imputed table (numeric columns are encoded too, because
        every column name is passed in ``columns``).
    """
    # Copy first so filling NaNs does not write into the caller's frame.
    df = df.copy()
    for col in df.columns:
        if df[col].isna().any():
            modes = df[col].mode()
            # A column with no non-missing values has no mode; leave it as-is
            # instead of failing on the lookup of the first mode.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])
    return pd.get_dummies(df, columns=list(df.columns))


In [654]:
# Impute and one-hot encode the remaining features, then report the encoded shape.
data = column_modifications(df=data)
print(data.shape)

(199523, 257)


CLUSTERING

In [655]:
# # print(data.isna().sum())
# column_modes = data.mode()
# upd_data_cols = data.columns
# for col in upd_data_cols:
#     missing_vals = data[col].isna().sum()
#     if(missing_vals > 0):
#         data[col] = data[col].fillna(column_modes[col][0])
# # data.isna().sum()