# Imports and Reading the data

In [2]:
# Loading required packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt # pyplot as plt
import itertools
import shap

from dython.nominal import theils_u, correlation_ratio
import category_encoders as ce
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, roc_curve, roc_auc_score, precision_recall_curve

from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from pandas.api.types import is_numeric_dtype
from IPython.display import display, HTML
import matplotlib.ticker as tick
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [4]:
# Read the income-evaluation CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; it has to be
# adjusted before the notebook can run anywhere else.
csv_path = r"C:\Users\Priya  Sharma\OneDrive\Desktop\Practise\Datasets\Income_Classification\income_evaluation.csv"
data = pd.read_csv(csv_path)

# Report the (rows, columns) dimensions, then preview the first five rows
print("shape of the data:", data.shape)
data.head()

shape of the data: (32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Research

**Each entry contains the following information about an individual:**

1.	**age**: the age of an individual: Integer greater than 0
    
2.	**workclass**: a general term to represent the employment status of an individual: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
    
3.	**fnlwgt**: final weight. In other words, this is the number of people the census believes the entry represents: Integer greater than 0 
    
4.	**education**: the highest level of education achieved by an individual: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
    
5.	**education-num**: the highest level of education achieved in numerical form: Integer greater than 0
    
6.	**marital-status**: marital status of an individual. Married-civ-spouse corresponds to a civilian spouse while Married-AF-spouse is a spouse in the Armed Forces: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
    
7.	**occupation**: the general type of occupation of an individual: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
    
8.	**relationship**: represents what this individual is relative to others. For example, an individual could be a Husband. Each entry has only one relationship attribute, and it is somewhat redundant with marital status, so we might not make use of this attribute: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
    
9.	**race**: Descriptions of an individual’s race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
    
10.	**sex**: the biological sex of the individual: Male, Female
    
11.	**capital-gain**: capital gains for an individual: Integer greater than or equal to 0
    
12.	**capital-loss**: capital loss for an individual: Integer greater than or equal to 0
    
13.	**hours-per-week**: the hours an individual has reported to work per week: continuous
    
14.	**native-country**: country of origin for an individual: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad &Tobago, Peru, Hong, Holand-Netherlands.
    
15.	**income** (the label): whether or not an individual makes more than USD 50,000 annually: <=50K, >50K


For a reference on the column names, see http://cseweb.ucsd.edu/classes/sp15/cse190-c/reports/sp15/048.pdf

# Functions

In [3]:
def clean_column_names(df):
    """
    Normalise the column labels of ``df`` in place.

    Every label is upper-cased, its hyphens are replaced by underscores, and
    leading/trailing whitespace is stripped.

    Args:
        df: DataFrame whose ``columns`` attribute is overwritten.

    Returns:
        The updated ``df.columns`` index (not the DataFrame itself).
    """
    normalised = df.columns.str.upper().str.replace('-', '_').str.strip()
    df.columns = normalised
    return df.columns

In [4]:
def reformat_large_tick_values(tick_val, pos):
    """
    Turns large tick values (in the billions, millions and thousands) such as 4500 into 4.5K and also appropriately
    turns 4000 into 4K (no zero after the decimal).

    The unit is picked from the magnitude of the value, so negative ticks are abbreviated as well
    (-4500 becomes -4.5K instead of being left as -4500).

    Args:
        tick_val: Numeric tick value to format.
        pos: Tick position. Accepted but not used; presumably present so the function matches the
            (value, position) callback signature of matplotlib.ticker.FuncFormatter -- confirm against the caller.

    Returns:
        The formatted tick label as a string.
    """
    # Choose the unit from the absolute value so that the sign does not push negative numbers
    # into the "< 1000" branch; the sign itself is kept by dividing the original value.
    magnitude = abs(tick_val)

    if magnitude >= 1000000000:
        val = round(tick_val/1000000000, 1)
        new_tick_format = '{:}B'.format(val)

    elif magnitude >= 1000000:
        val = round(tick_val/1000000, 1)
        new_tick_format = '{:}M'.format(val)

    elif magnitude >= 1000:
        val = round(tick_val/1000, 1)
        new_tick_format = '{:}K'.format(val)

    elif magnitude < 1000:
        new_tick_format = round(tick_val, 1)

    else:
        # Only reached when every comparison above is False (e.g. NaN): pass the value through.
        new_tick_format = tick_val

    # make new_tick_format into a string value
    new_tick_format = str(new_tick_format)

    # code below will keep 4.5M as is but change values such as 4.0M to 4M since that zero after the decimal isn't needed
    index_of_decimal = new_tick_format.find(".")

    if index_of_decimal != -1:
        value_after_decimal = new_tick_format[index_of_decimal + 1]
        if value_after_decimal == "0":
            # remove the 0 after the decimal point since it's not needed
            new_tick_format = new_tick_format[0: index_of_decimal] + new_tick_format[index_of_decimal + 2:]

    return new_tick_format

In [5]:
def compute_correlation(df:'pd.DataFrame')->'pd.DataFrame':
    """
    Function to compute a pair-wise association matrix over all columns of ``df``:
    - Pearson correlation (numeric-numeric), via ``np.corrcoef``
    - Correlation Ratio (numeric-categorical), via ``correlation_ratio``
    - Theil's U (categorical-categorical), via ``theils_u``

    A column is treated as numeric when ``is_numeric_dtype`` says so; every other
    column is treated as categorical.

    Args:
        df: Dataset for which to compute correlation matrix

    Returns:
        Correlation matrix as a float64 dataframe, rounded to 2 decimals, indexed
        and labelled by the columns of ``df``. Cell (row, col) holds the measure
        for the pair (row column, col column).
    """
    # Get list of columns
    list_of_columns = df.columns

    # Classify every column once instead of re-checking the dtype for each pair
    numeric_flags = {name: is_numeric_dtype(df[name]) for name in list_of_columns}

    # Initialize empty dataframe for correlation matrix
    corr_df = pd.DataFrame(index=list_of_columns, columns=list_of_columns)

    # Note: We are iterating over (row, col) and (col, row) separately as Theil's U is not symmetric
    for row_name in list_of_columns:
        for col_name in list_of_columns:
            row_is_numeric = numeric_flags[row_name]
            col_is_numeric = numeric_flags[col_name]

            if row_is_numeric and col_is_numeric:
                # Case 1: Both are numeric
                corr_value = np.corrcoef(df[row_name], df[col_name])[0][1]
            elif row_is_numeric:
                # Case 2a: row is numeric, column is categorical
                # (the categorical series is always passed first, the numeric one second)
                corr_value = correlation_ratio(df[col_name], df[row_name])
            elif col_is_numeric:
                # Case 2b: row is categorical, column is numeric
                corr_value = correlation_ratio(df[row_name], df[col_name])
            else:
                # Case 3: Both are categorical
                corr_value = theils_u(df[row_name], df[col_name])

            corr_df.loc[row_name, col_name] = corr_value

    # corr_df was created without a dtype, so cast to float before rounding
    return(pd.DataFrame(corr_df.astype('float64').round(2)))
