In [None]:
# Sneha Dubey
# Dr. Jagota
# CSEN 281
# 21 October 2024

In [2]:
# Imports
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
from sklearn.feature_selection import mutual_info_classif

In [3]:
# Loading In & Cleaning the Data

In [4]:
columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

filename = "adult.data"
originalData = pd.read_csv(filename, header = None)

originalData.columns = columns

originalData.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
cleanedData = originalData.drop(columns=['fnlwgt'])
cleanedData = cleanedData.dropna()
cleanedData.columns

Index(['age', 'workclass', 'education', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'income'],
      dtype='object')

In [6]:
cleanedData['income'] = cleanedData['income'].str.strip()
cleanedData['encodedIncome'] = cleanedData['income'].map({'>50K': 1, '<=50K': 0})
cleanedData.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,encodedIncome
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0


In [7]:
numericFeatures = ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
categoricalFeatures = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

numericData = cleanedData[["age", "education-num", "capital-gain", "capital-loss", "hours-per-week", "encodedIncome"]]
categoricalData = cleanedData[["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country", "encodedIncome"]]

In [8]:
# Problem 1 - Proximity Measures for Numerical Features

In [9]:
# Mutual Information

mutualInformation = mutual_info_classif(numericData[numericFeatures], numericData['encodedIncome'])
mutualInformationFormatted = dict(zip(numericFeatures, mutualInformation))

numericFeatureMIRankings = sorted(mutualInformationFormatted.items(), key=lambda x: x[1], reverse=True)

print("Mutual Information Rankings:")
for i in range (0, len(numericFeatureMIRankings)):
    print(i+1, numericFeatureMIRankings[i][0], "-", numericFeatureMIRankings[i][1])

Mutual Information Rankings:
1 capital-gain - 0.08164260611218443
2 education-num - 0.06857868782498189
3 age - 0.06730755077323769
4 hours-per-week - 0.03767839557785968
5 capital-loss - 0.03169207947226549


In [10]:
# Correlation

correlationMatrix = numericData.corr()['encodedIncome'].drop('encodedIncome')
numericFeatureCMRankings = correlationMatrix.sort_values(ascending=False)

print("Correlation Matrix Rankings:")
for i in range (0, len(numericFeatureCMRankings)):
    print(i+1, numericFeatureCMRankings.index[i], "-", numericFeatureCMRankings[numericFeatureCMRankings.index[i]])

Correlation Matrix Rankings:
1 education-num - 0.33515395269094045
2 age - 0.234037102648857
3 hours-per-week - 0.22968906567081132
4 capital-gain - 0.22332881819538056
5 capital-loss - 0.15052631177035342


In [11]:
# Problem 2 - Proximity Measures for Categorical Features

In [12]:
# Simple Matching Coefficient

smc = {feature: sum(categoricalData[feature] == categoricalData['encodedIncome']) / len(categoricalData[feature]) for feature in categoricalFeatures}

categoricalFeatureSMCRankings = sorted(smc.items(), key=lambda x: x[1], reverse=True)

print("Simple Matching Coefficient Rankings:")
for i in range (0, len(categoricalFeatureSMCRankings)):
    print(i+1, categoricalFeatureSMCRankings[i][0], "-", categoricalFeatureSMCRankings[i][1])

Simple Matching Coefficient Rankings:
1 workclass - 0.0
2 education - 0.0
3 marital-status - 0.0
4 occupation - 0.0
5 relationship - 0.0
6 race - 0.0
7 sex - 0.0
8 native-country - 0.0


In [13]:
# Chi-Square Test of Independence

chi2 = {feature: chi2_contingency(pd.crosstab(categoricalData[feature], categoricalData['encodedIncome']))[0] for feature in categoricalFeatures}

categoricalFeatureCSRankings = sorted(chi2.items(), key=lambda x: x[1], reverse=True)

print("Chi-Square Rankings:")
for i in range (0, len(categoricalFeatureCSRankings)):
    print(i+1, categoricalFeatureCSRankings[i][0], "-", categoricalFeatureCSRankings[i][1])

Chi-Square Rankings:
1 relationship - 6699.07689685885
2 marital-status - 6517.741653663022
3 education - 4429.653302288619
4 occupation - 4031.974280247181
5 sex - 1517.813409134445
6 workclass - 1045.7085997281692
7 race - 330.9204310085741
8 native-country - 317.2303857833171
