In [2]:
import numpy as np
import pandas as pd


POS_STR = ' >50K'

train_file_path='adult_data.csv'
test_file_path='adult_test.csv'



def load_train_data(train_file_path, valid_rate=0.25, is_df=True):
    data_frame = pd.read_csv(train_file_path).sample(frac=1, random_state=11)
    np.random.seed(11)
    mask = np.random.rand(len(data_frame)) < 1 - valid_rate
    train_df, valid_df = data_frame.iloc[mask, :], data_frame.iloc[~mask, :]
    if is_df:
        return train_df, valid_df

    train_labels = [1 if x == POS_STR else 0 for x in train_df.iloc[:, 14].values]
    valid_labels = [1 if x == POS_STR else 0 for x in valid_df.iloc[:, 14].values]
    return train_df.iloc[:, :14].values, np.array(train_labels), valid_df.iloc[:, :14].values, np.array(valid_labels)


def load_test_data(test_file_path, is_df=True):
    data_frame = pd.read_csv(test_file_path)
    if is_df:
        return data_frame

    test_labels = [1 if x == POS_STR else 0 for x in data_frame.iloc[:, 14].values]
    return data_frame.iloc[:, :14].values, np.array(test_labels)


In [3]:
Xtr,Xva=load_train_data(train_file_path)

In [4]:
Xte=load_test_data(test_file_path)

In [5]:
Xtr.corr()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
age,1.0,-0.078252,0.030518,0.080028,0.053824,0.067805
fnlwgt,-0.078252,1.0,-0.040856,-0.000322,-0.012357,-0.018081
education-num,0.030518,-0.040856,1.0,0.124092,0.078441,0.154133
capital-gain,0.080028,-0.000322,0.124092,1.0,-0.032003,0.081663
capital-loss,0.053824,-0.012357,0.078441,-0.032003,1.0,0.051736
hours-per-week,0.067805,-0.018081,0.154133,0.081663,0.051736,1.0


In [6]:
Xtr.drop('education',axis=1,inplace=True)
Xva.drop('education',axis=1,inplace=True)
Xte.drop('education',axis=1,inplace=True)

In [61]:
def gen_onehot_features(X,featureList):
    for feature in featureList:
        get_dummy_features= pd.get_dummies(X[feature])
        X=pd.concat([X,get_dummy_features],axis=1)
    return X  


In [7]:
Xtr['income'] = Xtr['income'].map( {' <=50K':0, ' >50K':1} )

Xte['income'] = Xte['income'].map( {' <=50K.':0, ' >50K.':1} )

Xva['income'] = Xva['income'].map( {' <=50K':0, ' >50K':1} )

In [8]:
Xtr.columns = [
   "Age", "WorkClass", "fnlwgt", 'EducationNum',
   'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender',
   'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income'
]
Xva.columns = [
   "Age", "WorkClass", "fnlwgt", 'EducationNum',
   'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender',
   'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income'
]
Xte.columns = [
   "Age", "WorkClass", "fnlwgt", 'EducationNum',
   'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender',
   'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income'
]

In [9]:
# Convert income to 0 and 1
Ytr = Xtr["Income"].values
Xtr.drop("Income", axis=1, inplace=True,)

In [10]:
# Convert income to 0 and 1
Yva = Xva["Income"].values
Xva.drop("Income", axis=1, inplace=True,)

In [11]:
# Convert income to 0 and 1
Yte = Xte["Income"].values
Xte.drop("Income", axis=1, inplace=True,)

In [12]:
Xtr.Age = Xtr.Age.astype(float)
Xtr.fnlwgt = Xtr.fnlwgt.astype(float)
Xtr.EducationNum = Xtr.EducationNum.astype(float)
Xtr.HoursPerWeek = Xtr.HoursPerWeek.astype(float)

Xva.Age = Xva.Age.astype(float)
Xva.fnlwgt = Xva.fnlwgt.astype(float)
Xva.EducationNum = Xva.EducationNum.astype(float)
Xva.HoursPerWeek = Xva.HoursPerWeek.astype(float)

Xte.Age = Xte.Age.astype(float)
Xte.fnlwgt = Xte.fnlwgt.astype(float)
Xte.EducationNum = Xte.EducationNum.astype(float)
Xte.HoursPerWeek = Xte.HoursPerWeek.astype(float)

In [13]:
Xtr = pd.get_dummies(Xtr, columns=[
   "WorkClass",
   'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender', 'NativeCountry',
])

Xva = pd.get_dummies(Xva, columns=[
   "WorkClass",
   'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender', 'NativeCountry',
])

Xte = pd.get_dummies(Xte, columns=[
   "WorkClass",
   'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender', 'NativeCountry',
])

In [14]:
Xva.insert(loc=65, column='NativeCountry_ Holand-Netherlands', value=0)
Xte.insert(loc=65, column='NativeCountry_ Holand-Netherlands', value=0)

In [15]:
# Standardize data (0 mean, 1 stdev)
from sklearn.preprocessing import StandardScaler
import pandas
import numpy

def standardization(X):
    Xscaler = X.filter(["Age", "fnlwgt", "EducationNum", "HoursPerWeek"],axis=1)
    array = Xscaler.values
    scaler = StandardScaler().fit(Xscaler)
    rescaledX = scaler.transform(Xscaler)
    return rescaledX  

Xtr_scaled = standardization(Xtr)
Xtr.Age = Xtr_scaled[:,0]
Xtr.fnlwgt = Xtr_scaled[:,1]
Xtr.EducationNum = Xtr_scaled[:,2]
Xtr.HoursPerWeek = Xtr_scaled[:,3]

Xva_scaled = standardization(Xva)
Xva.Age = Xva_scaled[:,0]
Xva.fnlwgt = Xva_scaled[:,1]
Xva.EducationNum = Xva_scaled[:,2]
Xva.HoursPerWeek = Xva_scaled[:,3]

Xte_scaled = standardization(Xte)
Xte.Age = Xte_scaled[:,0]
Xte.fnlwgt = Xte_scaled[:,1]
Xte.EducationNum = Xte_scaled[:,2]
Xte.HoursPerWeek = Xte_scaled[:,3]


In [16]:
# Normalize data (length of 1)
from sklearn.preprocessing import Normalizer
import pandas
import numpy

def normalization(X):
    Xnormal = X.filter(["CapitalGain","CapitalLoss"],axis=1)
    scaler = Normalizer().fit(Xnormal)
    normalizedX = scaler.transform(Xnormal)
    return normalizedX

Xtr_normal = normalization(Xtr)
Xtr.CapitalGain = Xtr_normal[:,0]
Xtr.CapitalLoss = Xtr_normal[:,1]

Xva_normal = normalization(Xva)
Xva.CapitalGain= Xva_normal[:,0]
Xva.CapitalLoss= Xva_normal[:,1]

Xte_normal = normalization(Xte)
Xte.CapitalGain= Xte_normal[:,0]
Xte.CapitalLoss= Xte_normal[:,1]

In [22]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

clf = LogisticRegression(random_state=0).fit(Xtr, Ytr)
Ytrhat = clf.predict(Xtr)
auc_tr = roc_auc_score(Ytr, Ytrhat)
accuracy_tr = clf.score(Xtr, Ytr)
print(f"AUC for training data {auc_tr}")
print(f"Acuracy for training data {accuracy_tr}")

AUC for training data 0.7573724971483353
Acuracy for training data 0.8443452259042313


In [23]:
Yvahat = clf.predict(Xva)
auc_val = roc_auc_score(Yva, Yvahat)
accuracy_val = clf.score(Xva, Yva)
print(f"AUC for validation data {auc_val}")
print(f"Acuracy for validation data {accuracy_val}")

AUC for validation data 0.7661489799492164
Acuracy for validation data 0.8485517918507609


In [24]:
Ytehat = clf.predict(Xte)
auc_test = roc_auc_score(Yte, Ytehat)
accuracy_test = clf.score(Xte, Yte)
print(f"AUC for test data {auc_test}")
print(f"Acuracy for test data {accuracy_test}")

AUC for test data 0.756698764934916
Acuracy for test data 0.8458939868558443
