#### Load Data

In [1]:
import pandas as pd
import numpy as np

dfTrain = pd.read_csv('trained.csv')
dfTest = pd.read_csv('tested.csv')

#### Encoder method 1: use one-hot encoder

In [2]:
def oneHotBind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[feature_to_encode])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop(feature_to_encode, axis=1)
    return(res)

In [3]:
dfTrain = oneHotBind(dfTrain,['workclass','education','marital-status','occupation','relationship','race','sex','native-country'])
dfTest  = oneHotBind(dfTest, ['workclass','education','marital-status','occupation','relationship','race','sex','native-country'])

In [4]:
for attributes in dfTrain.keys():
    if attributes not in dfTest.keys():
        print("Adding missing feature {}".format(attributes))
        dfTest[attributes] = 0

Adding missing feature native-country_Holand-Netherlands


#### Encoder method 2: use LabelEncoder

In [3]:
from sklearn import preprocessing

def encode_features(df_train, df_test):
    features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race','sex','native-country']
    df_combined = pd.concat([df_train[features], df_test[features]])
    
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test

In [5]:
dfTrain, dfTest = encode_features(dfTrain, dfTest)
X_train,Y_train = dfTrain.iloc[:,:-1].values, dfTrain.iloc[:, -1].values
X_test,Y_test = dfTest.iloc[:, :-1].values, dfTest.iloc[:, -1].values

#### Set up training and testing dataset

In [5]:
X_train,Y_train = dfTrain.iloc[:,1:].values, dfTrain.iloc[:, 0].values

In [6]:
X_test,Y_test = dfTest.iloc[:, 1:].values, dfTest.iloc[:, 0].values

### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

tree = DecisionTreeClassifier()
tree.fit(X_train, Y_train)
predictions = tree.predict(X_test)

print("Accuracy: " + str(accuracy_score(Y_test, predictions)))
print(classification_report(Y_test, predictions))

Accuracy: 0.8073705179282868
              precision    recall  f1-score   support

       <=50K       0.85      0.90      0.88     11360
        >50K       0.63      0.52      0.57      3700

   micro avg       0.81      0.81      0.81     15060
   macro avg       0.74      0.71      0.72     15060
weighted avg       0.80      0.81      0.80     15060



### Naive Bayes

In [8]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, Y_train)
predictions = gnb.predict(X_test)

print("Accuracy: " + str(accuracy_score(Y_test, predictions)))
print(classification_report(Y_test, predictions))

Accuracy: 0.74800796812749
              precision    recall  f1-score   support

       <=50K       0.76      0.98      0.85     11360
        >50K       0.37      0.03      0.06      3700

   micro avg       0.75      0.75      0.75     15060
   macro avg       0.56      0.51      0.46     15060
weighted avg       0.66      0.75      0.66     15060



## ScikitLearn Method Q2

#### Load and Clean Data

In [9]:
import pandas as pd
import numpy as np

dfTrain = pd.read_csv('trainOri.csv',skipinitialspace=True)
dfTest = pd.read_csv('testOri.csv',skipinitialspace=True)
dfTest["income"] = dfTest["income"].str.replace(".","")
dfTrain = dfTrain[(dfTrain.values !='?').all(axis=1)]
dfTest = dfTest[(dfTest.values !='?').all(axis=1)]

#### Convert to binary value for numerical attributed based on their mean value

In [10]:
def numericalBinary(dataset, features):
    dataset[features] = np.where(dataset[features] >= dataset[features].mean(), 1,0)

In [11]:
numericalBinary(dfTrain,['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week'])
numericalBinary(dfTest,['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week'])

#### one-hot encoding for categorical attribute

In [12]:
def oneHotBind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[feature_to_encode])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop(feature_to_encode, axis=1)
    return(res)


In [13]:
dfTrain = oneHotBind(dfTrain,['workclass','education','marital-status','occupation','relationship','race','sex','native-country'])
dfTest  = oneHotBind(dfTest, ['workclass','education','marital-status','occupation','relationship','race','sex','native-country'])

#### Tricky: Testing dataset doesn't contain all attributes in training dataset 

In [14]:
for attributes in dfTrain.keys():
    if attributes not in dfTest.keys():
        print("Adding missing feature {}".format(attributes))
        dfTest[attributes] = 0

Adding missing feature native-country_Holand-Netherlands


In [15]:
def encode_income(dataset):
    le = preprocessing.LabelEncoder()
    le = le.fit(dataset['income'])
    dataset['income'] = le.transform(dataset['income'])
    return dataset

### K-Means

In [16]:
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn import preprocessing
encode_income(dfTrain)
encode_income(dfTest)

X_train = dfTrain.loc[:,dfTrain.columns !='income'].values
Y_train = dfTrain['income'].values
X_test = dfTest.loc[:,dfTest.columns !='income'].values
Y_test = dfTest['income'].values

In [18]:
kmeans = KMeans(n_clusters=3)
kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [19]:
score = metrics.accuracy_score(Y_train,kmeans.predict(X_train))
print('Accuracy:{0:f}'.format(score))

Accuracy:0.273158


In [20]:
cluster_map = pd.DataFrame()
cluster_map['data_index'] = dfTrain.index.values
cluster_map['cluster'] = kmeans.labels_

In [22]:
kmeans.cluster_centers_

array([[ 2.79105431e-01,  4.78466454e-01,  2.74249201e-01,
         5.38019169e-02,  3.46325879e-02,  2.84984026e-01,
         2.70926518e-02,  4.93290735e-02,  7.88370607e-01,
         2.41533546e-02,  7.33546326e-02,  3.71884984e-02,
         5.11182109e-04,  3.47603834e-02,  4.89456869e-02,
         2.03194888e-02,  5.36741214e-03,  1.16293930e-02,
         1.63578275e-02,  1.82747604e-02,  3.11821086e-02,
         3.46325879e-02,  1.52971246e-01,  6.90095847e-03,
         3.39808307e-01,  3.64217252e-02,  2.30031949e-03,
         1.21405751e-02,  2.27987220e-01,  2.15335463e-01,
         3.23092247e-17,  4.98402556e-03,  2.31309904e-02,
         6.91884984e-01,  4.66453674e-02,  1.80191693e-02,
         7.71884984e-02,  7.66773163e-04,  1.69329073e-01,
         8.85623003e-02,  4.69009585e-02,  9.61022364e-02,
         7.01597444e-02,  1.21022364e-01,  8.94568690e-04,
         1.00830671e-01,  2.50479233e-02,  1.08242812e-01,
         2.84984026e-02,  6.64536741e-02,  1.20459198e-1

### KNN

In [15]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

dfTrain = pd.read_csv('trainOri.csv',skipinitialspace=True)
dfTest = pd.read_csv('testOri.csv',skipinitialspace=True)[-11:]
dfTest["income"] = dfTest["income"].str.replace(".","")
dfTrain = dfTrain[(dfTrain.values !='?').all(axis=1)]
dfTest = dfTest[(dfTest.values !='?').all(axis=1)]

# Convert to binary value for numerical attributes based on their mean value
def numericalBinary(dataset, features):
    dataset[features] = np.where(dataset[features] >= dataset[features].mean(), 1,0)

numericalBinary(dfTrain,['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week'])
numericalBinary(dfTest,['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week'])  

# One-hot encoding for categorical attribute
def oneHotBind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[feature_to_encode])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop(feature_to_encode, axis=1)
    return(res)
dfTrain = oneHotBind(dfTrain,['workclass','education','marital-status','occupation','relationship','race','sex','native-country'])
dfTest  = oneHotBind(dfTest, ['workclass','education','marital-status','occupation','relationship','race','sex','native-country'])


#### Tricky: Testing dataset doesn't contain all attributes in training dataset

In [17]:
dfTrain

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Private,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,1,0,1,1,0,0,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,1,0,0,0,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,1,1,0,0,0,0,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0,1,1,0,0,0,<=50K,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0,1,1,0,0,0,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
6,1,0,0,0,0,0,<=50K,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,1,1,0,0,0,1,>50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,0,0,1,1,0,1,>50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
9,1,0,1,1,0,0,>50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [12]:
for attributes in dfTrain.keys():
    if attributes not in dfTest.keys():
        print("Adding missing feature {}".format(attributes))
        dfTest[attributes] = 0

Adding missing feature workclass_Federal-gov
Adding missing feature workclass_Self-emp-not-inc
Adding missing feature workclass_State-gov
Adding missing feature workclass_Without-pay
Adding missing feature education_10th
Adding missing feature education_11th
Adding missing feature education_12th
Adding missing feature education_1st-4th
Adding missing feature education_5th-6th
Adding missing feature education_7th-8th
Adding missing feature education_9th
Adding missing feature education_Assoc-acdm
Adding missing feature education_Assoc-voc
Adding missing feature education_Doctorate
Adding missing feature education_Preschool
Adding missing feature education_Prof-school
Adding missing feature education_Some-college
Adding missing feature marital-status_Married-AF-spouse
Adding missing feature marital-status_Married-spouse-absent
Adding missing feature marital-status_Separated
Adding missing feature marital-status_Widowed
Adding missing feature occupation_Armed-Forces
Adding missing feature

In [11]:
def encode_income(dataset):
    le = preprocessing.LabelEncoder()
    le = le.fit(dataset['income'])
    dataset['income'] = le.transform(dataset['income'])
    return dataset

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Convert income to binary
encode_income(dfTrain)
encode_income(dfTest)

X_train = dfTrain.loc[:,dfTrain.columns !='income'].values
Y_train = dfTrain['income'].values
X_test = dfTest.loc[:,dfTest.columns !='income'].values
Y_test = dfTest['income'].values

# Make predictions on validation dataset
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)
predictions = knn.predict(X_test)

print("Accuracy: " + str(accuracy_score(Y_test, predictions)))
print(classification_report(Y_test, predictions))


Accuracy: 0.8
              precision    recall  f1-score   support

           0       1.00      0.78      0.88         9
           1       0.33      1.00      0.50         1

   micro avg       0.80      0.80      0.80        10
   macro avg       0.67      0.89      0.69        10
weighted avg       0.93      0.80      0.84        10



### Support Vector Machine (SVM)

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

dfTrain = pd.read_csv('trainOri.csv',skipinitialspace=True)
dfTest = pd.read_csv('testOri.csv',skipinitialspace=True)
dfTest["income"] = dfTest["income"].str.replace(".","")
dfTrain = dfTrain[(dfTrain.values !='?').all(axis=1)]
dfTest = dfTest[(dfTest.values !='?').all(axis=1)]

# Convert to binary value for numerical attributed based on their mean value
def numericalBinary(dataset, features):
    dataset[features] = np.where(dataset[features] >= dataset[features].mean(), 1,0)

numericalBinary(dfTrain,['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week'])
numericalBinary(dfTest,['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week'])  

# One-hot encoding for categorical attribute
def oneHotBind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[feature_to_encode])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop(feature_to_encode, axis=1)
    return(res)
dfTrain = oneHotBind(dfTrain,['workclass','education','marital-status','occupation','relationship','race','sex','native-country'])
dfTest  = oneHotBind(dfTest, ['workclass','education','marital-status','occupation','relationship','race','sex','native-country'])


In [2]:
for attributes in dfTrain.keys():
    if attributes not in dfTest.keys():
        print("Adding missing feature {}".format(attributes))
        dfTest[attributes] = 0

Adding missing feature native-country_Holand-Netherlands


In [3]:
def encode_income(dataset):
    le = preprocessing.LabelEncoder()
    le = le.fit(dataset['income'])
    dataset['income'] = le.transform(dataset['income'])
    return dataset

In [6]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Convert income to binary
encode_income(dfTrain)
encode_income(dfTest)

X_train = dfTrain.loc[:,dfTrain.columns !='income'].values
Y_train = dfTrain['income'].values
X_test = dfTest.loc[:,dfTest.columns !='income'].values
Y_test = dfTest['income'].values

svm = SVC(gamma='auto')
svm.fit(X_train, Y_train)
predictions = svm.predict(X_test)

print("Accuracy: " + str(accuracy_score(Y_test, predictions)))
print(classification_report(Y_test, predictions))


Accuracy: 0.8391102257636123
              precision    recall  f1-score   support

           0       0.86      0.94      0.90     11360
           1       0.75      0.52      0.62      3700

   micro avg       0.84      0.84      0.84     15060
   macro avg       0.80      0.73      0.76     15060
weighted avg       0.83      0.84      0.83     15060



### Neural Network

In [23]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

dfTrain = pd.read_csv('trainOri.csv',skipinitialspace=True)
dfTest = pd.read_csv('testOri.csv',skipinitialspace=True)
dfTest["income"] = dfTest["income"].str.replace(".","")
dfTrain = dfTrain[(dfTrain.values !='?').all(axis=1)]
dfTest = dfTest[(dfTest.values !='?').all(axis=1)]

# Convert to binary value for numerical attributed based on their mean value
def numericalBinary(dataset, features):
    dataset[features] = np.where(dataset[features] >= dataset[features].mean(), 1,0)

numericalBinary(dfTrain,['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week'])
numericalBinary(dfTest,['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week'])  

# One-hot encoding for categorical attribute
def oneHotBind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[feature_to_encode])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop(feature_to_encode, axis=1)
    return(res)
dfTrain = oneHotBind(dfTrain,['workclass','education','marital-status','occupation','relationship','race','sex','native-country'])
dfTest  = oneHotBind(dfTest, ['workclass','education','marital-status','occupation','relationship','race','sex','native-country'])


In [24]:
for attributes in dfTrain.keys():
    if attributes not in dfTest.keys():
        print("Adding missing feature {}".format(attributes))
        dfTest[attributes] = 0

Adding missing feature native-country_Holand-Netherlands


In [25]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier



X_train = dfTrain.loc[:,dfTrain.columns !='income'].values
Y_train = dfTrain['income'].values
X_test = dfTest.loc[:,dfTest.columns !='income'].values
Y_test = dfTest['income'].values

mlp = MLPClassifier(hidden_layer_sizes=(104,104,104))
mlp.fit(X_train,Y_train)

predictions = mlp.predict(X_test)
print("Accuracy: " + str(accuracy_score(Y_test, predictions)))
print(classification_report(Y_test,predictions))

Accuracy: 0.8125498007968127
              precision    recall  f1-score   support

       <=50K       0.87      0.89      0.88     11360
        >50K       0.63      0.59      0.61      3700

   micro avg       0.81      0.81      0.81     15060
   macro avg       0.75      0.74      0.74     15060
weighted avg       0.81      0.81      0.81     15060

