In [1]:
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KernelDensity
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
#Loading Data into Dataframe
#NOTE(review): read_csv is called without header=None, so pandas takes the first line of
#'adult.data' as the column names. The renames in the next cell use keys such as ' State-gov',
#' 2174' and ' <=50K', which look like the values of a data record rather than real headers,
#so one record is presumably consumed as the header row -- confirm against the file.
#Passing header=None with explicit names would keep that record, but the rename keys in the
#next cell would have to be updated at the same time.
df1 = pd.read_csv('adult.data')

# Adult Dataset

In [3]:
#Drop rows with missing values.
#Missing entries are marked with ' ?' (leading space included); they are turned into NaN so
#dropna() can remove the affected rows. np.nan is used because the np.NaN alias was removed
#in NumPy 2.0. replace() is not done in place, so the frame produced by the loading cell is
#not mutated; df1 is simply rebound to the cleaned frame.
df1 = df1.replace({' ?': np.nan}).dropna()

#Rename columns for legibility.
#The incoming labels are raw values with a leading space (e.g. ' State-gov', ' 2174').
adultdf1 = df1.rename(columns={
    ' State-gov': 'State_gov',
    ' Bachelors': 'Bachelors',
    ' Never-married': 'Marital_status',
    ' Adm-clerical': 'Adm_clerical',
    ' Not-in-family': 'Family_status',
    ' 2174': 'Capital_gain',
    ' 0': 'Capital_loss',
    ' 40': 'Hours/week',
    ' United-States': 'Country',
    ' <=50K': 'Income',
})

#Education is ordinal, so it is mapped to integer ranks with .map instead of being one-hot-encoded.
#Values missing from the map become NaN and are removed by the final dropna().
year_ord_map = {' Preschool': 1, ' 1st-4th': 2, ' 5th-6th': 3,
                ' 7th-8th': 4, ' 9th': 5, ' 10th': 6, ' 11th': 7, ' 12th': 8, ' HS-grad': 9,
                ' Some-college': 10, ' Assoc-voc': 11, ' Assoc-acdm': 12, ' Bachelors': 13,
                ' Prof-school': 14, ' Masters': 15, ' Doctorate': 16}
adultdf1['Bachelors'] = adultdf1['Bachelors'].map(year_ord_map)

#Income becomes the class label: 0 for ' <=50K' and 1 for ' >50K'.
income_ord_map = {' <=50K': 0, ' >50K': 1}
adultdf1['Income'] = adultdf1['Income'].map(income_ord_map)

#One-hot-encode every nominal column into its own dataframe, collected in a list so they are
#easy to join later. OneHotEncoder handles string categories directly and orders them the same
#way a LabelEncoder would, so no separate label-encoding pass is needed.
categorical_columns = ['State_gov', 'Marital_status', 'Adm_clerical', 'Family_status',
                       ' White', ' Male', 'Country']
dfs = []
for column in categorical_columns:
    encoder = OneHotEncoder(handle_unknown='ignore')
    dfs.append(pd.DataFrame(encoder.fit_transform(adultdf1[[column]]).toarray()))

#Drop the original categorical columns (now represented by the one-hot frames) together with
#the columns that are not used as features (' 13', Capital_gain, Capital_loss).
adultdf1 = adultdf1.drop(columns=categorical_columns + [' 13', 'Capital_gain', 'Capital_loss'])

#Combine the one-hot-encoded data with the remaining columns. The index is reset first because
#dropna() left gaps in it, while the one-hot frames are numbered 0..n-1; join aligns on index.
dfss = pd.concat(dfs, axis=1)
adultdf1 = dfss.join(adultdf1.reset_index(drop=True))
adultdf1 = adultdf1.dropna()

#The model cells take the last array column as the label, so make sure it is Income.
assert adultdf1.columns[-1] == 'Income', "Income must be the last column"

#Convert the fully numerical dataframe into an array.
adult_arr = adultdf1.to_numpy()

In [4]:
#Accuracy function. Compares predicted labels vs actual labels and counts accuracy.
def prediction(predictions, Y_given):
    """Return the fraction of predicted labels that equal the actual labels.

    Parameters
    ----------
    predictions : iterable
        Predicted labels.
    Y_given : iterable
        Actual labels, in the same order as ``predictions``.

    Returns
    -------
    float
        ``1 - wrong / total``, where ``total`` is the number of compared pairs.
        Pairs are formed with ``zip``, so when the inputs differ in length the
        surplus items of the longer one are ignored.

    Raises
    ------
    ValueError
        If there is nothing to compare (either input is empty).
    """
    pairs = list(zip(predictions, Y_given))
    if not pairs:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError("prediction() needs at least one (predicted, actual) pair")
    wrong = sum(1 for predicted, actual in pairs if predicted != actual)
    # Kept as 1 - error rate (rather than correct / total) so results are bit-identical
    # to the values this function has always produced.
    return 1 - (wrong / len(pairs))

# SVM

In [5]:
#Three lists to store training accuracy for each trial, test accuracy for each trial, and best parameters for each trial.
SVM_train_accuracy = []
SVM_test_accuracy = []
SVM_best_params = []

#Param-grid. Defined once because it is identical for every trial.
svm_C_values = [10**-7, 10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1]
parameters = [{'kernel': ['rbf'], 'gamma': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0], 'C': svm_C_values},
              {'kernel': ['poly'], 'degree': [2, 3], 'C': svm_C_values},
              {'kernel': ['linear'], 'C': svm_C_values}]

#Each trial
for i in range(3):
    #Shuffle seeded with the trial index: every trial gets a different split, yet a
    #Restart & Run All reproduces exactly the same three trials.
    data_svm = shuffle(adult_arr, random_state=i)

    #Data splitting: features are all columns but the last, the label is the last column.
    X_svm = data_svm[:, 0:-1]
    Y_svm = data_svm[:, -1]
    #train_size=5000 states the intended training-set size directly instead of the
    #fraction 25161/30161, which only yields 5000 rows when the data has exactly 30161 rows.
    X_train_svm, X_test_svm, Y_train_svm, Y_test_svm = train_test_split(
        X_svm, Y_svm, train_size=5000, random_state=42, stratify=Y_svm)

    #Scaling training data using StandardScaler (fitted on the training split only).
    #NOTE(review): the scaler sees all CV folds before GridSearchCV runs; putting it in a
    #Pipeline would avoid that, at the cost of prefixed names in best_params_.
    scaler_svm = preprocessing.StandardScaler().fit(X_train_svm)
    X_train_svm = scaler_svm.transform(X_train_svm)

    #SVC class, grid search, and fit training data and training labels
    svc = svm.SVC(gamma='auto')
    grid_search_svm = GridSearchCV(svc, parameters, cv=5, error_score=np.nan)
    grid_search2_svm = grid_search_svm.fit(X_train_svm, Y_train_svm)

    #Printing and storing best params into list.
    best_params_svm = grid_search2_svm.best_params_
    print("Best params: {}".format(best_params_svm))
    SVM_best_params.append('Trial ' + str(i + 1) + ': ' + str(best_params_svm))

    #Best estimator is already refitted on the whole training split, so .predict can be used directly.
    #Printing and storing training accuracy into list.
    train_predictions_svm = grid_search2_svm.best_estimator_.predict(X_train_svm)
    train_accuracy_svm = prediction(train_predictions_svm, Y_train_svm)
    print("Train accuracy: {}".format(train_accuracy_svm))
    SVM_train_accuracy.append('Trial ' + str(i + 1) + ': ' + str(train_accuracy_svm))

    #Scaling testing data with the training-data scaler, so both splits are scaled the same way.
    #Printing and storing test accuracy into list.
    X_test_svm = scaler_svm.transform(X_test_svm)
    test_predictions_svm = grid_search2_svm.best_estimator_.predict(X_test_svm)
    test_accuracy_svm = prediction(test_predictions_svm, Y_test_svm)
    print("Test accuracy: {}".format(test_accuracy_svm))
    SVM_test_accuracy.append('Trial ' + str(i + 1) + ': ' + str(test_accuracy_svm))

Best params: {'C': 0.1, 'kernel': 'linear'}
Train accuracy: 0.8344
Test accuracy: 0.8275903183498271
Best params: {'C': 0.1, 'kernel': 'linear'}
Train accuracy: 0.8314
Test accuracy: 0.8254838837884027
Best params: {'C': 0.1, 'kernel': 'linear'}
Train accuracy: 0.838
Test accuracy: 0.826437740948293


# kNN

In [6]:
#Three lists to store training accuracy for each trial, test accuracy for each trial, and best parameters for each trial.
kNN_train_accuracy = []
kNN_test_accuracy = []
kNN_best_params = []

#Setting K parameters: 25 evenly spaced values from 1 to 500, truncated to integers.
p = np.linspace(1, 500, 25).astype('int64')

#Param-grid. Defined once because it is identical for every trial.
params = [{'weights': ['uniform', 'distance'], 'metric': ['minkowski'], 'n_neighbors': p}]

#For Each Trial
for i in range(3):
    #Shuffle seeded with the trial index: every trial gets a different split, yet a
    #Restart & Run All reproduces exactly the same three trials.
    data_knn = shuffle(adult_arr, random_state=i)

    #Data splitting: features are all columns but the last, the label is the last column.
    X_knn = data_knn[:, 0:-1]
    Y_knn = data_knn[:, -1]
    #train_size=5000 states the intended training-set size directly instead of the
    #fraction 25161/30161, which only yields 5000 rows when the data has exactly 30161 rows.
    X_train_knn, X_test_knn, Y_train_knn, Y_test_knn = train_test_split(
        X_knn, Y_knn, train_size=5000, random_state=42, stratify=Y_knn)

    #Scaling training data using StandardScaler (fitted on the training split only).
    scaler_knn = preprocessing.StandardScaler().fit(X_train_knn)
    X_train_knn = scaler_knn.transform(X_train_knn)

    #Initializing KNN class and fitting with gridsearch
    #Printing and storing best params in list
    neighbor = KNeighborsClassifier()
    grid_search_knn = GridSearchCV(neighbor, params, cv=5, error_score=np.nan)
    grid_search_knn2 = grid_search_knn.fit(X_train_knn, Y_train_knn)
    best_params_knn = grid_search_knn2.best_params_
    print("Best params: {}".format(best_params_knn))
    kNN_best_params.append('Trial ' + str(i + 1) + ': ' + str(best_params_knn))

    #Predicting training data with best_estimator and calculating training accuracy.
    #With weights='distance' every training point is its own zero-distance neighbour, so a
    #training accuracy of 1.0 is expected and says nothing about generalisation.
    #Printing and storing training accuracy in list.
    train_predictions_knn = grid_search_knn2.best_estimator_.predict(X_train_knn)
    train_accuracy_knn = prediction(train_predictions_knn, Y_train_knn)
    print("Train accuracy: {}".format(train_accuracy_knn))
    kNN_train_accuracy.append('Trial ' + str(i + 1) + ': ' + str(train_accuracy_knn))

    #Scaling testing data with the training-data scaler, so both splits are scaled the same way.
    #Printing and storing test accuracy into list.
    X_test_knn = scaler_knn.transform(X_test_knn)
    test_predictions_knn = grid_search_knn2.best_estimator_.predict(X_test_knn)
    test_accuracy_knn = prediction(test_predictions_knn, Y_test_knn)
    print("Test accuracy: {}".format(test_accuracy_knn))
    kNN_test_accuracy.append('Trial ' + str(i + 1) + ': ' + str(test_accuracy_knn))

Best params: {'metric': 'minkowski', 'n_neighbors': 416, 'weights': 'distance'}
Train accuracy: 1.0
Test accuracy: 0.8241325861452248
Best params: {'metric': 'minkowski', 'n_neighbors': 458, 'weights': 'distance'}
Train accuracy: 1.0
Test accuracy: 0.8241723301935535
Best params: {'metric': 'minkowski', 'n_neighbors': 312, 'weights': 'distance'}
Train accuracy: 1.0
Test accuracy: 0.8200786932156909


# Random Forest-Transformed Data

In [9]:
#Three lists to store training accuracy for each trial, test accuracy for each trial, and best parameters for each trial.
rf_train_accuracy = []
rf_test_accuracy = []
rf_best_params = []

#Param-grid. Defined once because it is identical for every trial.
params = [{'n_estimators': [1024], 'max_features': [1, 2, 4, 6, 8, 12, 16, 20]}]

#For each trial
for i in range(3):
    #Shuffle seeded with the trial index: every trial gets a different split, yet a
    #Restart & Run All reproduces exactly the same three trials.
    data_rf = shuffle(adult_arr, random_state=i)

    #Data splitting: features are all columns but the last, the label is the last column.
    X_rf = data_rf[:, 0:-1]
    Y_rf = data_rf[:, -1]
    #train_size=5000 states the intended training-set size directly instead of the
    #fraction 25161/30161, which only yields 5000 rows when the data has exactly 30161 rows.
    #The split is stratified like the SVM and kNN splits so the three models are compared
    #on splits with the same class balance.
    X_train_rf, X_test_rf, Y_train_rf, Y_test_rf = train_test_split(
        X_rf, Y_rf, train_size=5000, random_state=42, stratify=Y_rf)

    #Initialize random forest class and fit training data with grid search.
    #No scaling here, unlike the SVM and kNN cells.
    #random_state makes the forests themselves reproducible across reruns.
    forest = RandomForestClassifier(random_state=42)
    grid_search_rf = GridSearchCV(forest, params, cv=5, error_score=np.nan)
    grid_search_rf2 = grid_search_rf.fit(X_train_rf, Y_train_rf)

    #Printing and storing best params into list.
    best_params_rf = grid_search_rf2.best_params_
    print("Best params: {}".format(best_params_rf))
    rf_best_params.append('Trial ' + str(i + 1) + ': ' + str(best_params_rf))

    #Using best_estimator to predict training data.
    #Using defined prediction function to calculate total accuracy.
    #Printing and storing training accuracy into list.
    train_predictions_rf = grid_search_rf2.best_estimator_.predict(X_train_rf)
    train_accuracy_rf = prediction(train_predictions_rf, Y_train_rf)
    print("Train accuracy: {}".format(train_accuracy_rf))
    rf_train_accuracy.append('Trial ' + str(i + 1) + ': ' + str(train_accuracy_rf))

    #Predicting test data with best_estimator.predict.
    #Using prediction function to calculate accuracy.
    #Printing and storing test accuracy into list.
    test_predictions_rf = grid_search_rf2.best_estimator_.predict(X_test_rf)
    test_accuracy_rf = prediction(test_predictions_rf, Y_test_rf)
    print("Test accuracy: {}".format(test_accuracy_rf))
    rf_test_accuracy.append('Trial ' + str(i + 1) + ': ' + str(test_accuracy_rf))

Best params: {'max_features': 16, 'n_estimators': 1024}
Train accuracy: 1.0
Test accuracy: 0.8247684909184849
Best params: {'max_features': 8, 'n_estimators': 1024}
Train accuracy: 1.0
Test accuracy: 0.8231787289853345
Best params: {'max_features': 20, 'n_estimators': 1024}
Train accuracy: 1.0
Test accuracy: 0.8244902825801836


# Random Forest-Original Data

In [9]:
# NOTE(review): this whole cell is disabled (commented-out) code for a random-forest run on the
# un-encoded dataframe df1. Before re-enabling it, check the following:
#   * the "Best params" print below formats `best_params`, a name that is not defined in the
#     visible cells; `best_params_rf2` is presumably what was meant;
#   * it assigns `grid_search_rf2`, the name that already holds the fitted search in the
#     previous random-forest cell;
#   * df1_arr is built from df1 without any encoding, so it presumably still contains string
#     columns -- confirm the classifier can be fitted on it.
# rf2_train_accuracy = []
# rf2_test_accuracy = []
# rf2_best_params = []
# df1 = df1.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# df1_arr = df1.to_numpy()

# for i in range(1):
#     data_rf2 = shuffle(df1_arr)
#     X_rf2 = data_rf2[:, 0:-1]
#     Y_rf2 = data_rf2[:, -1]
#     X_train_rf2, X_test_rf2, Y_train_rf2, Y_test_rf2 = train_test_split(X_rf2, Y_rf2, test_size = 25161/30161, random_state=42)
#     params_rf2 = [{'n_estimators' : [1024], 'max_features' : [1, 2, 4, 6, 8, 12, 16, 20]}]
#     forest_rf2 = RandomForestClassifier()
#     grid_search_rf2 = GridSearchCV(forest_rf2, params_rf2, cv=5, error_score = np.nan)
#     grid_search_rf3 = grid_search_rf2.fit(X_train_rf2, Y_train_rf2)
#     best_params_rf2 = grid_search_rf3.best_params_
#     print("Best params: {}".format(best_params))
#     rf2_best_params.append('Trial ' + str(i + 1) + ': ' + str(best_params_rf2))
#     train_predictions_rf2 = grid_search_rf3.best_estimator_.predict(X_train_rf2)
#     train_accuracy_rf2 = prediction(train_predictions_rf2, Y_train_rf2)
#     print("Train accuracy: {}".format(train_accuracy_rf2))
#     rf2_train_accuracy.append('Trial ' + str(i + 1) + ': ' + str(train_accuracy_rf2))
#     test_predictions_rf2 = grid_search_rf3.best_estimator_.predict(X_test_rf2)
#     test_accuracy_rf2 = prediction(test_predictions_rf2, Y_test_rf2)
#     print("Test accuracy: {}".format(test_accuracy_rf2))
#     rf2_test_accuracy.append('Trial ' + str(i + 1) + ': ' + str(test_accuracy_rf2))