In [14]:
%tensorflow_version 2.x
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
import tensorflow as tf

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [15]:
from google.colab import files 
traindata = files.upload()
testdata = files.upload()

Saving adult.data to adult (1).data


Saving adult.test to adult (1).test


In [16]:
#naming columns - unnecessary columns labeled as r0 through r3 
columns = ['age', 'workclass', 'r0', 'education', 'r1', 'marital_status', 'occupation', 'relationship', 'ethnicity', 'gender', 'r2', 'r3', 'hrs_per_week', 'native_country', 'income'] 


#read csv files into panda dataframe
from io import StringIO 
traindata = pd.read_csv(StringIO(traindata['adult.data'].decode('utf-8')), header = None, names = columns) 
testdata = pd.read_csv(StringIO(testdata['adult.test'].decode('utf-8')), header = None, names = columns) 

#removing 1st row of testdata which contains title 
testdata.drop(0, inplace = True) 
testdata = testdata.reset_index()

print(traindata.head())
print(testdata.head())

   age          workclass      r0   education  r1       marital_status  \
0   39          State-gov   77516   Bachelors  13        Never-married   
1   50   Self-emp-not-inc   83311   Bachelors  13   Married-civ-spouse   
2   38            Private  215646     HS-grad   9             Divorced   
3   53            Private  234721        11th   7   Married-civ-spouse   
4   28            Private  338409   Bachelors  13   Married-civ-spouse   

           occupation    relationship ethnicity   gender    r2  r3  \
0        Adm-clerical   Not-in-family     White     Male  2174   0   
1     Exec-managerial         Husband     White     Male     0   0   
2   Handlers-cleaners   Not-in-family     White     Male     0   0   
3   Handlers-cleaners         Husband     Black     Male     0   0   
4      Prof-specialty            Wife     Black   Female     0   0   

   hrs_per_week  native_country  income  
0            40   United-States   <=50K  
1            13   United-States   <=50K  
2       

In [17]:
'''
Data Cleaning 
'''

#concatenating training and testing data as one for data cleaning, will be separated later 
fulldata = pd.concat([traindata, testdata]) 


#dropping repetitive columns
fulldata = fulldata.drop(['r0', 'r1', 'r2', 'r3'], axis = 1)


#labels columns should not be categorical, replace with numerical 
#I spent hours trying to debug this !! Especially as some elements had a period after them 
fulldata['income'] = fulldata['income'].replace({' <=50K': 0, ' >50K': 1})
fulldata['income'] = fulldata['income'].replace({' <=50K.': 0, ' >50K.': 1})


#checking that each column contains elements of the same dtype 
for x in fulldata.columns: 
  print(x, pd.Series([type(y) for y in fulldata[x]]).value_counts()) 

#as age contains a mix of int and str, convert all to int 
fulldata['age'] = pd.to_numeric(fulldata['age']) 
  


#checking for empty values
for x in fulldata.columns : 
  print(pd.isna(fulldata[x]).value_counts()) 


#separting columns into categorical and numerical 
numerical_var = ['age', 'hrs_per_week'] 
categorical_var = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'ethnicity', 'gender', 'native_country']

#filling in empty values - mean value for numerical, most occurring for categorical 
for x in numerical_var:
  fulldata[x].fillna(fulldata[x].mean()) 

for x in categorical_var: 
  fulldata[x].fillna(fulldata[x].value_counts().index[0]) 


#separating dataset into training and testing data again 
traindata = fulldata[0: len(traindata)] 
testdata = fulldata[len(traindata):len(fulldata)] 

#separating features and labels
traineval = traindata.pop('income')
testeval = testdata.pop('income')


age <class 'int'>    32561
<class 'str'>    16281
dtype: int64
workclass <class 'str'>    48842
dtype: int64
education <class 'str'>    48842
dtype: int64
marital_status <class 'str'>    48842
dtype: int64
occupation <class 'str'>    48842
dtype: int64
relationship <class 'str'>    48842
dtype: int64
ethnicity <class 'str'>    48842
dtype: int64
gender <class 'str'>    48842
dtype: int64
hrs_per_week <class 'float'>    48842
dtype: int64
native_country <class 'str'>    48842
dtype: int64
income <class 'int'>    48842
dtype: int64
index <class 'float'>    48842
dtype: int64
False    48842
Name: age, dtype: int64
False    48842
Name: workclass, dtype: int64
False    48842
Name: education, dtype: int64
False    48842
Name: marital_status, dtype: int64
False    48842
Name: occupation, dtype: int64
False    48842
Name: relationship, dtype: int64
False    48842
Name: ethnicity, dtype: int64
False    48842
Name: gender, dtype: int64
False    48842
Name: hrs_per_week, dtype: int64
False    488

In [18]:
#creating feature columns 
feature_columns = [] 

for x in categorical_var: 
  vocabulary = []
  vocabulary = traindata[x].unique()
  feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(key = x, vocabulary_list= vocabulary))

for x in numerical_var: 
  feature_columns.append(tf.feature_column.numeric_column(key = x, dtype = tf.float32)) 

print(feature_columns) 


[VocabularyListCategoricalColumn(key='workclass', vocabulary_list=(' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov', ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay', ' Never-worked'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='education', vocabulary_list=(' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th', ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th', ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th', ' Preschool', ' 12th'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='marital_status', vocabulary_list=(' Never-married', ' Married-civ-spouse', ' Divorced', ' Married-spouse-absent', ' Separated', ' Married-AF-spouse', ' Widowed'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='occupation', vocabulary_list=(' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners', ' Prof-specialty', ' Other-service', ' Sal

In [19]:
#creating input function 
def input_fn(features, label, epochs = 20, training = True, batch = 32): 
  def input_fn_inner(): 
    input = tf.data.Dataset.from_tensor_slices((dict(features), label))
    if training: 
      input = input.shuffle(1000)
    input = input.batch(batch).repeat(epochs) 
    return input
  return input_fn_inner 
train_input_fn = input_fn(traindata, traineval) 
test_input_fn = input_fn(testdata, testeval, training = False)


In [20]:
#training and evaluating the model
model = tf.estimator.LinearClassifier(feature_columns = feature_columns)
model.train(test_input_fn) 
print(model.evaluate(test_input_fn)) 




{'accuracy': 0.8340397, 'accuracy_baseline': 0.76377374, 'auc': 0.88811594, 'auc_precision_recall': 0.6956861, 'average_loss': 0.354375, 'label/mean': 0.23622628, 'loss': 0.35430133, 'precision': 0.7459157, 'prediction/mean': 0.19466501, 'recall': 0.45111805, 'global_step': 10180}


In [21]:
#takes user input of candidate number to predict if salary is above/below 50K 
results_list = list(model.predict(test_input_fn)) 
subject_num = int(input('Please enter participant number you want to predict for: '))
classid = results_list[subject_num]['class_ids']
print(results_list[subject_num]['probabilities'][classid]) 

print(traindata.iloc[subject_num])
if classid == 0: 
  print("\nThis person's income is predicted to be below 50K. ") 
if classid == 1: 
  print("This person's income is predicted to be above 50K. ") 

if testeval[subject_num] == 0: 
  print('Actual income is below 50K. ') 
if testeval[subject_num] == 1: 
  print('Actual income is above 50K. ') 


#accuracy of model 
misprediction = 0
total = 0
for x in range(len(testeval)): 
  total = total + 1
  if testeval[x] != results_list[x]['class_ids']: 
    misprediction = misprediction + 1

print('\nAccuracy: ', (total - misprediction) / total)


Please enter participant number you want to predict for: 353
[0.90659326]
age                                46
workclass                   Local-gov
education                        11th
marital_status     Married-civ-spouse
occupation           Transport-moving
relationship                  Husband
ethnicity                       White
gender                           Male
hrs_per_week                     30.0
native_country          United-States
index                             NaN
Name: 353, dtype: object

This person's income is predicted to be below 50K. 
Actual income is above 50K. 

Accuracy:  0.8340396781524476
