<a href="https://colab.research.google.com/github/shivan222/census-data/blob/main/Census_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
!pip install -q sklearn

In [25]:
# Colab line magic requesting the TensorFlow 2.x runtime.
# NOTE(review): this magic is supplied by Colab rather than stock IPython, so the
# cell presumably fails on a plain Jupyter kernel — confirm before running elsewhere.
%tensorflow_version 2.x

In [26]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

import tensorflow.compat.v2.feature_column as fc

import tensorflow as tf

In [27]:
# Fetch the training and evaluation splits of the census data straight from GitHub.
dftrain = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/shivan222/census-data/main/census_train.csv',
)
dfeval = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/shivan222/census-data/main/census_eval.csv',
)

# Split off the label: pop() removes the 'over50k' column from each frame and
# returns it, so dftrain/dfeval hold only the feature columns afterwards.
y_train = dftrain.pop('over50k')
y_eval = dfeval.pop('over50k')

In [28]:
# Input columns, grouped by how they are encoded for the model.
# NOTE(review): the rows printed at the end of the notebook also carry an
# `hours_per_week` column that appears in neither list — confirm the omission is intentional.
CATEGORICAL_COLUMNS = [
    'workclass', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'native_country',
]
NUMERIC_COLUMNS = ['age', 'education_years']

# Categorical features: each vocabulary is the set of distinct values that the
# column takes in the training frame.
feature_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        column, dftrain[column].unique())
    for column in CATEGORICAL_COLUMNS
]

# Numeric features follow the categorical ones and are declared as float32.
feature_columns += [
    tf.feature_column.numeric_column(column, dtype=tf.float32)
    for column in NUMERIC_COLUMNS
]

In [29]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32,
                  shuffle_buffer_size=1000, seed=None):
  """Build a zero-argument input function that yields batched (features, labels).

  Args:
    data_df: feature table; `dict(data_df)` must produce a mapping from column
      name to that column's values (e.g. a pandas DataFrame).
    label_df: labels, aligned row-for-row with `data_df`.
    num_epochs: how many times the batched dataset is repeated.
    shuffle: whether to shuffle the examples before batching.
    batch_size: number of examples per batch.
    shuffle_buffer_size: size of the shuffle buffer. Defaults to 1000, the value
      that used to be hard-coded. A buffer smaller than the dataset only
      shuffles approximately, so raise it for larger datasets.
    seed: optional integer seed for the shuffle; None (the default) leaves the
      shuffle unseeded, exactly as before.

  Returns:
    A callable taking no arguments that builds and returns the
    `tf.data.Dataset`. The dataset is only constructed when the callable is
    invoked, not when `make_input_fn` is called.
  """
  def input_function():
    # One dataset element per row: ({column name: value}, label).
    ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
      ds = ds.shuffle(shuffle_buffer_size, seed=seed)
    # Batch first, then repeat, so every epoch is split into the same batch sizes.
    ds = ds.batch(batch_size).repeat(num_epochs)
    return ds
  return input_function

# Training input: shuffled, using the factory's default epoch count and batch size.
train_input_fn = make_input_fn(data_df=dftrain, label_df=y_train)
# Evaluation input: a single, unshuffled pass so predictions keep the row order of dfeval.
eval_input_fn = make_input_fn(data_df=dfeval, label_df=y_eval, shuffle=False, num_epochs=1)

In [30]:
# Canned linear classifier over the feature columns defined above. No model
# directory is passed, so checkpoints go to a temporary directory (see the
# `_model_dir` entry in the logged config).
linear_est = tf.estimator.LinearClassifier(
    feature_columns=feature_columns,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp8bi3r3d6', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [31]:
# Fit the classifier on the training input. train() returns the estimator
# itself, which is why the cell echoes its repr after the training log.
linear_est.train(input_fn=train_input_fn)

INFO:tensorflow:Calling model_fn.




INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp8bi3r3d6/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.6931472, step = 0
INFO:tensorflow:global_step/sec: 255.894
INFO:tensorflow:loss = 0.30316022, step = 100 (0.392 sec)
INFO:tensorflow:global_step/sec: 582.157
INFO:tensorflow:loss = 0.3517918, step = 200 (0.175 sec)
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 220...
INFO:tensorflow:Saving checkpoints for 220 into /tmp/tmp8bi3r3d6/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 220...
INFO:tensorflow:Loss for final step: 0.7636061.


<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x7f04e25f1fd0>

In [32]:
# Score the trained model on the evaluation input; `result` is a dict of
# metrics (accuracy, auc, precision, recall, loss, ...).
result = linear_est.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Calling model_fn.




INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-07-15T15:47:08
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp8bi3r3d6/model.ckpt-220
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 0.83572s
INFO:tensorflow:Finished evaluation at 2021-07-15-15:47:09
INFO:tensorflow:Saving dict for global step 220: accuracy = 0.8109756, accuracy_baseline = 0.7743902, auc = 0.82634604, auc_precision_recall = 0.5208013, average_loss = 0.4216292, global_step = 220, label/mean = 0.22560975, loss = 0.4328747, precision = 0.6363636, prediction/mean = 0.18268928, recall = 0.3783784
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 220: /tmp/tmp8bi3r3d6/model.ckpt-220


In [33]:
# Show the evaluation metrics returned by `linear_est.evaluate`.
# (No `clear_output()` here: it only clears the output area of the cell that
# calls it, which is still empty at this point, so it cannot remove the
# evaluation log printed by the previous cell.)
print(result)

{'accuracy': 0.8109756, 'accuracy_baseline': 0.7743902, 'auc': 0.82634604, 'auc_precision_recall': 0.5208013, 'average_loss': 0.4216292, 'label/mean': 0.22560975, 'loss': 0.4328747, 'precision': 0.6363636, 'prediction/mean': 0.18268928, 'recall': 0.3783784, 'global_step': 220}


In [43]:
# Per-example predictions for the evaluation set, in the row order of dfeval
# (eval_input_fn does not shuffle).
# NOTE: this rebinds `result`, which until now held the evaluation metrics dict.
result = list(linear_est.predict(eval_input_fn))

for i in range(60, 65):
  # `result` is a plain list, so `i` is a row *position*. Use positional
  # (.iloc) lookups on the pandas objects as well, so features, label and
  # prediction stay aligned even if the index is not the default 0..n-1 range.
  Over50k = "no" if y_eval.iloc[i] == 0 else "yes"

  print("\nperson #" + str(i))
  print(dfeval.iloc[i])
  print("\nProbability of salary being Over $50k: ")
  # Index 1 of 'probabilities' is the predicted probability of class 1 (over50k).
  print(result[i]['probabilities'][1])
  print("\nIs their salary actually over $50k? " + Over50k)
  print("\n~~~~~~~~~~~~~~~")


INFO:tensorflow:Calling model_fn.




INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp8bi3r3d6/model.ckpt-220
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

person #60
age                            36
workclass                 Private
education            Some-college
education_years                10
marital_status      Never-married
occupation           White-Collar
relationship        Not-in-family
race                        Black
sex                        Female
hours_per_week                 36
native_country      United-States
Name: 60, dtype: object

Probability of salary being Over $50k: 
0.032599315

Is their salary actually over $50k? no

~~~~~~~~~~~~~~~

person #61
age                                 43
workclass                    State-gov
education                    Bachelors
education_years                     13
marital_status      Married-civ-spouse
occupation                White-Collar
r