In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import linear_model
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Names given to the CSV columns, assigned positionally by read_csv(names=COLUMNS).
# The last entry, `label`, is the target column.
COLUMNS = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_week',
    'native_country',
    'label',
]

In [None]:
# index_col=False: do not use the first column of the file as the row index.
# skipinitialspace=True: ignore the blank that follows each delimiter.
df_train = pd.read_csv(
    'Adult.csv',
    names=COLUMNS,
    index_col=False,
    skipinitialspace=True,
)
# The test file is read the same way, except that its first line is skipped.
df_test = pd.read_csv(
    'Adult_Test.csv',
    names=COLUMNS,
    index_col=False,
    skipinitialspace=True,
    skiprows=1,
)

In [None]:
# Preview the first rows. The target column (`label`) holds categorical
# strings, so it has to be encoded as numbers before modelling.
df_train.head()


In [None]:
# Row/column counts of both frames, then the dtype pandas inferred for each training column.
print(df_train.shape, df_test.shape)
print(df_train.dtypes)

In [None]:
# Encode the target as an integer: '<=50K' -> 0 and '>50K' -> 1.
# The test-set dictionary uses the same classes written with a trailing full
# stop. (pd.get_dummies would be an alternative way to encode the column.)
label = {'<=50K': 0, '>50K': 1}
label_t = {'<=50K.': 0, '>50K.': 1}


def _encode_label(frame, mapping):
    """Replace the string `label` column of `frame` by its integer code, in place.

    Args:
        frame: DataFrame that has a `label` column.
        mapping: dict from raw label string to integer code.

    Raises:
        KeyError: if the column holds a value that is not a key of `mapping`.
    """
    if pd.api.types.is_numeric_dtype(frame['label']):
        # Already encoded: re-running the cell is a no-op instead of a KeyError.
        return
    encoded = frame['label'].map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        # Series.map yields NaN for unknown keys; fail loudly like a dict lookup would.
        raise KeyError('unexpected label value(s): {}'.format(
            frame.loc[unmapped, 'label'].unique().tolist()))
    frame['label'] = encoded.astype('int64')


_encode_label(df_train, label)
_encode_label(df_test, label_t)

In [None]:
# Only the last expression of a cell is rendered automatically, so both
# previews are displayed explicitly to check the encoded `label` column.
display(df_train.head(), df_test.head())

In [None]:
# Treat '?' as a missing value: turn it into NaN, then discard every row
# that contains at least one NaN. Both frames get the same treatment.
df_train = df_train.replace(to_replace='?', value=np.nan).dropna()
df_test = df_test.replace(to_replace='?', value=np.nan).dropna()

In [None]:
# Summary of the test frame after the rows containing '?' were dropped.
df_test.info()

In [None]:
# Categorical columns that get one-hot encoded (native_country is not among them).
cat_vars = ['workclass', 'education', 'marital', 'occupation', 'relationship', 'race', 'sex']
cat_vars1 = list(cat_vars)  # same content; name kept in case another cell refers to it


def _one_hot(frame, columns):
    """Return the one-hot indicator columns for `columns` of `frame`.

    Each column is encoded on its own with pd.get_dummies and prefixed with
    its own name (e.g. `relationship_Husband`); the pieces are concatenated
    side by side in the order given.
    """
    return pd.concat(
        [pd.get_dummies(frame[col], prefix=col) for col in columns],
        axis=1,
    )


train_dummies = _one_hot(df_train, cat_vars)
# Encode the test set, then force it onto the training indicator columns:
# categories absent from the test rows become all-zero columns and categories
# never seen in training are dropped, so both frames share one feature space.
test_dummies = _one_hot(df_test, cat_vars).reindex(
    columns=train_dummies.columns, fill_value=0)

# Append the indicators once per frame; the raw string columns are still present here.
df_train = pd.concat([df_train, train_dummies], axis=1)
df_test = pd.concat([df_test, test_dummies], axis=1)

In [None]:
# Column list after one-hot encoding: the original columns followed by the indicator columns.
df_train.columns

In [None]:
# Preview the widened training frame.
df_train.head()

In [None]:
# Column listing again; df_train has not been modified since the previous listing.
df_train.columns

In [None]:
# Preview again; df_train has not been modified since the previous preview.
df_train.head()

In [None]:
# Remove the raw string columns from the training frame now that the
# indicator columns exist. native_country is removed as well although it was
# never one-hot encoded, so the scikit-learn model does not see it at all.
# NOTE(review): FEATURES (used by get_input_fn further down) still lists these
# raw columns and the tf.estimator cells pass df_train to it, so those cells
# raise KeyError after this drop unless an un-encoded copy is kept.
df_train = df_train.drop(
    labels=[
        'workclass',
        'education',
        'relationship',
        'race',
        'sex',
        'native_country',
        'marital',
        'occupation',
    ],
    axis='columns',
)

In [None]:
# Remove the raw string columns from the test frame as well.
# NOTE(review): the tf.estimator evaluation cell passes df_test to
# get_input_fn, which reads these raw columns through FEATURES; after this
# drop that lookup raises KeyError unless an un-encoded copy is kept.
df_test = df_test.drop(
    labels=[
        'workclass',
        'education',
        'marital',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native_country',
    ],
    axis='columns',
)

In [None]:
# Columns left after the drop: the columns that were not removed plus the one-hot indicators.
df_train.columns

In [None]:
# Feature matrix (every column except the target) and target vector.
x = df_train.drop(['label'], axis=1)
y = df_train['label']
# Hold out 20% of the training frame; the fixed seed keeps the split reproducible.
# `xtext` holds the held-out features (name kept as-is because later cells reuse it).
xtrain, xtext, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=1)

# Logistic regression with default settings. The inverse regularisation
# strength is set through the upper-case `C` argument, e.g. LogisticRegression(C=4).
log_reg = linear_model.LogisticRegression()
log_reg.fit(xtrain, ytrain)
# score() returns the mean accuracy on the held-out part. It is printed because
# a bare expression in the middle of a cell is evaluated and then discarded.
print('accuracy:', log_reg.score(xtext, ytest))

# Confusion matrix of true classes against predicted classes.
ypred = log_reg.predict(xtext)
print(confusion_matrix(ytest, ypred))

# Per-class precision, recall and F1.
print(classification_report(ytest, ypred))


In [None]:
# Rebuild the target vector and the feature matrix (all columns except `label`).
y = df_train['label']
x = df_train.drop(labels=['label'], axis='columns')

In [None]:
# 80/20 split with a fixed seed, so the partition is the same on every run
# for the same x and y.
xtrain, xtext, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Fit a default logistic regression on the training part; the fitted
# estimator is the cell's displayed value.
log_reg = linear_model.LogisticRegression()
log_reg.fit(X=xtrain, y=ytrain)

In [None]:
# Column-by-column summary of the feature matrix passed to scikit-learn.
x.info()

In [None]:
# (rows, columns) of the feature matrix.
x.shape

In [None]:
# Length of the target vector; it has to match the row count of x.
y.shape

In [None]:
# Class counts of the encoded target in the training frame.
print(df_train["label"].value_counts())
# The classes are unbalanced, so always predicting the majority class already
# yields a high accuracy (the original note puts that floor at about 70%);
# a useful model has to beat that baseline.
print(df_test["label"].value_counts())
# Column dtypes of the training frame at this point of the notebook.
print(df_train.dtypes)

In [None]:
# Columns treated as continuous (numeric) inputs.
CONTI_FEATURES = [
    'age',
    'fnlwgt',
    'capital_gain',
    'education_num',
    'capital_loss',
    'hours_week',
]
# Columns treated as categorical inputs.
CATE_FEATURES = [
    'workclass',
    'education',
    'marital',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]

In [None]:
def print_transformation(feature, continuous = True, size = 2):
    """Print the dense tensor that a feature column produces for one column of df_train.

    Args:
        feature: name of a column of the notebook-level `df_train` frame.
        continuous: if true, the column is wrapped in a numeric column;
            otherwise it is hashed into buckets and exposed through an
            indicator column.
        size: number of hash buckets; only used when `continuous` is false.

    Returns:
        None. The evaluated input layer is printed.
    """
    # input_layer takes a mapping from feature name to that feature's values.
    features = {feature: df_train[feature]}

    if continuous:
        column = tf.feature_column.numeric_column(feature)
    else:
        hashed = tf.feature_column.categorical_column_with_hash_bucket(
            feature, hash_bucket_size=size)
        # The categorical column is wrapped in an indicator column before it is fed to input_layer.
        column = tf.feature_column.indicator_column(hashed)

    input_layer = tf.feature_column.input_layer(
        features=features,
        feature_columns=[column],
    )

    # The context manager closes the session (and frees its resources) once
    # the value has been printed, instead of leaving one open per call.
    with tf.Session() as sess:
        print(sess.run(input_layer))


print_transformation(feature = "age", continuous = True)

In [None]:
# One numeric feature column per name in CONTI_FEATURES, in the same order.
continuous_features = list(map(tf.feature_column.numeric_column, CONTI_FEATURES))

In [None]:
# Display the numeric feature columns (the stray trailing dot was a SyntaxError).
continuous_features

In [None]:
# Vocabulary-based categorical column for `relationship`, listing its values
# explicitly. It is not part of `categorical_features`, which hashes every
# categorical column instead.
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    key='relationship',
    vocabulary_list=[
        'Husband',
        'Not-in-family',
        'Wife',
        'Own-child',
        'Unmarried',
        'Other-relative',
    ],
)

In [None]:
# One hashed categorical column (1000 buckets) per name in CATE_FEATURES.
categorical_features = [
    tf.feature_column.categorical_column_with_hash_bucket(key=name, hash_bucket_size=1000)
    for name in CATE_FEATURES
]

In [None]:
# Display the hashed categorical feature columns.
categorical_features

## Logistic Regression

In [None]:
# Two-class linear classifier over the categorical and continuous feature
# columns. The earlier LinearRegressor variant is left out: the target is a
# 0/1 class, not a continuous value.
# NOTE(review): model_dir presumably keeps checkpoints between runs, so a
# previous fit may be resumed; clear "ongoing/train" for a fresh start (confirm).
model = tf.estimator.LinearClassifier(
    feature_columns=categorical_features + continuous_features,
    n_classes=2,
    model_dir="ongoing/train",
)

In [None]:
# Raw input columns read by the estimator input function, and the target column.
FEATURES = ['age','workclass', 'fnlwgt', 'education', 'education_num', 'marital', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_week', 'native_country']
LABEL= 'label'


def get_input_fn(data_set, num_epochs=None, n_batch = 128, shuffle=True):
    """Build a tf.estimator input function that feeds `data_set` in batches.

    Args:
        data_set: DataFrame holding every column named in FEATURES plus the
            LABEL column. The FEATURES columns are read as they are, so the
            frame must still contain the raw (not one-hot encoded or dropped)
            columns.
        num_epochs: forwarded to pandas_input_fn as `num_epochs`.
        n_batch: forwarded to pandas_input_fn as `batch_size`.
        shuffle: forwarded to pandas_input_fn as `shuffle`.

    Returns:
        The input function created by tf.estimator.inputs.pandas_input_fn.

    Raises:
        KeyError: if `data_set` lacks any column of FEATURES or LABEL.
    """
    # Check every required column up front so that one message names all of
    # the missing ones, instead of failing on the first lookup only.
    missing = [name for name in FEATURES + [LABEL] if name not in data_set.columns]
    if missing:
        raise KeyError(
            'data_set is missing required column(s) {}; pass a frame that '
            'still has the raw feature columns'.format(missing))

    return tf.estimator.inputs.pandas_input_fn(
        # .values discards the index, so features and label are re-aligned by position.
        x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
        y=pd.Series(data_set[LABEL].values),
        batch_size=n_batch,
        num_epochs=num_epochs,
        shuffle=shuffle)

In [None]:
# Train for 1000 steps with batches of 128 rows, fed in frame order (no shuffling).
# NOTE(review): df_train must still contain the raw columns listed in FEATURES
# at this point, otherwise get_input_fn fails with KeyError.
model.train(
    input_fn=get_input_fn(df_train, num_epochs=None, n_batch=128, shuffle=False),
    steps=1000,
)

In [None]:
# Evaluate on the test frame: one ordered pass over the data (num_epochs=1,
# no shuffling) in batches of 128 rows, with at most 1000 evaluation steps.
model.evaluate(
    input_fn=get_input_fn(df_test, num_epochs=1, n_batch=128, shuffle=False),
    steps=1000,
)