In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [2]:
# Read the census CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='census.csv')

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [4]:
!pip install pandas_profiling



In [5]:
import pandas_profiling as pp

In [6]:
# Build a profiling report over the whole frame, titled 'Data Assessment'.
report = pp.ProfileReport(data, title='Data Assessment')
# Save the report to an HTML file (relative path, so it lands in the working directory).
report.to_file('data_assessment.html')
# Trailing bare expression so the notebook displays the report as the cell output.
report



In [21]:
# Show the dtype of every column in the frame.
data.dtypes

age                  int64
workclass           object
education_level     object
education-num      float64
marital-status      object
occupation          object
relationship        object
race                object
sex                 object
capital-gain       float64
capital-loss       float64
hours-per-week     float64
native-country      object
income              object
dtype: object

In [8]:
# List the column labels of the frame.
data.columns

Index(['age', 'workclass', 'education_level', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [9]:
# Columns selected as continuous features.
cont_names = [
    'age',
    'hours-per-week',
]

# Columns selected as categorical features.
cat_names = [
    'workclass',
    'education_level',
    'marital-status',
    'race',
    'sex',
    'occupation',
]

In [10]:
# Continuous features are taken from the frame without any transformation.
df_cont = data.loc[:, cont_names]
# Categorical features are cast to str and then expanded into indicator columns.
df_cat = pd.get_dummies(data.loc[:, cat_names].astype(str))

In [11]:
# Expand the income label into one indicator column per distinct value.
y2 = pd.get_dummies(data.loc[:, 'income'])
y2

Unnamed: 0,<=50K,>50K
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
45217,1,0
45218,1,0
45219,1,0
45220,1,0


In [12]:
# Drop the '<=50K' indicator so the remaining column(s) serve as the target.
Y = y2.drop(columns=['<=50K'])
Y

Unnamed: 0,>50K
0,0
1,0
2,0
3,0
4,0
...,...
45217,0
45218,0
45219,0
45220,0


In [13]:
# Feature matrix: encoded categoricals followed by the continuous columns, side by side.
X = pd.concat(objs=[df_cat, df_cont], axis='columns')

In [14]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=42,
)

In [17]:
# Logistic-regression classifier; no arguments are passed, so every hyperparameter
# keeps its library default.
model = LogisticRegression()

In [18]:
# Y_train is a single-column DataFrame, but scikit-learn expects a 1-D target and
# otherwise emits a DataConversionWarning (column_or_1d). Flatten it explicitly;
# the fitted model is the same, only the warning goes away.
model.fit(X_train, Y_train.values.ravel())

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
# Mean accuracy of the logistic-regression model on the held-out split.
accuracy = model.score(X=X_test, y=Y_test)
accuracy

0.8325501686107579

In [20]:
# Class predictions of the logistic-regression model for the held-out rows.
Y_pred = model.predict(X=X_test)
Y_pred

array([1, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [38]:
# Confusion matrix for the current Y_pred (rows: true class, columns: predicted class).
confusion = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
confusion

array([[12605,  1031],
       [ 1998,  2455]], dtype=int64)

In [39]:
# Recall of the current Y_pred against the held-out labels.
recall = recall_score(y_true=Y_test, y_pred=Y_pred)
recall

0.5513137210869077

In [40]:
# Precision of the current Y_pred against the held-out labels.
precision = precision_score(y_true=Y_test, y_pred=Y_pred)
precision

0.704245553643144

In [41]:
# Gradient-boosted tree classifier; no arguments are passed, so every hyperparameter
# keeps its library default.
model2 = XGBClassifier()

In [42]:
# Train the XGBoost classifier on the same training split used for the first model.
model2.fit(X=X_train, y=Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [43]:
# Mean accuracy of the XGBoost model on the held-out split.
accuracy2 = model2.score(X=X_test, y=Y_test)
# NOTE: this rebinds Y_pred, replacing the logistic-regression predictions; any cell
# that reads Y_pred after this point evaluates the XGBoost model.
Y_pred = model2.predict(X_test)
accuracy2

0.8387970589861241

In [44]:
# Confusion matrix for the current Y_pred (rows: true class, columns: predicted class).
confusion2 = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
confusion2

array([[12761,   875],
       [ 2041,  2412]], dtype=int64)

In [45]:
# Precision of the current Y_pred against the held-out labels.
precision2 = precision_score(y_true=Y_test, y_pred=Y_pred)
precision2

0.7337998174627319

In [46]:
# Recall of the current Y_pred against the held-out labels.
recall2 = recall_score(y_true=Y_test, y_pred=Y_pred)
recall2

0.5416573096788682