## 1. Import Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training data from a CSV file in the working directory.
# NOTE(review): the preview printed under the next cell shows data values
# (39, State-gov, ..., <=50K) as the column labels, so the file appears to have
# no header row and its first record is being consumed as the header. Passing
# header=None with explicit names would keep that record, but later cells look
# up the target column by its current label ' <=50K', so they would have to be
# changed together with this line.
us = pd.read_csv('adult-training.csv')

In [3]:
# Preview the first rows of the freshly loaded frame (head() defaults to five).
us.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


### Data overview

In [64]:
import pandas_profiling

In [195]:
# Build a profiling report for the frame; it is the cell's last expression, so
# the notebook displays it as the cell output.
# NOTE(review): this cell's execution counter (195) is far ahead of the cells
# that follow, so the report may have been generated after later preprocessing
# had already modified `us` -- confirm with Restart & Run All.
pandas_profiling.ProfileReport(us)



## 2. Data Preprocessing

In [5]:
# Turn the ' ?' placeholder into a real missing value (NaN).
# Rebinding the name instead of mutating in place keeps the cell chain-friendly.
us = us.replace(' ?', np.nan)

In [7]:
# Binarise the target column: 1 where the label is ' >50K', 0 for anything else
# (missing values compare unequal and therefore also become 0).
# NOTE: running this cell a second time maps every row to 0, because the
# column no longer holds the original string labels.
us[' <=50K'] = (us[' <=50K'] == ' >50K').astype('int64')

In [8]:
# Collect the labels of all object-dtype (text) columns and print how many
# distinct values each one holds. `cat_features` is reused by the encoding cell.
cat_features = us.select_dtypes(include='object').columns
for feature in cat_features:
    print(feature, us[feature].nunique())

 State-gov 8
 Bachelors 16
 Never-married 7
 Adm-clerical 14
 Not-in-family 6
 White 5
 Male 2
 United-States 41


### Convert categorical features to dummy variables

In [9]:
# One-hot encode every categorical column in a single call.
# get_dummies(columns=...) removes the listed columns and appends their
# indicator columns (named '<column>:<value>') after the remaining ones, in the
# order the columns are listed. That is the same layout the previous
# per-column concat/drop loop produced, without rebuilding the frame on every
# iteration or mutating it in place.
us = pd.get_dummies(us, columns=list(cat_features), prefix_sep=':')

In [10]:
# Preview the frame again to inspect the columns after dummy encoding.
us.head()

Unnamed: 0,39,77516,13,2174,0,40,<=50K,State-gov: Federal-gov,State-gov: Local-gov,State-gov: Never-worked,...,United-States: Portugal,United-States: Puerto-Rico,United-States: Scotland,United-States: South,United-States: Taiwan,United-States: Thailand,United-States: Trinadad&Tobago,United-States: United-States,United-States: Vietnam,United-States: Yugoslavia
0,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,37,284582,14,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [11]:
# Separate the binary target from the predictor columns.
target_col = ' <=50K'
usy = us[target_col]
usx = us.drop(target_col, axis='columns')

## 3. Modeling (without scaling)

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    usx,
    usy,
    test_size=0.3,
    random_state=42,
)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [14]:
# Baseline model fitted on the raw (unscaled) features.
# The solver is named explicitly: the repr recorded under this cell reports
# solver='liblinear', and newer scikit-learn releases use a different default
# solver, which would change the fit and the scores reported below.
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Predicted labels from the unscaled model: y_pred for the training partition,
# y_pred2 for the held-out test partition.
y_pred, y_pred2 = (lr.predict(part) for part in (X_train, X_test))

In [209]:
# Mean accuracy of the unscaled model on each partition, to three decimals.
# The built-in round() replaces the .round() method so the cell works whether
# score() hands back a NumPy scalar or a plain Python float (the latter has no
# .round attribute); the printed text is the same in both cases.
print('training dataset score :', round(lr.score(X_train, y_train), 3))
print('test dataset score :', round(lr.score(X_test, y_test), 3))

training dataset score : 0.797
test dataset score : 0.798


In [210]:
# Per-class precision, recall and F1 of the unscaled model on the training partition.
train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(train_report)

             precision    recall  f1-score   support

          0       0.80      0.97      0.88     17324
          1       0.72      0.26      0.38      5468

avg / total       0.78      0.80      0.76     22792



In [211]:
# Per-class precision, recall and F1 of the unscaled model on the test partition.
test_report = classification_report(y_true=y_test, y_pred=y_pred2)
print(test_report)

             precision    recall  f1-score   support

          0       0.80      0.97      0.88      7395
          1       0.74      0.26      0.39      2373

avg / total       0.79      0.80      0.76      9768



## 4. Modeling (with scaling)

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
# Learn the scaling statistics from the training partition only, then apply
# that same fitted transformation to both partitions so no information from
# the test rows leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train2, X_test2 = scaler.transform(X_train), scaler.transform(X_test)

In [18]:
# Same classifier as the baseline, fitted on the standardised features.
# The solver is named explicitly: the repr recorded under this cell reports
# solver='liblinear', and newer scikit-learn releases use a different default
# solver, which would change the fit and the scores reported below.
lr2 = LogisticRegression(solver='liblinear')
lr2.fit(X_train2, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
# Predicted labels from the scaled model: y_pred3 for the training partition,
# y_pred4 for the held-out test partition.
y_pred3, y_pred4 = (lr2.predict(part) for part in (X_train2, X_test2))

In [20]:
# Mean accuracy of the scaled model on each partition, to three decimals.
# The built-in round() replaces the .round() method so the cell works whether
# score() hands back a NumPy scalar or a plain Python float (the latter has no
# .round attribute); the printed text is the same in both cases.
print('training dataset score :', round(lr2.score(X_train2, y_train), 3))
print('test dataset score :', round(lr2.score(X_test2, y_test), 3))

training dataset score : 0.854
test dataset score : 0.852


In [22]:
# Per-class precision, recall and F1 of the scaled model on the training partition.
train_report_scaled = classification_report(y_true=y_train, y_pred=y_pred3)
print(train_report_scaled)

             precision    recall  f1-score   support

          0       0.88      0.93      0.91     17324
          1       0.74      0.60      0.66      5468

avg / total       0.85      0.85      0.85     22792



In [24]:
# Per-class precision, recall and F1 of the scaled model on the test partition.
test_report_scaled = classification_report(y_true=y_test, y_pred=y_pred4)
print(test_report_scaled)

             precision    recall  f1-score   support

          0       0.88      0.93      0.90      7395
          1       0.74      0.61      0.67      2373

avg / total       0.85      0.85      0.85      9768



### Much better results with scaling: test accuracy rises from 0.798 to 0.852, and recall for class 1 from 0.26 to 0.61.