In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (StandardScaler, PolynomialFeatures)
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score,recall_score,precision_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import os
import statsmodels.api as sm

In [3]:
data= pd.read_csv('adult.csv',na_values=' ?')

In [4]:
data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)

raw_data= data.drop(['education'], axis = 1)


In [5]:
data = raw_data.copy()

In [6]:
data['country'] = data['country'].apply(lambda x: 'United-States' if x=='United-States' else 'NA')

In [7]:
#cols with ? in them
column_list = data.columns
data[column_list] = data[column_list].replace('?',np.nan )

In [8]:
#check nulls
for column in column_list:
    print('Number of null recs in col ',column,'is ',data[column].isnull().sum())
#workclass, occupation, country have nulls

Number of null recs in col  age is  0
Number of null recs in col  workclass is  1836
Number of null recs in col  fnlwgt is  0
Number of null recs in col  education-num is  0
Number of null recs in col  marital-status is  0
Number of null recs in col  occupation is  1843
Number of null recs in col  relationship is  0
Number of null recs in col  race is  0
Number of null recs in col  sex is  0
Number of null recs in col  capital-gain is  0
Number of null recs in col  capital-loss is  0
Number of null recs in col  hours-per-week is  0
Number of null recs in col  country is  0
Number of null recs in col  salary is  0


In [9]:

workclass_mode =str(data['workclass'].mode()[0])
occupation_mode =str(data['occupation'].mode()[0])
country_mode =str(data['country'].mode()[0])

data['workclass']=data['workclass'].fillna(workclass_mode)
#data.iloc[27]

data['occupation']=data['occupation'].fillna(occupation_mode)
data['country']=data['country'].fillna(country_mode)

In [19]:
data_ohc = data.copy()

#nunique- no of uniques
mask = data_ohc.dtypes == np.object

#add education-num to be one hot encoded
mask['education-num'] = True
# we use this true false sequence to filter cols
categorical_cols = data_ohc.columns[mask]

################################
#easy method to one hot encode

ohe=OneHotEncoder(sparse=False)
data_ohc_encoded = pd.DataFrame (ohe.fit_transform(data_ohc[categorical_cols]))
data_ohc_encoded.columns = ohe.get_feature_names(categorical_cols)
data_ohc.drop(categorical_cols ,axis=1, inplace=True)
data_ohc= pd.concat([data_ohc, data_ohc_encoded ], axis=1)
#data_ohc is final

data_ohc.shape[1] - data.shape[1]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  mask = data_ohc.dtypes == np.object


53

In [20]:

#############################
data_ohc=data_ohc.drop('salary_<=50K', axis = 1)
#############################

In [21]:
data_ohc_temp = data_ohc.head(1000)

In [22]:
y_col = 'salary_>50K'
feature_cols = [x for x in data_ohc.columns if x != y_col]

X = data_ohc[feature_cols]
y = data_ohc[y_col]

In [None]:
#X = data_ohc_temp[feature_cols]
#y = data_ohc_temp[y_col]

In [23]:
#################################
#significance check

results = sm.OLS(y, X).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:            salary_>50K   R-squared:                       0.366
Model:                            OLS   Adj. R-squared:                  0.365
Method:                 Least Squares   F-statistic:                     328.9
Date:                Tue, 18 Jan 2022   Prob (F-statistic):               0.00
Time:                        17:43:02   Log-Likelihood:                -11123.
No. Observations:               32561   AIC:                         2.236e+04
Df Residuals:                   32503   BIC:                         2.285e+04
Df Model:                          57                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [24]:
X = X.drop(
['workclass_Never-worked', 'marital-status_Married-AF-spouse','occupation_Tech-support','relationship_Wife',
'workclass_Federal-gov', 'workclass_Self-emp-inc', 'occupation_Priv-house-serv','occupation_Prof-specialty',
'occupation_Protective-serv','occupation_Armed-Forces', 'occupation_Sales', 'sex_Female','country_NA'] , axis = 1
)


In [None]:
estimator = Pipeline([   ("scaler", StandardScaler()),
       			 ("polynomial_features", PolynomialFeatures()),
        			 ("logistic_regression", LogisticRegression(class_weight='balanced'))])

params = {
  			  'polynomial_features__degree': [1],
                'logistic_regression__max_iter':[1000],
#                'logistic_regression__C': np.logspace(-4, 4, 5),
                'logistic_regression__C': [1],
#                'logistic_regression__penalty' : ['l1', 'l2','elasticnet','none'],
                'logistic_regression__penalty' : ['l1'],
#                'logistic_regression__solver' : ['liblinear','lbfgs','newton-cg','sag','saga']
                'logistic_regression__solver' : ['liblinear']
}

kf = KFold(shuffle=True, random_state=72018, n_splits=3)

#grid = GridSearchCV(estimator, params, cv=kf,scoring='f1' ,verbose = True)
grid = GridSearchCV(estimator, params,cv = kf ,scoring='f1' )
grid.fit(X, y)

grid.best_score_, grid.best_params_

grid.best_estimator_
#- to find best score and parameters
y_predict = grid.predict(X)



print('Confusion Matrix \n',confusion_matrix(y,y_predict))
print('Accuracy Score:', accuracy_score(y,y_predict))
print('Area Under Curve:', roc_auc_score(y,y_predict))
print('Recall score: ',recall_score(y,y_predict))
print('precision score: ',precision_score(y,y_predict))
print('f1 score: ',f1_score(y,y_predict))
print('classification_report: \n',classification_report(y,y_predict))

In [None]:
"""OUTPUTS
Recall: what percentage of actual high income people did we predict

Recall is desired in our case since these are high revenue people we need to predict. 
we can afford to wrongly predict people with low income as high, 
but need to identify the high ones correctly

-------------------------------------------------------
with balanced weight

(0.6797920977097817,
 {'logistic_regression__C': 0.0001,
  'logistic_regression__max_iter': 100,
  'logistic_regression__penalty': 'none',
  'logistic_regression__solver': 'lbfgs',
  'polynomial_features__degree': 1})

Confusion Matrix 
 [[19654  5066]
 [ 1174  6667]]
Accuracy Score: 0.8083596941125887
Area Under Curve: 0.8226694623192586
Recall score:  0.8502741997194235
precision score:  0.5682263700673315
f1 score:  0.6812097680596709
"""

# Ridge classifier 
## gives Recalls as 0.75 0.85

In [26]:

# Trying Ridge Classifier
#getting 0.75, 0.85 as recalls
for i in range(0,1000,100):
    ridge_classi = RidgeClassifier(alpha= i, class_weight= "balanced")
    model=ridge_classi.fit(X,y)
    y_predict = model.predict(X)
    print("alpha is :", i)
#    print(model.coef_)
    print(classification_report(y,y_predict))
    

alpha is : 0
              precision    recall  f1-score   support

         0.0       0.94      0.76      0.84     24720
         1.0       0.53      0.86      0.65      7841

    accuracy                           0.78     32561
   macro avg       0.74      0.81      0.75     32561
weighted avg       0.84      0.78      0.80     32561

alpha is : 100
              precision    recall  f1-score   support

         0.0       0.94      0.75      0.84     24720
         1.0       0.52      0.86      0.65      7841

    accuracy                           0.78     32561
   macro avg       0.73      0.81      0.74     32561
weighted avg       0.84      0.78      0.79     32561

alpha is : 200
              precision    recall  f1-score   support

         0.0       0.94      0.75      0.84     24720
         1.0       0.52      0.86      0.65      7841

    accuracy                           0.78     32561
   macro avg       0.73      0.80      0.74     32561
weighted avg       0.84      0.

# Lasso Classifier
### Gives Recalls as 0.80 nd 0.85

In [27]:
# Trying Lasso Classification    
#getting 0.8, 0.85 as recalls (lasso)
for i in range(1,10,1):
    lr_regressor = LogisticRegression(penalty='l1',C = i/10,max_iter=10000, solver='liblinear', class_weight='balanced')
    model=lr_regressor.fit(X,y)
    y_predict = model.predict(X)
    print("alpha is :", i/10)
#    print(model.coef_)
    print(classification_report(y,y_predict))
    print(confusion_matrix(y,y_predict))

alpha is : 0.1
              precision    recall  f1-score   support

         0.0       0.94      0.79      0.86     24720
         1.0       0.56      0.85      0.68      7841

    accuracy                           0.81     32561
   macro avg       0.75      0.82      0.77     32561
weighted avg       0.85      0.81      0.82     32561

[[19567  5153]
 [ 1176  6665]]
alpha is : 0.2
              precision    recall  f1-score   support

         0.0       0.94      0.79      0.86     24720
         1.0       0.57      0.85      0.68      7841

    accuracy                           0.81     32561
   macro avg       0.76      0.82      0.77     32561
weighted avg       0.85      0.81      0.82     32561

[[19642  5078]
 [ 1169  6672]]
alpha is : 0.3
              precision    recall  f1-score   support

         0.0       0.94      0.80      0.86     24720
         1.0       0.57      0.85      0.68      7841

    accuracy                           0.81     32561
   macro avg       0.

# SVC
### Recalls as 0.52 0.52

In [None]:
# Trying Support Vector Classifier
# very low socre - recalls 0.52, 0.52
svc_classifier = SVC()
parameters = {
    'class_weight':['balanced'],
    'C': [0.1,0.6,1],
    'kernel':['poly', 'rbf', 'sigmoid'],
    'degree' : [1,2]
    
}
model = GridSearchCV(estimator =svc_classifier,param_grid = parameters, cv = 5, scoring = 'f1' )
model_cv=model.fit(X,y)
y_predict = model_cv.predict(X)
#print("C is :", i/10)
    #print(model.coef_)
print(classification_report(y,y_predict))
print(confusion_matrix(y,y_predict))
model.best_estimator_
# SVC(C=0.1, class_weight='balanced', degree=1, kernel='sigmoid')

# Random Forest
### Recalls as 0.87 0.91
#### Best performance till now

In [28]:
# Trying Random Forest
# Random forest Recall scores 0.87, 0.91
#this is the best model till now
random_forest_classifier = RandomForestClassifier()
parameters = {
    'class_weight':['balanced'],
#    'n_estimators': [100,200,500,1000],
    'n_estimators': [50,70,100],    
    'criterion':['gini', 'entropy'],
#    'min_samples_leaf' : [5,10,20,50]
    'min_samples_leaf' : [3,4,5]
    
}
model = GridSearchCV(estimator =random_forest_classifier,param_grid = parameters, cv = 5, scoring = 'f1' )
model_cv=model.fit(X,y)
y_predict = model_cv.predict(X)
print(classification_report(y,y_predict))
print(confusion_matrix(y,y_predict))
print(model.best_estimator_)
#RandomForestClassifier(class_weight='balanced', min_samples_leaf=3, n_estimators=70)


              precision    recall  f1-score   support

         0.0       0.97      0.87      0.92     24720
         1.0       0.69      0.91      0.78      7841

    accuracy                           0.88     32561
   macro avg       0.83      0.89      0.85     32561
weighted avg       0.90      0.88      0.88     32561

[[21502  3218]
 [  735  7106]]
RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       min_samples_leaf=3)


In [None]:
#Random forest output
"""
              precision    recall  f1-score   support

         0.0       0.97      0.87      0.92     24720
         1.0       0.69      0.91      0.78      7841

    accuracy                           0.88     32561
   macro avg       0.83      0.89      0.85     32561
weighted avg       0.90      0.88      0.88     32561

[[21534  3186]
 [  728  7113]]
RandomForestClassifier(class_weight='balanced', min_samples_leaf=3,
                       n_estimators=70)
"""