In [17]:
%%time
import time
# Wall-clock reference taken at notebook start; a later cell subtracts it from
# time.time() to print the total run time.
start = time.time()
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  GridSearchCV, train_test_split as tts
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve
import warnings
# Installs a global "ignore" filter: every warning (including deprecation and
# convergence warnings) is hidden from this point on.
warnings.filterwarnings("ignore")

CPU times: user 112 µs, sys: 0 ns, total: 112 µs
Wall time: 120 µs


In [18]:
# Preview the first rows of the working frame.
# NOTE(review): in the notebook as laid out here, `df` is first assigned by
# pd.read_csv in the following cell, so on a fresh kernel this cell would
# presumably raise NameError - move it below the load cell.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,HS-grad,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,4356,40,United-States,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
2,66,Private,186061,Some-college,10,Widowed,Prof-specialty,Unmarried,Black,Female,0,4356,40,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0


In [19]:
%%time
df = pd.read_csv("adult.csv")

#Replace ? with NaN and then replce NaN by Mean/mode of the column
removal= df.isin(['?']).any()[df.isin(['?']).any()==True].index.tolist()
for col in removal:
        df[col]= df[col].replace("?", np.NaN)
        try:
            df[col] = df[col].astype(float)
            mean = (df[col].mean(skipna=True))
        except:
            mean = df[col].mode()
            mean = mean[0]
        df[col]= df[col].replace(np.NaN, mean)
df_clean = df

# label encode Y axis (income) Since there are two values : <=50k and >50k
le = LabelEncoder()
df["income"] = le.fit_transform(df["income"])

# Select X axis
X = df.drop(["income"],1)

# Select y axis
y = df["income"]

#Select Numerical columns 
num = X.select_dtypes(include = np.number)

#Select Categorical columns 
cat = X.select_dtypes(exclude = np.number)

# make all values of categorical to lower case to make them standard
for x in list(cat):
    cat[x] = cat[x].str.lower()

CPU times: user 1.33 s, sys: 0 ns, total: 1.33 s
Wall time: 2.07 s


In [20]:
# Drop the education column; the notebook treats education.num as carrying
# the same information.
# Keyword `columns=` replaces the positional axis argument (rejected by
# pandas >= 2.0), and a new frame is bound instead of mutating in place.
cat = cat.drop(columns=["education"])

# One-hot encode all categorical columns, dropping the first level of each.
# dtype=float keeps the dummies numeric regardless of the pandas default
# (bool on pandas >= 2.0), which the statsmodels fit further down requires.
cat = pd.get_dummies(cat, drop_first=True, dtype=float)

# Standardise numerical columns. The original index is preserved so the
# later column-wise concat with `cat` aligns row for row.
# NOTE(review): the scaler is fit on all rows before the train/test split
# performed in later cells, so test-set statistics leak into the features.
scaler = StandardScaler()
num = pd.DataFrame(scaler.fit_transform(num), columns=list(num), index=num.index)

In [21]:
def remove_outlier(X,y,z):
    """Drop rows in which any column of X is 3 or more standard deviations from its mean.

    Z-scores are computed once per column from the full input (population
    standard deviation, ddof=0; a zero spread is treated as 1 so constant
    columns never flag a row). A row is kept only if every column's z-score
    lies strictly between -3 and 3.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric frame used to detect outliers.
    y : pandas.Series or pandas.DataFrame
        Object sharing X's index; filtered to the same rows.
    z : pandas.Series or pandas.DataFrame
        Object sharing X's index; filtered to the same rows.

    Returns
    -------
    tuple
        (X, y, z) restricted to the retained index labels.
    """
    spread = X.std(ddof=0).replace(0, 1)
    z_scores = (X - X.mean()) / spread
    within_bounds = ((z_scores > -3) & (z_scores < 3)).all(axis=1)
    # Select by X's own labels; the previous version rebuilt the scaled frame
    # with a fresh 0..n-1 index and then used those positions as labels.
    keep = X.index[within_bounds.to_numpy()]
    return (X.loc[keep], y.loc[keep], z.loc[keep])

# Report the shapes of the numeric features, target and encoded categoricals.
for frame in (num, y, cat):
    print(frame.shape)
# Outlier removal is currently disabled:
# num,y,cat = remove_outlier(num,y,cat)
# num.shape

(32561, 6)
(32561,)
(32561, 76)


In [4]:
# Final design matrix: scaled numeric columns followed by the one-hot columns.
# `axis` is passed by keyword; pandas >= 2.0 rejects it as a positional argument.
X = pd.concat([num, cat], axis=1)

In [5]:
# Baseline classifier: library-default logistic regression with a fixed seed.
log_reg = LogisticRegression(random_state=42)

In [6]:

# Fit on the complete data set and report accuracy on those same rows
# (training accuracy, not a held-out estimate).
log_reg.fit(X, y).score(X, y)

0.8538286174064638

In [7]:
%%time
X_train, X_test, y_train, y_test = tts(X,y,test_size = 0.25, random_state = 42)
log_reg.fit(X_train,y_train)
y_pred = log_reg.predict(X_test)
print (accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))

0.8507442671315543
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      5786
           1       0.71      0.56      0.63      1671

    accuracy                           0.85      7457
   macro avg       0.80      0.75      0.77      7457
weighted avg       0.84      0.85      0.84      7457

CPU times: user 678 ms, sys: 60.7 ms, total: 739 ms
Wall time: 632 ms


In [8]:
# Confusion matrix on the held-out rows (rows: true class, columns: predicted class).
print (confusion_matrix(y_test,y_pred))


[[5401  385]
 [ 728  943]]


In [9]:
# NOTE(review): y_pred holds hard 0/1 predictions, so this is the area under a
# single-threshold ROC curve rather than a ranking-based ROC AUC. Passing
# log_reg.predict_proba(X_test)[:, 1] as the score would give the usual metric.
print (roc_auc_score(y_test,y_pred))

0.7488964054674577


In [10]:

def extract_best_features(model,n,feature_names=None):
    """Return the n features with the largest absolute coefficient.

    Parameters
    ----------
    model : fitted linear estimator
        Must expose ``coef_``; only the first row (``coef_[0]``) is used.
    n : int
        Number of top rows to return.
    feature_names : sequence of str, optional
        Names matching the columns the model was fit on. When omitted, the
        model's ``feature_names_in_`` attribute is used if present, otherwise
        the columns of the notebook-level ``X``.

    Returns
    -------
    pandas.DataFrame
        Columns ``features`` and ``ranking`` (absolute coefficient), sorted by
        ``ranking`` in descending order and truncated to n rows.

    Raises
    ------
    ValueError
        If the number of names differs from the number of coefficients. The
        previous version silently padded with NaN in that case.
    """
    if feature_names is None:
        feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is None:
        # Notebook fallback: assume the model was fit on the columns of X.
        feature_names = list(X)
    names = list(feature_names)
    weights = np.abs(np.asarray(model.coef_[0], dtype=float))
    if len(names) != len(weights):
        raise ValueError(
            "got %d feature names for %d coefficients" % (len(names), len(weights))
        )
    fs = pd.DataFrame({"features": names, "ranking": weights})
    fs = fs.sort_values(["ranking"], ascending=False)
    return fs[:n]

In [11]:
# Display the estimator and its parameters.
log_reg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# Names of the 15 columns with the largest absolute coefficient in log_reg.
features_1 = extract_best_features(log_reg, 15)["features"].tolist()

In [13]:
# Display the coefficient-based feature list.
features_1

['capital.gain',
 'marital.status_married-af-spouse',
 'marital.status_married-civ-spouse',
 'relationship_wife',
 'occupation_priv-house-serv',
 'workclass_without-pay',
 'native.country_south',
 'native.country_laos',
 'relationship_own-child',
 'occupation_farming-fishing',
 'workclass_self-emp-not-inc',
 'native.country_dominican-republic',
 'native.country_vietnam',
 'native.country_greece',
 'capital.loss']

In [14]:
# Univariate feature selection: score every column of X against y with
# f_classif and configure the selector to keep 15 columns.
from sklearn.feature_selection import SelectKBest, f_classif
skb = SelectKBest(score_func=f_classif, k = 15)
skb.fit(X,y)

SelectKBest(k=15, score_func=<function f_classif at 0x7f8454fda6a8>)

In [15]:
# Pair every column of X with its p-value from the fitted selector and keep
# the 15 rows with the smallest p-values.
features = list(X)
p_values = pd.Series(skb.pvalues_)
fs = (
    pd.DataFrame({"features": features, "p_values": p_values})
    .sort_values(["p_values"], ascending=True)
    .head(15)
)

In [16]:
# Names of the 15 lowest-p-value columns, as a plain list.
features_2 = fs["features"].tolist()

In [17]:
from sklearn.feature_selection import RFECV, RFE

In [None]:
%%time
# Recursive feature elimination with 5-fold cross-validation around the
# baseline estimator; never goes below 15 selected features.
rfe_cv = RFECV(estimator=log_reg,min_features_to_select=15, cv = 5)
rfe_cv.fit(X,y)

In [None]:
# One flag per column of X: True where RFECV kept the column.
boolean = rfe_cv.get_support().tolist()

In [None]:
from itertools import compress
# Keep the column names whose flag in `boolean` is True, in column order.
features_3 = list(compress(list(X), boolean))

In [None]:
# NOTE(review): this keeps the first 15 selected names in column order of X,
# not the 15 strongest ones - RFECV may have selected more than 15 columns.
features_3 = features_3[:15]

In [None]:
import statsmodels.api as sm
# Add an intercept column, then split. This rebinds X_train/X_test/y_train/
# y_test; the frames now include the constant column, and it is these that
# are handed to logit_fs at the end of this cell.
X_3 = sm.add_constant(X)
X_train, X_test, y_train, y_test = tts(X_3,y,random_state = 42, test_size = 0.25)
model = sm.Logit(y_train,X_train).fit(method = 'bfgs')
# NOTE(review): this is not the last expression of the cell, so with default
# notebook display settings the summary is computed but not shown.
model.summary()

def logit_fs(X,y,alpha=.05):
    """Backward elimination of features by p-value using a statsmodels Logit.

    A logistic model with intercept is fit; while the largest coefficient
    p-value among the features is >= alpha, the single feature holding it is
    removed and the model is refit. The intercept (``const``) is never
    eliminated and never reported as a feature.

    P-values are read from ``model.pvalues`` at full precision. The previous
    version parsed the rounded HTML summary table, which dropped every feature
    tied at the rounded maximum in one step and assumed ``const`` was still
    the first row when slicing the result.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; a constant column is added if not already present.
    y : pandas.Series
        Binary target.
    alpha : float, default 0.05
        Significance threshold; features with p-value >= alpha are removed.

    Returns
    -------
    tuple
        (list of retained feature names, summary of the final fitted model).
    """
    design = sm.add_constant(X).astype(float)
    target = y.astype(float)
    while True:
        # Same dtype and optimiser on every refit.
        model = sm.Logit(target, design).fit(method='lbfgs')
        p_values = model.pvalues.drop(labels='const', errors='ignore')
        # NaN p-values compare False against alpha, so they stop elimination
        # instead of looping forever.
        if p_values.empty or not (p_values.max() >= alpha):
            break
        design = design.drop(columns=[p_values.idxmax()])
    features = list(p_values.index)
    results_summary = model.summary()
    return(features,results_summary)
# Run the p-value based elimination on the training split produced earlier in
# this cell, then keep at most the first 15 retained names (list order, not
# strength order).
features_4, summary = logit_fs(X_train,y_train)
features_4 = features_4[:15]

In [None]:
# Display the statsmodels-based feature list.
features_4

In [None]:
def print_metrics(model, feature_list_to_use):
    """Fit `model` on a fixed 75/25 split of the chosen columns and print metrics.

    Uses the notebook-level ``X`` and ``y``. The model is refit in place on the
    training part; accuracy, the classification report and ROC AUC are printed
    for the held-out part.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Refit by this call.
    feature_list_to_use : list of str
        Column names of ``X`` to train on.

    Returns
    -------
    None
    """
    X_1 = X[feature_list_to_use]
    X_train, X_test, y_train, y_test = tts(X_1,y,random_state = 42, test_size = 0.25)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    # ROC AUC needs a continuous score; hard 0/1 predictions collapse the
    # curve to a single threshold. Fall back to the labels only when the
    # model exposes no score at all.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred
    print ("Accuracy_score is " + str(accuracy_score(y_test,y_pred)))
    print (classification_report(y_test,y_pred))
    print ("AUC_score is " + str(roc_auc_score(y_test,y_score)))

In [None]:
# features_1= features_1["features"].tolist()

In [None]:
# features_2 = features_2["features"].tolist()

In [None]:
# Display the statsmodels-based feature list (repeats an earlier display cell).
features_4

In [None]:
# All features, unweighted logistic regression
model = LogisticRegression(random_state=42)
print_metrics(model, list(X))

In [None]:
# SelectKBest features (features_2), unweighted logistic regression
model = LogisticRegression(random_state=42)
print_metrics(model, features_2)

In [None]:
# RFECV features (features_3), unweighted logistic regression
model = LogisticRegression(random_state=42)
print_metrics(model, features_3)

In [None]:
# statsmodels Logit features (features_4), unweighted logistic regression
model = LogisticRegression(random_state=42)
print_metrics(model, features_4)

In [None]:
# All features, logistic regression with class_weight="balanced"
model = LogisticRegression(random_state=42, class_weight="balanced")
print_metrics(model, list(X))

In [None]:
# SelectKBest features (features_2), logistic regression with class_weight="balanced"
model = LogisticRegression(random_state=42, class_weight="balanced")
print_metrics(model, features_2)

In [None]:
# RFECV features (features_3), logistic regression with class_weight="balanced"
model = LogisticRegression(random_state=42, class_weight = "balanced")
print_metrics(model, features_3)

In [None]:
# statsmodels Logit features (features_4), logistic regression with class_weight="balanced"
model = LogisticRegression(random_state=42, class_weight="balanced")
print_metrics(model, features_4)

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Restrict to the SelectKBest columns, re-split, and define the search grid:
# C from 0.1 up to (but excluding) 10 in steps of 0.1, with both penalties.
X_1 = X[features_2]
X_train, X_test, y_train, y_test = tts(X_1, y, random_state=42, test_size=0.25)
params = dict(C=np.arange(0.1, 10, 0.1), penalty=["l1", "l2"])

In [None]:
# Display the hyper-parameter grid.
params

In [None]:
# Class-weighted logistic regression tuned by 5-fold grid search.
# The grid includes penalty="l1", so the solver is pinned to liblinear, which
# supports both l1 and l2. Relying on the library default only works on
# scikit-learn releases whose default solver is liblinear.
model = LogisticRegression(random_state=42, class_weight="balanced", solver="liblinear")
log_reg_cv = GridSearchCV(param_grid=params, cv = 5, estimator=model )

In [None]:
%%time
# Run the grid search on the training split of the SelectKBest columns.
log_reg_cv.fit(X_train, y_train)

In [None]:
# Display the parameter combination the search selected.
log_reg_cv.best_params_

In [None]:
# Seconds elapsed since `start` was recorded in the first cell.
elapsed = time.time() - start
print("Run time is {}".format(elapsed))

In [None]:
# Rebind `model` to the estimator refit with the best grid-search parameters.
model = log_reg_cv.best_estimator_

In [None]:
# Display the tuned estimator.
model

In [None]:
# Evaluate the tuned estimator on the SelectKBest columns. print_metrics makes
# its own split and refits the model it is given before scoring.
print_metrics(model=model, feature_list_to_use=features_2)

In [None]:
!pip install imblearn

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Work on the SelectKBest columns again; the split is deliberately disabled
# here and performed after resampling in a later cell.
X_1 = X[features_2]
# X_train, X_test, y_train, y_test = tts(X_1,y,test_size = 0.25, random_state = 42)

In [None]:
# Oversample the minority class with SMOTE.
# - Named `smote` rather than `sm`, which would shadow the statsmodels alias
#   that logit_fs depends on.
# - fit_resample is the current imbalanced-learn API; fit_sample was removed.
# - np.asarray(y) yields the same 1-D array as Series.ravel(), which pandas
#   has deprecated.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_1, np.asarray(y))

In [None]:
# Split the resampled data 75/25.
# NOTE(review): resampling happened on the full data set before this split, so
# the test part contains synthetic rows and rows that influenced synthetic
# training rows. Splitting first and resampling only the training part would
# avoid this leakage.
X_train, X_test, y_train, y_test = tts(X_res,y_res,test_size = 0.25, random_state = 42)

In [None]:
# model.fit(X_train,y_train)

In [None]:
# y_pred=model.predict(X_test)

In [None]:
# print (classification_report(y_test,y_pred))

In [None]:
# Refit the tuned estimator on the resampled training split and report metrics
# on the resampled test split.
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
# ROC AUC is computed from the positive-class probability; hard 0/1
# predictions would reduce the curve to a single threshold.
y_score = model.predict_proba(X_test)[:, 1]
print ("Accuracy_score is " + str(accuracy_score(y_test,y_pred)))
print (classification_report(y_test,y_pred))
print ("AUC_score is " + str(roc_auc_score(y_test,y_score)))