In [68]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [43]:
df=pd.read_csv('data/adult.csv')

In [44]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0


In [45]:
df.drop(['fnlwgt'], axis=1, inplace=True)

In [46]:
X = df.drop(['income'], axis=1)
y = df['income']

In [47]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [48]:
categorical_cols

Index(['workclass', 'education', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'native.country'],
      dtype='object')

In [49]:
numerical_cols

Index(['age', 'education.num', 'capital.gain', 'capital.loss',
       'hours.per.week'],
      dtype='object')

In [50]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [51]:
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler())
    ])
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ('onehotencoding',OneHotEncoder())
    ])
preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline, ['age','education.num', 'capital.gain', 'capital.loss','hours.per.week']),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [52]:
preprocessor

In [53]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30, random_state=10)

In [54]:
X_train = preprocessor.fit_transform(X_train)

In [55]:
X_test = preprocessor.fit_transform(X_test)

In [56]:
from sklearn.linear_model import LogisticRegression

In [69]:
model = LogisticRegression()

In [70]:
model.fit(X_train, y_train)

In [71]:
from sklearn.metrics import confusion_matrix

In [72]:
model.score(X_test, y_test)

0.8044835704780428

In [73]:
y_pred=model.predict(X_test)

In [74]:
confusion_matrix(y_pred, y_test)

array([[7259, 1752],
       [ 158,  600]], dtype=int64)

In [23]:
from sklearn.model_selection import GridSearchCV

In [75]:
param = {
    'max_iter' :[250, 500, 1000],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'class_weight': ['dict','balanced'],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky' 'sag' 'saga'],
}

In [76]:
gsc = GridSearchCV(model, param, cv=20)

In [None]:
gsc.fit(X_train,y_train)

In [None]:
gsc.score(X_test, y_test)

In [None]:
gsc.best_params_