In [63]:
import pandas as pd
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [65]:
names_col = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 
             'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 
             'hours_per_week', 'native_country', 'income']
df = pd.read_csv("data/adult.data", names=names_col)

In [67]:
df = df.replace(to_replace=' ?', value='', inplace=False)

In [68]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [69]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [70]:

workclass_categories = [' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', '', ' Self-emp-inc', ' Without-pay',
       ' Never-worked']

education_categories = [' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th']
marital_status_categories = [' Never-married', ' Married-civ-spouse', ' Divorced',
       ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
       ' Widowed']
occupation_categories = [' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
       ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
       ' Tech-support', '', ' Protective-serv', ' Armed-Forces',
       ' Priv-house-serv']

relationship_categories = [' Not-in-family', ' Husband', ' Wife', ' Own-child', ' Unmarried',
       ' Other-relative']

race_categories = [' White', ' Black', ' Asian-Pac-Islander', ' Amer-Indian-Eskimo',' Other']

sex_categories = [' Male', ' Female']

native_country_categories = [' United-States', ' Cuba', ' Jamaica', ' India', '', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands']

In [71]:
y = df.income.replace({" <=50K": 0, " >50K": 1})
X = df.drop(["income"], axis= 1)

In [72]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [73]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
16917,36,Private,182074,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,40,United-States
31078,18,Private,141626,Some-college,10,Never-married,Tech-support,Own-child,White,Male,2176,0,20,United-States
14943,31,Private,214235,HS-grad,9,Married-civ-spouse,Other-service,Husband,White,Male,0,0,65,United-States
2658,49,Private,277434,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States
31540,61,,69285,HS-grad,9,Married-civ-spouse,,Husband,White,Male,0,0,37,United-States


In [74]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
    ]
)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[workclass_categories,education_categories,marital_status_categories,occupation_categories,relationship_categories,race_categories,sex_categories,native_country_categories])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [75]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())

In [76]:
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [77]:
X_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__fnlwgt,num_pipeline__education_num,num_pipeline__capital_gain,num_pipeline__capital_loss,num_pipeline__hours_per_week,cat_pipeline__workclass,cat_pipeline__education,cat_pipeline__marital_status,cat_pipeline__occupation,cat_pipeline__relationship,cat_pipeline__race,cat_pipeline__sex,cat_pipeline__native_country
0,-0.18447,-0.073413,-0.433675,-0.146309,-0.217065,-0.036989,-0.252418,-0.702886,-0.866192,-0.782872,1.015855,-0.353194,-0.704642,-0.25452
1,-1.510861,-0.456592,-0.043314,0.149568,-0.217065,-1.658547,-0.252418,0.458817,-0.866192,1.58643,1.015855,-0.353194,-0.704642,-0.25452
2,-0.552912,0.231261,-0.433675,-0.146309,-0.217065,1.989959,-0.252418,-0.702886,-0.061831,-0.190546,-0.377667,-0.353194,-0.704642,-0.25452
3,0.773479,0.82997,-0.433675,-0.146309,-0.217065,0.77379,-0.252418,-0.702886,-0.061831,0.697942,-0.377667,-0.353194,-0.704642,-0.25452
4,1.65774,-1.141907,-0.433675,-0.146309,-0.217065,-0.280223,2.199543,-0.702886,-0.061831,1.882593,-0.377667,-0.353194,-0.704642,-0.25452


In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.model_selection import cross_validate

In [79]:
lgr_model = LogisticRegression(random_state=42)
lgr_model.fit(X_train, y_train)

In [80]:
pred = lgr_model.predict(X_test)
acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred)
print(f"Accuracy: {round(acc,4)}, F1-score: {round(f1, 4)}")

Accuracy: 0.8244, F1-score: 0.5438


In [81]:
results = cross_validate(lgr_model, X_train, y_train, cv=10, scoring=['accuracy', 'f1'], return_train_score=True)

In [82]:
print(f'Accuracy - train: {results["train_accuracy"].mean()}, | Validation: {results["test_accuracy"].mean()}')
print(f'F1-score - train: {results["train_f1"].mean()}, | Validation: {results["test_f1"].mean()}')

Accuracy - train: 0.825474821130514, | Validation: 0.8251136039968285
F1-score - train: 0.5557396406490602, | Validation: 0.5551045855884569
