## Model Training


In [2]:
# Importing Libraries
import numpy as np
import pandas as pd


In [3]:
df = pd.read_csv('data/census_income.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [4]:
def white_space_remover(dataframe):
   for i in dataframe.columns:
      if dataframe[i].dtype == 'O':
         dataframe[i] = dataframe[i].map(str.strip)
      else:
         pass
white_space_remover(df)

In [5]:
df.drop_duplicates(keep='first',inplace=True)

In [6]:
income_map = {'<=50K' :0 ,'>50K':1}

df['income'] = df['income'].map(income_map)

In [7]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0


In [8]:
df.replace('?',np.nan,inplace=True)

In [9]:
df.drop(labels=['fnlwgt','capital.loss'],axis=1,inplace=True)

In [10]:
df.head()

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,hours.per.week,native.country,income
0,90,,HS-grad,9,Widowed,,Not-in-family,White,Female,0,40,United-States,0
1,82,Private,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,18,United-States,0
2,66,,Some-college,10,Widowed,,Unmarried,Black,Female,0,40,United-States,0
3,54,Private,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,40,United-States,0
4,41,Private,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,40,United-States,0


In [11]:
X = df.iloc[:,:-1]
y = df['income']

In [12]:
X.head()

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,hours.per.week,native.country
0,90,,HS-grad,9,Widowed,,Not-in-family,White,Female,0,40,United-States
1,82,Private,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,18,United-States
2,66,,Some-college,10,Widowed,,Unmarried,Black,Female,0,40,United-States
3,54,Private,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,40,United-States
4,41,Private,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,40,United-States


In [13]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: income, dtype: int64

In [14]:
# segregate Num and Cat features

categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

print(f'Categorical columns:{categorical_cols}')
print(f'Numerical columns:{numerical_cols}')

Categorical columns:Index(['workclass', 'education', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'native.country'],
      dtype='object')
Numerical columns:Index(['age', 'education.num', 'capital.gain', 'hours.per.week'], dtype='object')


In [15]:
#model training
from sklearn.impute import SimpleImputer #Handling missing values
from sklearn.preprocessing import StandardScaler #handling Feature Scaling
from sklearn.preprocessing import OneHotEncoder # Encoding
##pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [16]:
##Numerical pipeline
num_pipeline = Pipeline(
   steps=[
      ('imputer',SimpleImputer(strategy='median')),
      ('scaler',StandardScaler())
   ]
)
##Categorical pipeline
cat_pipeline = Pipeline(
   steps=[
      ('imputer',SimpleImputer(strategy='most_frequent')),
      ('one-hot-encoder',OneHotEncoder(sparse_output=False)),
      ('scaler',StandardScaler())
   ]
)

preprocessor = ColumnTransformer([
   ('num_pipeline',num_pipeline,numerical_cols),
   ('cat_pipeline',cat_pipeline,categorical_cols)
]
)

In [17]:
#train tset split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [18]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())

X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [19]:
X_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__education.num,num_pipeline__capital.gain,num_pipeline__hours.per.week,cat_pipeline__workclass_Federal-gov,cat_pipeline__workclass_Local-gov,cat_pipeline__workclass_Never-worked,cat_pipeline__workclass_Private,cat_pipeline__workclass_Self-emp-inc,cat_pipeline__workclass_Self-emp-not-inc,...,cat_pipeline__native.country_Portugal,cat_pipeline__native.country_Puerto-Rico,cat_pipeline__native.country_Scotland,cat_pipeline__native.country_South,cat_pipeline__native.country_Taiwan,cat_pipeline__native.country_Thailand,cat_pipeline__native.country_Trinadad&Tobago,cat_pipeline__native.country_United-States,cat_pipeline__native.country_Vietnam,cat_pipeline__native.country_Yugoslavia
0,0.838334,1.133344,-0.147283,1.178557,5.702564,-0.260975,-0.016939,-1.746846,-0.188814,-0.289584,...,-0.036237,-0.055895,-0.020248,-0.049231,-0.04052,-0.021236,-0.021236,0.306378,-0.047094,-0.023959
1,-0.995673,-0.033636,-0.147283,-0.036444,-0.17536,-0.260975,-0.016939,0.57246,-0.188814,-0.289584,...,-0.036237,-0.055895,-0.020248,-0.049231,-0.04052,-0.021236,-0.021236,0.306378,-0.047094,-0.023959
2,-0.702232,0.744351,-0.147283,-0.036444,-0.17536,-0.260975,-0.016939,0.57246,-0.188814,-0.289584,...,-0.036237,-0.055895,-0.020248,-0.049231,-0.04052,-0.021236,-0.021236,0.306378,-0.047094,-0.023959
3,-1.435835,-3.14558,-0.147283,1.178557,-0.17536,-0.260975,-0.016939,0.57246,-0.188814,-0.289584,...,-0.036237,-0.055895,-0.020248,-0.049231,-0.04052,-0.021236,-0.021236,-3.263939,-0.047094,-0.023959
4,-1.289115,-0.422629,0.401092,-0.036444,-0.17536,-0.260975,-0.016939,0.57246,-0.188814,-0.289584,...,-0.036237,-0.055895,-0.020248,-0.049231,-0.04052,-0.021236,-0.021236,0.306378,-0.047094,-0.023959


In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [21]:
lg = LogisticRegression()
lg.fit(X_train,y_train)

In [23]:
lg.intercept_

array([-2.07589514])

In [25]:
#Evaluate Model

def evaluate_model(true,pred):
   acc = accuracy_score(true,pred)
   return acc

In [41]:
models = {
   'LogisticRegression':LogisticRegression(),
   'Decision Tree':DecisionTreeClassifier(),
   'Support vector Machine':SVC(),
   'Naive Bayes':GaussianNB(),
   'Random Forest':RandomForestClassifier(),
   'KNN':KNeighborsClassifier()
}
model_list = []
acc_list=[]

for i in range(len(list(models))):
   model = list(models.values())[i]
   model.fit(X_train,y_train)

   #make prediction
   y_pred = model.predict(X_test)

   acc = evaluate_model(y_test,y_pred)

   print(list(models.keys())[i])
   model_list.append(list(models.keys())[i])

   print('Model Traning Performance')
   print('Accuracy_score',acc)

   acc_list.append(acc)

   print('='*25)
   print('\n')




LogisticRegression
Model Traning Performance
Accuracy_score 0.848678549477566


Decision Tree
Model Traning Performance
Accuracy_score 0.8113091579594346


Support vector Machine
Model Traning Performance
Accuracy_score 0.8478180700676091


Naive Bayes
Model Traning Performance
Accuracy_score 0.3708666256914567


Random Forest
Model Traning Performance
Accuracy_score 0.8430239704978488


KNN
Model Traning Performance
Accuracy_score 0.8263060848186847




In [40]:
print('classification_report: ','\n',classification_report(y_test,y_pred))
print('confusion_matrix: ','\n',confusion_matrix(y_test,y_pred))

classification_report:  
               precision    recall  f1-score   support

           0       0.88      0.90      0.89      6228
           1       0.64      0.59      0.61      1907

    accuracy                           0.83      8135
   macro avg       0.76      0.74      0.75      8135
weighted avg       0.82      0.83      0.82      8135

confusion_matrix:  
 [[5603  625]
 [ 788 1119]]
