## Model Training


In [1]:
# Importing Libraries
import numpy as np
import pandas as pd


In [2]:
# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv('data/adult.csv')
# Preview the first rows to check the columns and raw values.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
#space remover
def space_remover(dataframe):
    """Strip leading/trailing whitespace from every text column, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean. Object/string-dtype columns are overwritten with
        their stripped values; all other columns are left untouched.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    Only ``str`` cells are stripped. Missing values (NaN/None) and any other
    non-string objects pass through unchanged; calling ``str.strip`` on them
    directly raises ``TypeError``.
    """
    for column in dataframe.columns:
        dtype = dataframe[column].dtype
        # Accept classic object columns as well as dedicated string dtypes.
        if dtype == 'O' or pd.api.types.is_string_dtype(dtype):
            dataframe[column] = dataframe[column].map(
                lambda value: value.strip() if isinstance(value, str) else value
            )
# Strip surrounding whitespace from df's text columns (modifies df in place).
space_remover(df)

In [5]:
# Remove fully duplicated rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first',inplace=True)

In [6]:
# Binary-encode the target: 0 for '<=50K', 1 for '>50K'.
income_map = {'<=50K' :0 ,'>50K':1}

# Series.map turns every label that is missing from income_map into NaN.
# That also happens when this cell is re-run on already-encoded values, which
# would silently wipe out the target. Fail loudly instead.
unmapped_labels = set(df['salary'].dropna().unique()) - set(income_map)
if unmapped_labels:
    raise ValueError(
        'Unexpected salary labels (was this cell already run?): '
        f'{sorted(unmapped_labels, key=str)}'
    )

df['salary'] = df['salary'].map(income_map)

In [7]:
# Preview the frame after mapping salary to 0/1.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [8]:
# Convert '?' cells to NaN so the imputers in the preprocessing pipelines can fill them.
# Presumably '?' is this dataset's missing-value marker -- TODO confirm.
# The match is on a bare '?' because text columns were whitespace-stripped by space_remover.
df.replace('?',np.nan,inplace=True)

In [9]:
# Drop fnlwgt and capital-loss in place so they are not used as features.
# NOTE(review): the rationale for excluding these two columns is not recorded here.
df.drop(labels=['fnlwgt','capital-loss'],axis=1,inplace=True)

In [10]:
# Preview the frame after dropping fnlwgt and capital-loss.
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,hours-per-week,country,salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,40,Cuba,0


In [11]:
# Select the features by name rather than by position, so the target can never
# leak into X if the column order of df changes.
X = df.drop(labels=['salary'],axis=1)
# Target: the 0/1-encoded salary column.
y = df['salary']

In [12]:
# Preview the feature matrix.
X.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,hours-per-week,country
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,40,United-States
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,13,United-States
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,40,United-States
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,40,United-States
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,40,Cuba


In [13]:
# Preview the target vector.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: salary, dtype: int64

In [14]:
# Split the feature columns by dtype: object columns are treated as
# categorical, everything else as numerical.
object_features = X.select_dtypes(include='object')
non_object_features = X.select_dtypes(exclude='object')

categorical_cols = object_features.columns
numerical_cols = non_object_features.columns

print(f'Categorical columns:{categorical_cols}')
print(f'Numerical columns:{numerical_cols}')

Categorical columns:Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'country'],
      dtype='object')
Numerical columns:Index(['age', 'education-num', 'capital-gain', 'hours-per-week'], dtype='object')


In [15]:
#model training
from sklearn.impute import SimpleImputer #Handling missing values
from sklearn.preprocessing import StandardScaler #handling Feature Scaling
from sklearn.preprocessing import OneHotEncoder # Encoding
##pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [16]:
# Numerical pipeline: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical pipeline: fill missing values with the most frequent category,
# one-hot encode, then standardize the resulting indicator columns.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # handle_unknown='ignore' encodes a category that was not seen during
        # fit as all zeros instead of raising at transform time. Without it,
        # a rare category that lands only in the test split makes transform fail.
        ('one-hot-encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
        ('scaler', StandardScaler()),
    ]
)

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [17]:
# Train/test split

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; random_state is fixed so the split is
# the same on every run.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [18]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer and its output column names for the test split.
# NOTE: this overwrites X_train/X_test with the transformed frames, so re-run
# the split cell before re-running this one.
train_matrix = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
test_matrix = preprocessor.transform(X_test)

X_train = pd.DataFrame(train_matrix, columns=feature_names)
X_test = pd.DataFrame(test_matrix, columns=feature_names)

In [19]:
# Preview the training features after imputation, encoding and scaling.
X_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__education-num,num_pipeline__capital-gain,num_pipeline__hours-per-week,cat_pipeline__workclass_Federal-gov,cat_pipeline__workclass_Local-gov,cat_pipeline__workclass_Never-worked,cat_pipeline__workclass_Private,cat_pipeline__workclass_Self-emp-inc,cat_pipeline__workclass_Self-emp-not-inc,...,cat_pipeline__country_Portugal,cat_pipeline__country_Puerto-Rico,cat_pipeline__country_Scotland,cat_pipeline__country_South,cat_pipeline__country_Taiwan,cat_pipeline__country_Thailand,cat_pipeline__country_Trinadad&Tobago,cat_pipeline__country_United-States,cat_pipeline__country_Vietnam,cat_pipeline__country_Yugoslavia
0,1.498616,1.132773,-0.142996,1.588788,-0.174114,-0.259899,-0.012804,0.570946,-0.191245,-0.289168,...,-0.032659,-0.061181,-0.018109,-0.049231,-0.042015,-0.023959,-0.023087,0.310204,-0.045764,-0.021236
1,0.323774,-0.033504,-0.142996,-0.031707,-0.174114,-0.259899,-0.012804,0.570946,-0.191245,-0.289168,...,-0.032659,-0.061181,-0.018109,-0.049231,-0.042015,-0.023959,-0.023087,0.310204,-0.045764,-0.021236
2,-1.438489,-0.033504,-0.142996,-1.652201,-0.174114,-0.259899,-0.012804,0.570946,-0.191245,-0.289168,...,-0.032659,-0.061181,-0.018109,-0.049231,-0.042015,-0.023959,-0.023087,0.310204,-0.045764,-0.021236
3,0.103491,-0.422263,-0.142996,-0.031707,-0.174114,-0.259899,-0.012804,0.570946,-0.191245,-0.289168,...,-0.032659,-0.061181,-0.018109,-0.049231,-0.042015,-0.023959,-0.023087,0.310204,-0.045764,-0.021236
4,-0.704213,-1.58854,-0.142996,-0.679904,-0.174114,-0.259899,-0.012804,0.570946,-0.191245,-0.289168,...,-0.032659,-0.061181,-0.018109,-0.049231,-0.042015,-0.023959,-0.023087,0.310204,-0.045764,-0.021236


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [21]:
# Fit a logistic regression with default hyperparameters on the transformed training split.
lg = LogisticRegression()
lg.fit(X_train,y_train)

In [23]:
# Inspect the intercept learned by the fitted logistic regression.
lg.intercept_

array([-2.07589514])

In [21]:
#Evaluate Model

def evaluate_model(true, pred):
    """Score predictions against the ground truth.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, in the same order as ``true``.

    Returns
    -------
    The value returned by ``accuracy_score(true, pred)``.
    """
    return accuracy_score(true, pred)

In [22]:
# Candidate classifiers with default hyperparameters. The tree-based models
# are randomized, so their seed is pinned (same value as the train/test split)
# to keep the reported accuracies reproducible across runs.
models = {
    'LogisticRegression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support vector Machine': SVC(),
    'KNN': KNeighborsClassifier(),
}
model_list = []
acc_list = []

# Iterate the dict directly instead of rebuilding list(models...) and indexing
# it on every pass. `model` and `y_pred` stay bound to the last entry after
# the loop finishes.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on the held-out split: the accuracy printed below is measured
    # on X_test/y_test, not on the training data.
    y_pred = model.predict(X_test)

    acc = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model Traning Performance')
    print('Accuracy_score',acc)

    acc_list.append(acc)

    print('='*25)
    print('\n')




LogisticRegression
Model Traning Performance
Accuracy_score 0.8518746158574063


Decision Tree
Model Traning Performance
Accuracy_score 0.8063921327596804


Random Forest
Model Traning Performance
Accuracy_score 0.8362630608481868


Support vector Machine
Model Traning Performance
Accuracy_score 0.8458512599877074


KNN
Model Traning Performance
Accuracy_score 0.8221266133988937




In [23]:
# y_pred still holds the predictions of the last model fitted in the
# comparison loop, so this report describes that model only.
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print('classification_report: ','\n',report_text)
print('\n')
print('confusion_matrix: ','\n',confusion)

classification_report:  
               precision    recall  f1-score   support

           0       0.87      0.90      0.88      6159
           1       0.65      0.58      0.61      1976

    accuracy                           0.82      8135
   macro avg       0.76      0.74      0.75      8135
weighted avg       0.82      0.82      0.82      8135



confusion_matrix:  
 [[5546  613]
 [ 834 1142]]
