### Name: SHARDUL GORE
### Roll: R016

## Q2

In [238]:
# importing libraries
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

In [239]:
df = pd.read_csv('adult.csv') # reading dataset
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [240]:
df.info() # checking model info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [241]:
df.isnull().sum() # checking null values

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

### checking value counts or number of classes in object columns

In [242]:
df['workclass'].value_counts()

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64

In [243]:
df['education'].value_counts()

education
HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: count, dtype: int64

In [244]:
df['marital-status'].value_counts()

marital-status
Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse           37
Name: count, dtype: int64

In [245]:
df['marital-status'].value_counts()

marital-status
Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse           37
Name: count, dtype: int64

In [246]:
df['relationship'].value_counts()

relationship
Husband           19716
Not-in-family     12583
Own-child          7581
Unmarried          5125
Wife               2331
Other-relative     1506
Name: count, dtype: int64

In [247]:
df['race'].value_counts()

race
White                 41762
Black                  4685
Asian-Pac-Islander     1519
Amer-Indian-Eskimo      470
Other                   406
Name: count, dtype: int64

In [248]:
df['gender'].value_counts()

gender
Male      32650
Female    16192
Name: count, dtype: int64

In [249]:
df['native-country'].value_counts()

native-country
United-States                 43832
Mexico                          951
?                               857
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua                        49
Peru         

In [250]:
df['income'].value_counts()

income
<=50K    37155
>50K     11687
Name: count, dtype: int64

In [251]:
object_cols = df.select_dtypes(include='object').columns
object_cols

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country', 'income'],
      dtype='object')

In [252]:
# Label encoding the target variable
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})
# df.head()

In [253]:
df['native-country'].nunique() 

42

In [254]:
# dropping native-country as it has too many uniqu values it will increase the dimension of the input data
df.drop('native-country', axis=1, inplace=True)

In [255]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,0


In [256]:
# replacing '?' by nan, for removing '?' values
df.replace('?', np.nan, inplace=True)

In [257]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,1
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,0


In [258]:
object_cols = df.select_dtypes(include='object').columns
object_cols

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender'],
      dtype='object')

In [259]:
# updating nan values with mode of the column to remove missing of '?' values
for col in object_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [260]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,1
4,18,Private,103497,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,30,0


In [261]:
df.shape, object_cols

((48842, 14),
 Index(['workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'gender'],
       dtype='object'))

In [262]:
# one hot encoding the object columns
df = pd.get_dummies(columns=object_cols, data=df, drop_first=True).astype('int64')
df.head()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,marital-status_Separated,marital-status_Widowed,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Male
0,25,226802,7,0,0,40,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1
1,38,89814,9,0,0,50,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,28,336951,12,0,0,40,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1
3,44,160323,10,7688,0,40,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
4,18,103497,10,0,0,30,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0


In [263]:
df.shape

(48842, 58)

In [264]:
# splitting the data into train and test
X = df.drop('income', axis=1)
y = df['income']

In [265]:
X.shape, y.shape # checking X and y shape

((48842, 57), (48842,))

In [266]:
# splitting the data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=73)

In [267]:
# applying standardScaler to the splitted data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [268]:
# converting the data into numpy arrays
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

In [269]:
# Pytorch class model for Adult dataset
class AdultModel(nn.Module):
    def __init__(self, INPUT_DIM, OUTPUT_DIM):
        super(AdultModel, self).__init__()

        self.layer1 = nn.Linear(INPUT_DIM, 64)
        self.layer2 = nn.Linear(64, 16)
        self.layer3 = nn.Linear(16, 8)
        self.layer4 = nn.Linear(8, OUTPUT_DIM)
        self.relu = nn.ReLU()
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        x = self.layer4(x)
        x = self.log_softmax(x)
        return x

In [270]:
# init variables
INPUT_DIM = X_train.shape[1]
OUTPUT_DIM = y_train.nunique()

INPUT_DIM, OUTPUT_DIM

(57, 2)

In [271]:
# model initialisation
model = AdultModel(INPUT_DIM, OUTPUT_DIM)

In [272]:
from torchsummary import summary

summary(model, (INPUT_DIM, )) #model summary

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 64]           3,712
              ReLU-2                   [-1, 64]               0
            Linear-3                   [-1, 16]           1,040
              ReLU-4                   [-1, 16]               0
            Linear-5                    [-1, 8]             136
              ReLU-6                    [-1, 8]               0
            Linear-7                    [-1, 2]              18
        LogSoftmax-8                    [-1, 2]               0
Total params: 4,906
Trainable params: 4,906
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.02
Estimated Total Size (MB): 0.02
----------------------------------------------------------------


In [273]:
# activation function and loss function
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [274]:
EPOCHS = 100
# converting numpy array into tensors
X_train_tensor = torch.tensor(X_train)
y_train_tensor = torch.tensor(y_train.values).long()
# training loop
for epoch in range(EPOCHS):
    optimizer.zero_grad()
    y_pred = model(X_train_tensor)
    l = loss(y_pred, y_train_tensor)
    l.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f'epoch {epoch}: {l.item()}')

epoch 0: 0.8646038770675659
epoch 10: 0.40002068877220154
epoch 20: 0.3593512177467346
epoch 30: 0.3438655138015747
epoch 40: 0.32612425088882446
epoch 50: 0.3158065378665924
epoch 60: 0.3081488609313965
epoch 70: 0.3018065392971039
epoch 80: 0.29756709933280945
epoch 90: 0.294092059135437


In [275]:
# evaluating model
with torch.no_grad():
    y_pred = model(torch.tensor(X_test))
    correct = (torch.argmax(y_pred, dim=1) == torch.tensor(y_test.values)).sum().item()

    print(f'Accuracy: {correct/len(y_test)}')

Accuracy: 0.8530044016787798


In [276]:
# accuracy and f1 score
y_test_tensor = torch.tensor(y_test.values).to(y_pred.device)
accuracy = accuracy_score(y_test_tensor.cpu().numpy(), torch.argmax(y_pred, dim=1).cpu().numpy())
f1 = f1_score(y_test_tensor.cpu().numpy(), torch.argmax(y_pred, dim=1).cpu().numpy())

In [277]:
print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')

Accuracy: 0.8530044016787798
F1 Score: 0.6576061039580353


## Q1) a

#### Ans: 
#### 1. Hyperparameter tuning
#### 2. changing the number of layers
#### 3. changing the number of neurons in each layer
#### 4. changing  the number of epochs
#### 5. Change the optimizer
#### 6. Change the learning rate

In [278]:
# hyperpparameter tuning
# increasing number of epochs
EPOCHS = 80

X_train_tensor = torch.tensor(X_train)
y_train_tensor = torch.tensor(y_train.values).long()
# training loop
for epoch in range(EPOCHS):
    optimizer.zero_grad()
    y_pred = model(X_train_tensor)
    l = loss(y_pred, y_train_tensor)
    l.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f'epoch {epoch}: {l.item()}')

# evaluating model
with torch.no_grad():
    y_pred = model(torch.tensor(X_test))
    correct = (torch.argmax(y_pred, dim=1) == torch.tensor(y_test.values)).sum().item()

    print(f'Accuracy: {correct/len(y_test)}')

epoch 0: 0.29104524850845337
epoch 10: 0.28816211223602295
epoch 20: 0.2854708433151245
epoch 30: 0.28273019194602966
epoch 40: 0.27979546785354614
epoch 50: 0.2767866849899292
epoch 60: 0.27466633915901184
epoch 70: 0.27150431275367737
Accuracy: 0.8452246903470161


##### in the above case we didnt see any improvement because the losses are again propogating resulting in over fitting on dataset. so in ideal case new model has to be created and trained separately in hyper parameter tuning 

In [279]:
# changing learning rate
optimizer = optim.Adam(model.parameters(), lr=0.003)

model1 = AdultModel(INPUT_DIM, OUTPUT_DIM)

X_train_tensor = torch.tensor(X_train)
y_train_tensor = torch.tensor(y_train.values).long()
# training loop
for epoch in range(EPOCHS):
    optimizer.zero_grad()
    y_pred = model(X_train_tensor)
    l = loss(y_pred, y_train_tensor)
    l.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f'epoch {epoch}: {l.item()}')

# evaluating model
with torch.no_grad():
    y_pred = model(torch.tensor(X_test))
    correct = (torch.argmax(y_pred, dim=1) == torch.tensor(y_test.values)).sum().item()

    print(f'Accuracy: {correct/len(y_test)}')

epoch 0: 0.2682904303073883
epoch 10: 0.26808837056159973
epoch 20: 0.26704567670822144
epoch 30: 0.2658202350139618
epoch 40: 0.264806866645813
epoch 50: 0.26370057463645935
epoch 60: 0.2625073790550232
epoch 70: 0.26125916838645935
Accuracy: 0.8447128672330843


##### in the above case we didnt see any improvement because the losses are again propogating resulting in over fitting on dataset. so in ideal case new model has to be created and trained separately in hyper parameter tuning 