In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Import libraries**

In [2]:
import pandas as pd

# **Load the dataset**

In [3]:
data = pd.read_csv("/content/drive/MyDrive/LMS/ETL/combined_data.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   enrollee_id          19158 non-null  int64  
 1   full_name            19158 non-null  object 
 2   city                 19158 non-null  object 
 3   gender               19158 non-null  object 
 4   enrolled_university  19158 non-null  object 
 5   education_level      19158 non-null  object 
 6   major_discipline     19158 non-null  object 
 7   relevent_experience  19158 non-null  object 
 8   experience           19158 non-null  object 
 9   company_size         19158 non-null  object 
 10  company_type         19158 non-null  object 
 11  last_new_job         19158 non-null  object 
 12  training_hours       19158 non-null  int64  
 13  employed             19158 non-null  float64
dtypes: float64(1), int64(2), object(11)
memory usage: 2.0+ MB


In [4]:
number_of_label = data.groupby("employed").size()
number_of_label

Unnamed: 0_level_0,0
employed,Unnamed: 1_level_1
0.0,14381
1.0,4777


In [5]:
numeric_col = data.select_dtypes(include="number").columns.tolist()
numeric_col

['enrollee_id', 'training_hours', 'employed']

# **Machine Learning model**

### **Separate features and target variable**

In [7]:
# Drop non-predictive columns
X = data.drop(columns=['employed', 'enrollee_id', 'full_name', 'city'])
y = data['employed']

## **Identify categorical and numerical columns**

In [8]:
categorical_cols = X.select_dtypes(include="object").columns.tolist()
numeric_cols = X.select_dtypes(include="number").columns.tolist()

In [9]:
categorical_cols

['gender',
 'enrolled_university',
 'education_level',
 'major_discipline',
 'relevent_experience',
 'experience',
 'company_size',
 'company_type',
 'last_new_job']

## **Preprocessing data**

In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### **Categorical Data**

In [11]:
categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

### **Numerical Data**

In [14]:
numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

## **Combine preprocessors in a column transformer**

In [16]:
preprocessor = ColumnTransformer(
    transformers=[
        # Numerical Data
        ('num', numerical_preprocessor, numeric_cols),
        # Categorical Data
        ('cat', categorical_preprocessor, categorical_cols)
    ])

## **Split data into training and test sets**

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Build the model pipeline**

In [18]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

## **Train the model**

In [19]:
model.fit(X_train, y_train)

In [20]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.78      0.95      0.86      2880
         1.0       0.56      0.20      0.29       952

    accuracy                           0.76      3832
   macro avg       0.67      0.57      0.57      3832
weighted avg       0.73      0.76      0.72      3832



# **Deep Learning model**

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

In [22]:
# Preprocessing
X = data.drop(columns=['employed', 'enrollee_id', 'full_name', 'city'])
y = data['employed']

categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipelines
categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_preprocessor, numerical_columns),
        ('cat', categorical_preprocessor, categorical_columns)
    ]
)

X_processed = preprocessor.fit_transform(X)
y_processed = y.values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)  # Add an extra dimension for binary classification
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# PyTorch Dataset
class EmploymentDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = EmploymentDataset(X_train_tensor, y_train_tensor)
test_dataset = EmploymentDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the model
class EmploymentModel(nn.Module):
    def __init__(self, input_size):
        super(EmploymentModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Initialize the model
input_size = X_train.shape[1]
model = EmploymentModel(input_size)

# Loss and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 20
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}")

# Evaluation
model.eval()
y_pred_list = []
with torch.no_grad():
    for X_batch, _ in test_loader:
        y_pred = model(X_batch)
        y_pred_list.append(y_pred)

y_pred_tensor = torch.cat(y_pred_list)
y_pred_classes = (y_pred_tensor > 0.5).float()
accuracy = (y_pred_classes.squeeze() == y_test_tensor).sum().item() / len(y_test_tensor)

print(f"Test Accuracy: {accuracy:.4f}")

Epoch [1/20], Loss: 0.5259
Epoch [2/20], Loss: 0.5062
Epoch [3/20], Loss: 0.5011
Epoch [4/20], Loss: 0.4983
Epoch [5/20], Loss: 0.4940
Epoch [6/20], Loss: 0.4895
Epoch [7/20], Loss: 0.4854
Epoch [8/20], Loss: 0.4815
Epoch [9/20], Loss: 0.4770
Epoch [10/20], Loss: 0.4742
Epoch [11/20], Loss: 0.4708
Epoch [12/20], Loss: 0.4673
Epoch [13/20], Loss: 0.4639
Epoch [14/20], Loss: 0.4610
Epoch [15/20], Loss: 0.4579
Epoch [16/20], Loss: 0.4539
Epoch [17/20], Loss: 0.4508
Epoch [18/20], Loss: 0.4479
Epoch [19/20], Loss: 0.4444
Epoch [20/20], Loss: 0.4402
Test Accuracy: 2487.0543


In [23]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluation mode
model.eval()

# Store predictions and true labels
y_pred_list = []
y_true_list = []

with torch.no_grad():  # Disable gradient calculation for testing
    for X_batch, y_batch in test_loader:
        y_pred = model(X_batch)  # Get model predictions
        y_pred_classes = (y_pred > 0.5).float()  # Convert probabilities to binary labels
        y_pred_list.extend(y_pred_classes.numpy())
        y_true_list.extend(y_batch.numpy())

# Convert lists to arrays for evaluation
y_pred_array = np.array(y_pred_list)
y_true_array = np.array(y_true_list)

# Metrics
print("Accuracy:", accuracy_score(y_true_array, y_pred_array))
print("\nClassification Report:")
print(classification_report(y_true_array, y_pred_array))

Accuracy: 0.7288622129436325

Classification Report:
              precision    recall  f1-score   support

         0.0       0.80      0.85      0.82      2880
         1.0       0.44      0.36      0.40       952

    accuracy                           0.73      3832
   macro avg       0.62      0.61      0.61      3832
weighted avg       0.71      0.73      0.72      3832

