## Import necessary libraries

In [1]:
# import lib

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

## Load the dataset

In [2]:
# import dataset
df= pd.read_excel(r"C:\Users\Shweta\Desktop\Python\classification_assignement_OrbitShift.xlsx")

In [3]:
df.head()

Unnamed: 0,title,label
0,"Director, Learning and Support",CLASS1
1,Customer Service Assistant,CLASS2
2,2023 Internship and Education Program Manageme...,CLASS13
3,Communications Technician,CLASS13
4,Emergency Preparedness Officer,CLASS3


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2259 entries, 0 to 2258
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   2259 non-null   object
 1   label   2259 non-null   object
dtypes: object(2)
memory usage: 35.4+ KB


In [6]:
df.shape

(2259, 2)

## EDA

### Cleaning the title column

In [7]:
df.duplicated().sum()

0

In the data, we find values like 2022, 2023, and other pattern which might reduce the quality of trainin model to predict the true class for a title. Hence,, a code to clean the title column is given below:

In [8]:
import re
# Define a function to remove unwanted patterns
def clean_title(title):
    # Remove non-alphanumeric characters except spaces
    cleaned_title = re.sub(r'[^a-zA-Z0-9\s]', '', title)
    return cleaned_title

# Apply the function to the 'title' column
df['title'] = df['title'].apply(clean_title)

In [9]:
df['title'] = df['title'].str.replace(r'2022|2023|20222023', '', regex=True)

In [10]:
df.head()

Unnamed: 0,title,label
0,Director Learning and Support,CLASS1
1,Customer Service Assistant,CLASS2
2,Internship and Education Program Management I...,CLASS13
3,Communications Technician,CLASS13
4,Emergency Preparedness Officer,CLASS3


Number of records per class

In [18]:
df['label'].value_counts()


CLASS13    489
CLASS4     373
CLASS5     300
CLASS6     263
CLASS8     225
CLASS7     168
CLASS10    119
CLASS3      94
CLASS11     81
CLASS1      76
CLASS2      40
CLASS9      28
CLASS12      3
Name: label, dtype: int64

## Handling imbalance through Oversampling method

In [19]:
df['label'] = df['label'].astype('category')

target_count = 300 

oversampled_df = df.copy()

class_counts = df['label'].value_counts()

for label, count in class_counts.items():
    if count < target_count:
        #print(label)
        oversample_count = target_count - count
        current_class_df = df[df['label'] == label]
        #Oversampling
        oversampled_samples = current_class_df[current_class_df['label']== label].sample(oversample_count, replace=True, random_state=42)
        oversampled_df = pd.concat([oversampled_df, oversampled_samples], axis=0, ignore_index=True)
    

# Display balanced dataset
oversampled_df['label'].value_counts()


CLASS13    489
CLASS4     373
CLASS1     300
CLASS10    300
CLASS11    300
CLASS12    300
CLASS2     300
CLASS3     300
CLASS5     300
CLASS6     300
CLASS7     300
CLASS8     300
CLASS9     300
Name: label, dtype: int64

## Construct model

Splitting the dataset into train, validation and test dataset

In [21]:
X_tmp, X_test, y_tmp, y_test = train_test_split(oversampled_df['title'],
                                                oversampled_df['label'],test_size=0.2, random_state=42)

In [22]:
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=42)

### Train and evaluate model using huggingface transformer model

In [25]:
import torch

#tokenizer
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

X_train_tokens = tokenizer(X_train.tolist(), padding=True, truncation=True, return_tensors='pt')
X_val_tokens = tokenizer(X_val.tolist(), padding=True, truncation=True, return_tensors='pt')
X_test_tokens = tokenizer(X_test.tolist(), padding=True, truncation=True, return_tensors='pt')

In [26]:
#embeddings
with torch.no_grad():
    X_train_embeddings = model(**X_train_tokens).last_hidden_state
    X_val_embeddings = model(**X_val_tokens).last_hidden_state
    X_test_embeddings = model(**X_test_tokens).last_hidden_state

# Mean pooling
X_train_mean = X_train_embeddings.mean(dim=1)
X_val_mean = X_val_embeddings.mean(dim=1)
X_test_mean = X_test_embeddings.mean(dim=1)

## Logistic regression

In [27]:
# logistic regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_mean.numpy(), y_train)

val_predictions = classifier.predict(X_val_mean.numpy())

In [28]:
# Evaluate accuracy on the validation set
accuracy_val = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {accuracy_val:.2%}")

Validation Accuracy: 80.18%


In [29]:
# Prediction
test_predictions = classifier.predict(X_test_mean.numpy())

In [30]:
# Evaluate accuracy on the test set
accuracy_test = accuracy_score(y_test, test_predictions)
print(f"Test Accuracy: {accuracy_test:.2%}")

Test Accuracy: 78.99%


### End result

In [32]:
print("Classification Report on Test Set:")
print(classification_report(y_test, test_predictions))

Classification Report on Test Set:
              precision    recall  f1-score   support

      CLASS1       0.87      0.95      0.91        58
     CLASS10       0.77      0.95      0.85        60
     CLASS11       0.85      0.96      0.90        49
     CLASS12       0.98      1.00      0.99        55
     CLASS13       0.55      0.59      0.57        93
      CLASS2       0.93      1.00      0.97        57
      CLASS3       0.83      0.92      0.87        48
      CLASS4       0.68      0.62      0.64        78
      CLASS5       0.73      0.61      0.66        71
      CLASS6       0.67      0.51      0.58        63
      CLASS7       0.79      0.75      0.77        64
      CLASS8       0.75      0.70      0.72        66
      CLASS9       1.00      1.00      1.00        71

    accuracy                           0.79       833
   macro avg       0.80      0.81      0.80       833
weighted avg       0.79      0.79      0.79       833



### Given below is an example to see if the prediction works:

In [33]:
### Create a Pickle file to deploy it using fastapi 
import pickle
pickle_out = open("classifier.pkl","wb")
pickle.dump(classifier, pickle_out)
pickle_out.close()

In [36]:
import pickle

# Save the trained model
with open('classifier.pkl', 'wb') as file:
    pickle.dump(classifier, file)

# Now you can load the model and use it for prediction
with open('classifier.pkl', 'rb') as file:
    loaded_classifier = pickle.load(file)

# Example of using the loaded model for prediction
example_text = ("intern","director")
example_tokens = tokenizer(example_text, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    example_embeddings = model(**example_tokens).last_hidden_state
    example_mean = example_embeddings.mean(dim=1)
    prediction = loaded_classifier.predict(example_mean.numpy())

print("Prediction:", prediction)

Prediction: ['CLASS11' 'CLASS4']
