# Step 1: Install and Import Libraries

## a) Install Required Libraries

In [None]:
!pip install pandas numpy scikit-learn nltk textstat vaderSentiment better-profanity

## b) Import Libraries

In [2]:
# Data manipulation
import pandas as pd
import numpy as np

# Machine learning
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier  # Example classifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Text analysis libraries
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import textstat
from better_profanity import profanity

# Other utilities
import warnings
warnings.filterwarnings("ignore")  # To suppress warnings (optional)

# Step 2: Data Preparation
In this step, we'll:

1. Load the dataset.
2. Check for missing values or duplicates.
3. Encode the target column (is_kids_safe) if necessary.
4. Split the data into features (X) and target (y).

## a) Load the Dataset


In [3]:
# Load the dataset
df = pd.read_excel('../data/raw/final_dataset_classification.xlsx')

# Display the first few rows
df.head(3)

Unnamed: 0,video_id,title,description,published_at,channel_title,tags,category_id,Category Label,default_audio_language,transcript,...,dislike_count,text_dict,combined_text,kids_safe_nltk,profanity,themes,readability,tone,cultural_sensitivity,is_kids_safe
0,R7KfBjoLGOY,Every Swear Word RANKED...,Every Swear Word RANKED... today we rank every...,2024-06-15T20:34:45Z,The Duck,,1,Film & Animation,,swear words a very essential part of the vocab...,...,,"{'title': 'Every Swear Word RANKED...', 'descr...",Every Swear Word RANKED... Every Swear Word RA...,0,True,True,False,True,False,False
1,GkQmkZNLCeM,Best Toy Learning Video for Toddlers and Kids ...,Best Toy Learning Video for Toddlers and Kids ...,2021-07-08T18:29:49Z,Genevieve's Playhouse - Learning Videos for Kids,"best, learning video for toddlers, for kids, c...",1,Film & Animation,en,Transcript not available,...,,{'title': 'Best Toy Learning Video for Toddler...,Best Toy Learning Video for Toddlers and Kids ...,1,False,False,False,True,False,True
2,_et1i-ykdxA,Best ABC Learning Toy Video for Toddlers! Paw ...,Best ABC Learning Toy Video for Toddlers! Paw ...,2022-07-14T16:45:05Z,Genevieve's Playhouse - Learning Videos for Kids,"abc, for toddlers, for baby, learning, educati...",1,Film & Animation,en,Transcript not available,...,,{'title': 'Best ABC Learning Toy Video for Tod...,Best ABC Learning Toy Video for Toddlers! Paw ...,1,False,False,False,True,False,True


## b) Check for Missing Values and Duplicates

In [4]:
# Check for missing values
print("Missing Values:\n", df.isnull().sum())

# Check for duplicates
print("Number of duplicates:", df.duplicated().sum())

Missing Values:
 video_id                     0
title                        0
description                483
published_at                 0
channel_title                0
tags                      1026
category_id                  0
Category Label             397
default_audio_language    1301
transcript                 584
duration                  2038
view_count                2038
like_count                2038
dislike_count             2038
text_dict                    0
combined_text                0
kids_safe_nltk               0
profanity                    0
themes                       0
readability                  0
tone                         0
cultural_sensitivity         0
is_kids_safe                 0
dtype: int64
Number of duplicates: 461


In [5]:
# Remove duplicates
df = df.drop_duplicates()

# Confirm duplicates are removed
print("Number of duplicates after cleaning:", df.duplicated().sum())

Number of duplicates after cleaning: 0


## Encode the Target Column (if not binary integers)

In [6]:
df

Unnamed: 0,video_id,title,description,published_at,channel_title,tags,category_id,Category Label,default_audio_language,transcript,...,dislike_count,text_dict,combined_text,kids_safe_nltk,profanity,themes,readability,tone,cultural_sensitivity,is_kids_safe
0,R7KfBjoLGOY,Every Swear Word RANKED...,Every Swear Word RANKED... today we rank every...,2024-06-15T20:34:45Z,The Duck,,1,Film & Animation,,swear words a very essential part of the vocab...,...,,"{'title': 'Every Swear Word RANKED...', 'descr...",Every Swear Word RANKED... Every Swear Word RA...,0,True,True,False,True,False,False
1,GkQmkZNLCeM,Best Toy Learning Video for Toddlers and Kids ...,Best Toy Learning Video for Toddlers and Kids ...,2021-07-08T18:29:49Z,Genevieve's Playhouse - Learning Videos for Kids,"best, learning video for toddlers, for kids, c...",1,Film & Animation,en,Transcript not available,...,,{'title': 'Best Toy Learning Video for Toddler...,Best Toy Learning Video for Toddlers and Kids ...,1,False,False,False,True,False,True
2,_et1i-ykdxA,Best ABC Learning Toy Video for Toddlers! Paw ...,Best ABC Learning Toy Video for Toddlers! Paw ...,2022-07-14T16:45:05Z,Genevieve's Playhouse - Learning Videos for Kids,"abc, for toddlers, for baby, learning, educati...",1,Film & Animation,en,Transcript not available,...,,{'title': 'Best ABC Learning Toy Video for Tod...,Best ABC Learning Toy Video for Toddlers! Paw ...,1,False,False,False,True,False,True
3,_UNe6tz28OI,Best Toy Learning Video for Toddlers and Kids ...,Best Toy Learning Video for Toddlers and Kids ...,2021-03-23T01:43:53Z,Genevieve's Playhouse - Learning Videos for Kids,"best, toddler learning videos, for kids, for b...",1,Film & Animation,en,Transcript not available,...,,{'title': 'Best Toy Learning Video for Toddler...,Best Toy Learning Video for Toddlers and Kids ...,1,False,False,False,True,False,True
4,zamt_1PAwqE,The Joy of Sharing Short Episode | Fun Learni...,"Welcome to ""The Learning Show"", the exciting a...",2023-05-19T14:42:38Z,Videogyan Shows - Educational Videos For Kids,"shorts, learning for kids, educational videos,...",1,Film & Animation,en,Transcript not available,...,,{'title': 'The Joy of Sharing Short Episode | ...,The Joy of Sharing Short Episode | Fun Learni...,0,False,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3419,8_nv0q5ztYE,Pitch Perfect - Trailer,"Arriving at her new college, Beca (Anna Kendri...",2013-10-11T05:10:52Z,Universal Pictures All-Access,"Anna Kendrick, Brittany Snow, Anna Camp, Rebel...",44,,,hi there welcome to Bon University here's your...,...,0.0,"{'title': 'Pitch Perfect - Trailer', 'descript...",Pitch Perfect - Trailer Arriving at her new co...,0,True,True,False,True,False,True
3420,irhVziMbSZk,Moneyball (2011) - Trailer,Real-life story is based on Oakland A's genera...,2014-07-11T03:41:39Z,Sony Pictures at Home UK,"Philip Seymour Hoffman, Brad Pitt, Robin Wrigh...",44,,,,...,0.0,"{'title': 'Moneyball (2011) - Trailer', 'descr...",Moneyball (2011) - Trailer Real-life story is ...,1,False,False,False,True,False,False
3421,dMaq_pfxs-0,Hitch (2005) - Trailer,"New York ""date doctor"" Alex Hitchens (Will Smi...",2014-02-01T23:07:44Z,Sony Pictures Home Entertainment,"Kevin James, Eva Mendes, Michael Rapaport, Ada...",44,,en,,...,0.0,"{'title': 'Hitch (2005) - Trailer', 'descripti...","Hitch (2005) - Trailer New York ""date doctor"" ...",1,False,False,True,True,False,False
3422,Ecn8dvDK7LQ,Lemonade Mouth - Trailer,Be heard. Be strong. Be proud. Its time to tur...,2012-07-11T16:50:42Z,DisneyMoviesOnDemand,"Lemonade Mouth, Bridgit Mendler, Adam Hicks, H...",44,,,,...,0.0,"{'title': 'Lemonade Mouth - Trailer', 'descrip...",Lemonade Mouth - Trailer Be heard. Be strong. ...,1,False,False,False,True,False,True


In [7]:
# Ensure target column is binary
df['kids_safe_nltk'] = df['kids_safe_nltk'].astype(int)

In [8]:
df_equal = df.copy()

## Split the Data into Features (X) and Target (y)
Separate the combined_text column as the feature and is_kids_safe as the target variable.

In [9]:
df.columns

Index(['video_id', 'title', 'description', 'published_at', 'channel_title',
       'tags', 'category_id', 'Category Label', 'default_audio_language',
       'transcript', 'duration', 'view_count', 'like_count', 'dislike_count',
       'text_dict', 'combined_text', 'kids_safe_nltk', 'profanity', 'themes',
       'readability', 'tone', 'cultural_sensitivity', 'is_kids_safe'],
      dtype='object')

In [10]:
# Features (X) and target (y)
X = df['combined_text']  # Feature column
y = df['kids_safe_nltk']   # Target column

## used for increasing BERT model accuracy (skip otherwise)

In [11]:
# Features (X) and target (y)
X = df_equal['combined_text']  # Feature column
y = df_equal['kids_safe_nltk']   # Target column

# Step 3: Text Preprocessing
Since X contains text data, we need to preprocess it before feeding it into a machine learning model. This involves:

1. Lowercasing: Converting text to lowercase for consistency.
2. Removing Special Characters: Eliminating punctuation, numbers, and special characters.
3. Tokenization: Splitting text into individual words.
4. Removing Stop Words: Removing common words like "the", "is", etc., that don’t add much meaning.
5. Stemming/Lemmatization: Reducing words to their root form (e.g., "running" → "run").

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words and lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # Rejoin tokens into a single string
    return ' '.join(tokens)

# Apply preprocessing to all text in X
X = X.apply(preprocess_text)

# Check a few samples after preprocessing
print(X.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shusritavenugopal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shusritavenugopal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shusritavenugopal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    every swear word ranked every swear word ranke...
1    best toy learning video toddler kid learn colo...
2    best abc learning toy video toddler paw patrol...
3    best toy learning video toddler kid learn colo...
4    joy sharing short episode fun learning video c...
Name: combined_text, dtype: object


# Step 4: Train-Test Split and Vectorization
## 4.1 Train-Test Split
1. Training Data: Used to train the model.
2. Testing Data: Used to evaluate the model's performance.

## 4.2 Text Vectorization
Since machine learning models work with numerical data, we’ll convert the text into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency). This method represents the importance of words in a document relative to the entire dataset.

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Fit the vectorizer on the training data and transform both train and test data
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Check the shape of the transformed data
print("Train Data Shape:", X_train_tfidf.shape)
print("Test Data Shape:", X_test_tfidf.shape)

Train Data Shape: (2371, 5000)
Test Data Shape: (593, 5000)


The text data has been successfully converted into numerical form using TF-IDF vectorization. 

1. Train Data Shape: (2371, 5000)
- 2371 refers to the number of training samples (rows).
- 5000 refers to the number of features (columns), which corresponds to the top 5000 terms (words or n-grams) that the TF-IDF vectorizer has identified as most significant in your dataset.
2. Test Data Shape: (593, 5000)
- 593 refers to the number of test samples (rows).
- 5000 remains the same, as the test data is transformed using the same set of features (top 5000 terms) that the model learned from the training data.

max_features = 5000

The train-test split worked as expected, with 80% of the data (2371 samples) used for training and 20% (593 samples) for testing.




# Step 4: Model Training

In this step, we will train a classifier using the features extracted by the TF-IDF vectorizer. A good starting point for text classification tasks is the Logistic Regression model, as it's simple and effective for binary classification problems.

In [14]:
# Importing Logistic Regression from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Train the model using the training data
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model's performance
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy Score: 0.7639123102866779

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       398
           1       0.72      0.47      0.57       195

    accuracy                           0.76       593
   macro avg       0.75      0.69      0.70       593
weighted avg       0.76      0.76      0.75       593



#### Accuracy Score: 0.76 (or 76%) indicates that the model is correctly classifying 76% of the test instances.

#### Precision:
- For class 0 (kids safe): Precision of 0.78 means that 78% of instances predicted as kids-safe are actually kids-safe.
- For class 1 (not safe): Precision of 0.72 means that 72% of instances predicted as not safe are indeed not safe.

#### Recall:
- For class 0 (kids safe): Recall of 0.91 means that 91% of all actual kids-safe instances were correctly identified by the model.
- For class 1 (not safe): Recall of 0.47 means that only 47% of the actual not-safe instances were correctly identified.
#### F1-Score:
- For class 0: The F1-score is 0.84, which is a balance between precision and recall.
- For class 1: The F1-score is 0.57, indicating that the model struggles more with predicting the "not safe" class accurately.

## Simple logistic regression model is performing well in identifying "safe for kids" content. But it struggles more with identifying not-safe content (lower recall for class 1). So, choosing to try other models like Random Forest, SVM, or XGBoost to see if they perform better.

# SVM

## Step 1: Importing the SVM model and other required libraries

In [15]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

## Step 2: Initialize and train the SVM model

In [16]:
# Initialize the SVM classifier
svm_model = SVC(kernel='linear', random_state=42)

# Train the model using the transformed features
svm_model.fit(X_train_tfidf, y_train)

## Step 3: Make predictions and evaluate the model

In [17]:
# Make predictions on the test set
y_pred_svm = svm_model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy Score:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))

Accuracy Score: 0.7993254637436762
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.87      0.85       398
           1       0.71      0.66      0.68       195

    accuracy                           0.80       593
   macro avg       0.77      0.76      0.77       593
weighted avg       0.80      0.80      0.80       593



#### The model achieved an accuracy of 80%, which is a solid result. 
- Class 0 (not kid-safe): The model performed well in this class, with a precision of 0.84, recall of 0.87, and an F1-score of 0.85. This indicates that the model is correctly identifying most of the instances in this class.
- Class 1 (kid-safe): The performance here is slightly lower, with a precision of 0.71, recall of 0.66, and F1-score of 0.68. The model is still performing decently, but there’s room for improvement in identifying kid-safe content.


Hyperparameter tuning can help improve the performance of your SVM model, especially when adjusting the C parameter, which controls the trade-off between a smooth decision boundary and classifying training points correctly.

#### Step 1: Define a parameter grid: test different values for the C parameter, which determines the regularization strength.

#### Step 2: GridSearchCV will exhaustively search through all combinations of parameters and return the best one based on cross-validation performance.



In [18]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for the SVM model
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'kernel': ['linear', 'rbf'],  # Different kernel types
    'gamma': ['scale', 'auto'],  # Kernel coefficient
}

# Initialize GridSearchCV with SVM model and parameter grid
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit GridSearchCV to the training data
grid_search.fit(X_train_tfidf, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and the best score
print(f"Best Hyperparameters: {best_params}")
print(f"Best Cross-Validation Score: {best_score}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   1.5s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   1.5s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   1.5s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   1.5s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   1.6s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   1.8s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   1.8s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   1.8s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   1.5s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   1.5s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   1.7s
[CV] END ...................C=0.1, gamma=auto, k

## Step 3: Evaluate the best model


In [19]:
# Train the best model on the entire training data
best_model = grid_search.best_estimator_

# Evaluate the model on the test data
y_pred = best_model.predict(X_test_tfidf)

# Calculate the accuracy and print the classification report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy Score: 0.806070826306914
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       398
           1       0.73      0.65      0.69       195

    accuracy                           0.81       593
   macro avg       0.78      0.77      0.77       593
weighted avg       0.80      0.81      0.80       593



#### Class 0 (not kids safe):
- Precision: 84% — The model is fairly good at identifying content that is "not kids safe."
- Recall: 88% — It correctly identifies most "not kids safe" content, but some content might still be misclassified.

#### Class 1 (kids safe):
- Precision: 73% — The model is moderately good at identifying content that is "kids safe."
- Recall: 65% — There’s a need for improvement in identifying all "kids safe" content without missing too many.


## Implementing a BERT model for the classification task

## Step 1: Install Hugging Face Transformers and Torch

In [20]:
!pip install transformers
!pip install torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Step 2: Import Necessary Libraries

In [21]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

## Step 3: Tokenization
Since BERT requires its input to be tokenized, we'll use the BERT tokenizer to convert your text data into the format expected by the BERT model.

In [22]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['combined_text'], padding='max_length', truncation=True)

# Tokenize the train and test data
X_train_tfidf = df['combined_text'].tolist()
y_train_tfidf = df['kids_safe_nltk'].tolist()

# Splitting data for training and validation
X_train, X_val, y_train, y_val = train_test_split(X_train_tfidf, y_train_tfidf, test_size=0.2)

train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=128)

## Step 4: Convert to PyTorch Dataset

In [23]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train)
val_dataset = TextDataset(val_encodings, y_val)

## Step 5: Model Setup

In [24]:
# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step 6: Trainer Setup
We'll use Hugging Face's Trainer API to train the model. We'll define the training arguments (like learning rate, batch size, number of epochs, etc.) and set up the Trainer with our dataset.

In [25]:
!pip install --upgrade "accelerate>=0.26.0"
!pip install --upgrade "transformers[torch]"




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [26]:
import accelerate
import transformers

print(accelerate.__version__)
print(transformers.__version__)

1.2.0
4.47.0


## Baseline

In [None]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=4,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

## This configurations gave 83% accuracy. 

## hyperparameters tuning:

In [None]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=4,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.05,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

## This configurations gave 83% accuracy. 

In [None]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

## This configurations yielded 82% accuracy. 

In [28]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=6,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

## This configurations gave 80% accuracy. 


## Defining trainer

In [29]:
trainer = Trainer(
    model=model,                         # the instantiated hugging face transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=lambda p: {
        'accuracy': accuracy_score(p.predictions.argmax(axis=1), p.label_ids)
    },
)

## Step 7: Train the Model
Now we can start training the model. This will take a while depending on the dataset size and the system you're using.

In [30]:
trainer.train()

  0%|          | 0/894 [00:00<?, ?it/s]

{'loss': 0.6936, 'grad_norm': 3.143017053604126, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.07}
{'loss': 0.6891, 'grad_norm': 4.33994197845459, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.13}
{'loss': 0.6756, 'grad_norm': 3.7533223628997803, 'learning_rate': 3e-06, 'epoch': 0.2}
{'loss': 0.6388, 'grad_norm': 3.5352041721343994, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.27}
{'loss': 0.6648, 'grad_norm': 3.93685245513916, 'learning_rate': 5e-06, 'epoch': 0.34}
{'loss': 0.6067, 'grad_norm': 2.4190571308135986, 'learning_rate': 6e-06, 'epoch': 0.4}
{'loss': 0.5951, 'grad_norm': 3.4141294956207275, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.47}
{'loss': 0.5977, 'grad_norm': 3.3383634090423584, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.54}
{'loss': 0.5453, 'grad_norm': 4.20260763168335, 'learning_rate': 9e-06, 'epoch': 0.6}
{'loss': 0.5088, 'grad_norm': 6.475742816925049, 'learning_rate': 1e-05, 'epoch': 0.67}
{'loss': 0.6128, 'grad_norm': 8.6842832

TrainOutput(global_step=894, training_loss=0.30296231862845135, metrics={'train_runtime': 835.323, 'train_samples_per_second': 17.031, 'train_steps_per_second': 1.07, 'total_flos': 935754468387840.0, 'train_loss': 0.30296231862845135, 'epoch': 6.0})

## Step 8: Evaluation

In [32]:
results = trainer.evaluate()

print(f"Evaluation Results: {results}")

  0%|          | 0/10 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.949877917766571, 'eval_accuracy': 0.8347386172006745, 'eval_runtime': 11.8746, 'eval_samples_per_second': 49.938, 'eval_steps_per_second': 0.842, 'epoch': 6.0}


## Step 9: Predictions

In [33]:
# Let's predict on the validation dataset first
predictions = trainer.predict(val_dataset)
predicted_labels = predictions.predictions.argmax(axis=1)

# Print the classification report
print(classification_report(y_val, predicted_labels))

  0%|          | 0/10 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.86      0.90      0.88       402
           1       0.77      0.69      0.73       191

    accuracy                           0.83       593
   macro avg       0.82      0.80      0.81       593
weighted avg       0.83      0.83      0.83       593



## Overall Accuracy:
The model achieved an accuracy of 83%, which is respectable for a BERT model.

In [None]:
import os
from datetime import datetime

# Generate a unique model name
model_name = f"model_{datetime.now().strftime('%Y%m%d_%H%M%S')}_v839069"

# Save the model with the generated name
save_path = os.path.join("../saved_BERT_pretrained_models", model_name)
trainer.save_model(save_path)

print(f"Model saved at: {save_path}")

Model saved at: ../saved_BERT_pretrained_models/model_20241211_113756_v818574


In [34]:
import os
from datetime import datetime

# Specify the folder to save the model and report
model_folder = "../saved_BERT_pretrained_models"
# Generate a unique model name
model_name = f"model_{datetime.now().strftime('%Y%m%d_%H%M%S')}_v839069"

# Save the model with the generated name
save_path = os.path.join(model_folder, model_name)
trainer.save_model(save_path)
# Save the tokenizer
tokenizer.save_pretrained(save_path)

print(f"Model saved at: {save_path}")
# Generate and save the classification report
report = classification_report(y_val, predicted_labels)
report_path = os.path.join(model_folder, "classification_report.txt")

with open(report_path, "w") as file:
    file.write(report)

print(f"Model and classification report saved in '{model_folder}'")


Model saved at: ../saved_BERT_pretrained_models/model_20241211_133348_v839069
Model and classification report saved in '../saved_BERT_pretrained_models'
