# INSTALL LIBRARIES

In [1]:
!pip install transformers torch tensorflow scikit-learn requests -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h

## IMPORT LIBRARIES

In [2]:
import pandas as pd
import numpy as np
import re
from google.colab import drive
import os

from sklearn.model_selection import train_test_split

# ploting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# warnings
import warnings
warnings.filterwarnings('ignore')

# FETCH DATASET

- As we are working on google colab which has temp file storage i prefer to use google drive to access the dataset

In [3]:
# Mount Drive
drive.mount("/content/drive")

# after mount you will see the drive from there you can get the dataset path
dataset_path = "/content/drive/MyDrive/NLP_PROJECTS/NEWSCLASSIFICATION"

# load the dataset using pandas

train_df= pd.read_csv(os.path.join(dataset_path,'Train_data.csv'))
test_df = pd.read_csv(os.path.join(dataset_path, 'Test_data.csv'))

print(f"Train Dataset : {train_df.shape}")
print(f"Test Dataset : {test_df.shape}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train Dataset : (160682, 7)
Test Dataset : (40171, 6)


# GENERAL DATA PREPROCESSING

In [4]:
test_df.isnull().sum()

REF_NO                  0
headline                0
authors                 0
link                    0
short_description    3915
date                    0
dtype: int64

In [5]:
import re

def clean_text(txt):
    txt = txt.lower()
    # Remove web links (http, https, www, etc.)
    txt = re.sub(r'http\S+|www\S+|https\S+', '', txt, flags=re.MULTILINE)

    # Remove email addresses
    txt = re.sub(r'\S*@\S*\s?', '', txt)

    # Remove special characters (keeping only alphanumeric characters and spaces)
    txt = re.sub(r'[^a-zA-Z0-9\s]', '', txt)

    # Remove numbers
    txt = re.sub(r'\d+', '', txt)

    # Remove extra spaces
    txt = re.sub(r'\s+', ' ', txt).strip()





    return txt

In [6]:
def data_cleaning(df):
  print(f"Before DataSet Null counts \n {df.isnull().sum()}")
  df['short_description']= df['short_description'].fillna(df['headline'])
  df.dropna(inplace=True)
  print(f"After DataSet Null counts \n {df.isnull().sum()}")
  print("Cleaning the Text ")
  df['short_description']=df['short_description'].apply(clean_text)
  print("Cleaning process is completed Successfully  ")
  return df

In [7]:
train_clean_df= data_cleaning(train_df)
# test_clean_df = data_cleaning(test_df)

Before DataSet Null counts 
 REF_NO                   0
headline                 6
authors                  0
link                     0
short_description    15797
date                     0
category                 0
dtype: int64
After DataSet Null counts 
 REF_NO               0
headline             0
authors              0
link                 0
short_description    0
date                 0
category             0
dtype: int64
Cleaning the Text 
Cleaning process is completed Successfully  


In [8]:
train_clean_df.head(3)

Unnamed: 0,REF_NO,headline,authors,link,short_description,date,category
0,123291,"Putin, Fear and Leadership","James A. Cusumano, Ph.D., ContributorOwner and...",https://www.huffingtonpost.com/entry/the-simil...,there are uncomfortable parallels between hitl...,2014-05-08,POLITICS
1,37541,"Barack Obama Failed To Get A New Climate Law, ...",Kate Sheppard,https://www.huffingtonpost.com/entry/barack-ob...,the fate of climate rules now lies with the su...,2017-01-11,POLITICS
2,84614,Senate GOP Leaders Want To Put Ted Cruz In A T...,Laura Barrón-López,https://www.huffingtonpost.com/entry/senate-go...,washington senate republican leaders spent a r...,2015-07-26,POLITICS


In [9]:
train_clean_df['short_description'][6]

'i spent most of my s obsessing about love when will i meet him when will i find true love why isnt he calling me why am i alone where is he who is he am i over him why cant i get over him these were the litany of questions that lived in my mind every day'

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def lemmatize_text(txt):
  words = txt.split()
  lemmatized_words=[lemmatizer.lemmatize(word) for word in words if word not in stop_words]

  return ' '.join(lemmatized_words)


In [12]:
train_clean_df['lemmatize'] = train_clean_df['short_description'].apply(lemmatize_text)

# Tokenization and Padding

In [13]:
from transformers import DistilBertTokenizer


# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_text(txt):
  tokens = tokenizer.encode_plus(
      txt,
      max_length =128,
      truncation  = True,
      padding ='max_length',
      add_special_tokens='True',
      return_tensors = 'tf'


  )
  return tokens['input_ids'], tokens['attention_mask']



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [14]:
df = pd.DataFrame()
df['input_ids'], df['attention_mask'] =  zip(*train_clean_df['lemmatize'].apply(tokenize_text))

# Target Columns Encode

In [15]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

train_clean_df['encoded_category'] = label_encoder.fit_transform(train_clean_df['category'])

# y = train_clean_df['encoded_category']

# labels = tf.constant(train_clean_df['encoded_category'] .values)

In [16]:
num_categories=len(train_clean_df['category'].unique())

In [18]:
import tensorflow as tf

# Convert to tensors
input_ids = tf.constant([tf.squeeze(i, axis=0).numpy() for i in df['input_ids']])
attention_mask = tf.constant([tf.squeeze(i, axis=0).numpy() for i in df['attention_mask']])
labels = tf.constant(train_clean_df['encoded_category'].values)

# Validate tensor shapes
print(f"Input IDs Shape: {input_ids.shape}")
print(f"Attention Mask Shape: {attention_mask.shape}")
print(f"Labels Shape: {labels.shape}")


Input IDs Shape: (160676, 128)
Attention Mask Shape: (160676, 128)
Labels Shape: (160676,)


# Create Validation and train val split

In [19]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

X_train_ids, X_val_ids, y_train, y_val, mask_train, mask_val = train_test_split(
    input_ids.numpy(), labels.numpy(), attention_mask.numpy(), test_size=0.2, random_state=42
)

# Convert back to tensors after splitting
X_train_ids = tf.constant(X_train_ids)
X_val_ids = tf.constant(X_val_ids)
mask_train = tf.constant(mask_train)
mask_val = tf.constant(mask_val)
y_train = tf.constant(y_train)
y_val = tf.constant(y_val)


# Create TensorFlow Dataset

In [20]:
# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': X_train_ids, 'attention_mask': mask_train}, y_train
)).batch(16)

val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': X_val_ids, 'attention_mask': mask_val}, y_val
)).batch(16)


Model Compile and Train

In [21]:
len(label_encoder.classes_)

41

In [22]:
from transformers import  TFDistilBertForSequenceClassification
# Load the model
num_categories = len(label_encoder.classes_)
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_categories)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['Accuracy'])


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [23]:

history = model.fit(train_dataset, validation_data=val_dataset, epochs=3)

results = model.evaluate(val_dataset)
print(f"Validation loss : {results[0]}, Validation Accuracy {results[1]}")

Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3
Epoch 3/3
Validation loss : 1.9385862350463867, Validation Accuracy 0.5036407709121704


# Predictions

In [41]:
def predict(txt):
  encoding = tokenizer.encode_plus(
      txt,
      max_length=128,
      truncation=True,
      padding='max_length',
      add_special_tokens=True,
      return_tensors = 'tf'
  )

  input_ids = encoding['input_ids']
  attention_mask=encoding['attention_mask']

  prediction = model.predict([input_ids, attention_mask])
  predicted_class_id = tf.argmax(prediction.logits,axis=1).numpy()[0]
  predicted_class_label = label_encoder.inverse_transform([predicted_class_id])[0]
  return predicted_class_label




In [43]:
  # Example of prediction
  sample_text="The latest blockbuster film has received rave reviews and is on track to become the highest-grossing movie of the year"
  predicted_category = predict(sample_text)
  print(predicted_category)

ENTERTAINMENT


# Save The Model and Tokenizer

In [46]:
model.save_pretrained('/content/drive/MyDrive/NLP_PROJECTS/NEWSCLASSIFICATION/model')

In [47]:
tokenizer.save_pretrained('/content/drive/MyDrive/NLP_PROJECTS/NEWSCLASSIFICATION/tokenizer')

('/content/drive/MyDrive/NLP_PROJECTS/NEWSCLASSIFICATION/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/NLP_PROJECTS/NEWSCLASSIFICATION/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/NLP_PROJECTS/NEWSCLASSIFICATION/tokenizer/vocab.txt',
 '/content/drive/MyDrive/NLP_PROJECTS/NEWSCLASSIFICATION/tokenizer/added_tokens.json')

# Model Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt

