In [1]:
import torch

torch.cuda.is_available()
torch.cuda.device_count()


1

In [2]:
torch.cuda.get_device_name()

'NVIDIA GeForce RTX 3050 Laptop GPU'

In [3]:
torch.cuda.current_device()

0

In [4]:
# Import required libraries
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, GRU, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
import re

In [5]:
# Download NLTK data
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Load dataset (using sample Twitter dataset for demonstration)
url = "https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv"
data = pd.read_csv(url)

In [7]:
print(data.head())

   id  label                                              tweet
0   1      0   @user when a father is dysfunctional and is s...
1   2      0  @user @user thanks for #lyft credit i can't us...
2   3      0                                bihday your majesty
3   4      0  #model   i love u take with u all the time in ...
4   5      0             factsguide: society now    #motivation


In [8]:
# Preprocessing function
def preprocess_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = text.split()
    text = [word for word in text if word not in stopwords.words('english')]  # Remove stopwords
    return " ".join(text)


In [9]:
# Apply preprocessing
data['cleaned_text'] = data['tweet'].apply(preprocess_text)

In [10]:
# Encode sentiment labels
data['sentiment'] = data['label'].map({0: "negative", 1: "positive"})
data['label'] = data['label']  # Already encoded as 0 and 1

In [11]:
data.head()

Unnamed: 0,id,label,tweet,cleaned_text,sentiment
0,1,0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids d...,negative
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit cant use cause do...,negative
2,3,0,bihday your majesty,bihday majesty,negative
3,4,0,#model i love u take with u all the time in ...,model love u take u time ur,negative
4,5,0,factsguide: society now #motivation,factsguide society motivation,negative


In [12]:
# Split dataset
X = data['cleaned_text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Tokenizer and model for DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [14]:
# Tokenize text data
def tokenize_sentences(texts, tokenizer, max_len=128):
    tokens = tokenizer(
        list(texts),
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors="tf"
    )
    return tokens

train_tokens = tokenize_sentences(X_train, tokenizer)
test_tokens = tokenize_sentences(X_test, tokenizer)


In [15]:
# Define the hybrid model (DistilBERT + GRU)
def build_model(max_len=128):
    # DistilBERT Input
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    # DistilBERT Embeddings
    bert_output = bert_model(input_ids, attention_mask=attention_mask)
    sequence_output = bert_output.last_hidden_state

    # GRU Layer
    gru_output = GRU(128, return_sequences=False)(sequence_output)
    dropout = Dropout(0.3)(gru_output)

    # Dense Output Layer
    output = Dense(1, activation='sigmoid')(dropout)

    # Model
    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


In [16]:
# Build and compile the model
max_len = 128
model = build_model(max_len)
model.summary()

# Train the model
history = model.fit(
    [train_tokens['input_ids'], train_tokens['attention_mask']],
    y_train,
    validation_split=0.2,
    epochs=1,
    batch_size=16
)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_distil_bert_model (TFDi  TFBaseModelOutput(last_hid   6636288   ['input_ids[0][0]',           
 stilBertModel)              den_state=(None, 128, 768)   0          'attention_mask[0][0]']      
                             , hidden_states=None, atte                                       

In [17]:
# Evaluate the model
y_pred = model.predict([test_tokens['input_ids'], test_tokens['attention_mask']])
y_pred = (y_pred > 0.5).astype(int)




In [18]:
# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5937
           1       0.76      0.58      0.66       456

    accuracy                           0.96      6393
   macro avg       0.87      0.78      0.82      6393
weighted avg       0.95      0.96      0.95      6393



In [22]:
import re
from textblob import TextBlob

In [28]:
# Function to analyze sentence-level sentiment
def analyze_sentence_sentiment(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    sentiment = "positive" if polarity > 0 else "negative" if polarity < 0 else "neutral"
    return sentiment, polarity

In [30]:
# Example sentence-level sentiment analysis
cleaned_text = X_test.iloc[0]
sentence_sentiment, polarity = analyze_sentence_sentiment(cleaned_text)
word_sentiments = analyze_word_sentiments(cleaned_text)

In [33]:
# Display results
print(f"\ncleaned Text: {cleaned_text}")
print("Word-Level Sentiments:")
print(word_sentiments)
print(f"Overall Sentence Sentiment: {sentence_sentiment} ")


cleaned Text: user mom says smile captivating says happy sunday pugsley luigi smile sunday pug pugs
Word-Level Sentiments:
{'user': 'neutral', 'mom': 'neutral', 'says': 'neutral', 'smile': 'positive', 'captivating': 'positive', 'happy': 'positive', 'sunday': 'neutral', 'pugsley': 'neutral', 'luigi': 'neutral', 'pug': 'neutral', 'pugs': 'neutral'}
Overall Sentence Sentiment: positive 
