# Library

In [None]:
# !pip install transformers

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.0 MB/s[0m eta [36m0:00:0

In [36]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
import string
import re
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data Preprocessing

In [37]:
# facing encoding issue while importing dataset so that tried with try catch
encodings = ['utf-8', 'latin1', 'ISO-8859-1']
for encoding in encodings:
    try:
        data = pd.read_csv('/content/drive/MyDrive/Sentiment_classification/hate.csv', encoding=encoding)
        break  # Stop trying encodings if one succeeds
    except UnicodeDecodeError:
        continue  # Try the next encoding if there's an error

In [38]:
data #print dataset

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,N
1,2,gay people are a burden to our society,N
2,3,Arabs are not welcome,N
3,4,I'm not saying we should actually eliminate he...,N
4,5,bananas are for black people,N
...,...,...,...
41139,117100,Indian culture is so backwards,N
41140,118100,"Women don't want equality, they just want to b...",N
41141,119100,fuck covid,P
41142,1205,This computer is garbage,P


In [39]:
data.drop('Unnamed: 0',axis=1,inplace=True) #drop irrelivent columns

In [40]:
data

Unnamed: 0,comment,label
0,dalits are lowlives,N
1,gay people are a burden to our society,N
2,Arabs are not welcome,N
3,I'm not saying we should actually eliminate he...,N
4,bananas are for black people,N
...,...,...
41139,Indian culture is so backwards,N
41140,"Women don't want equality, they just want to b...",N
41141,fuck covid,P
41142,This computer is garbage,P


In [47]:
data.isna().sum() #check if any null values

comment    0
label      0
dtype: int64

In [41]:
data.shape

(41144, 2)

In [42]:
data["label"]=data["label"].apply(lambda x: 0 if x == 'N' else 1) #encode label column with 0 & 1

In [None]:
data

Unnamed: 0,comment,label
0,dalits are lowlives,0
1,gay people are a burden to our society,0
2,Arabs are not welcome,0
3,I'm not saying we should actually eliminate he...,0
4,bananas are for black people,0
...,...,...
41139,Indian culture is so backwards,0
41140,"Women don't want equality, they just want to b...",0
41141,fuck covid,1
41142,This computer is garbage,1


In [None]:
#split features and lables
x=data.comment.values
y=data.label.values

In [None]:
print(x[:10])
print(y[:10])

['dalits are lowlives' 'gay people are a burden to our society'
 'Arabs are not welcome'
 "I'm not saying we should actually eliminate heebs.. I just wish they naturally became extinct"
 'bananas are for black people' 'asian people are the shit'
 'women can not reproduce kids but men can in all cases'
 'i N wearing black in the summer!' 'Who cares what Chinese people think?'
 'i try to punch a letter box when i see one']
[0 0 0 0 0 1 0 1 0 0]


In [None]:
#function to remove punctuations from the sentences
def remove_punctuation(text):
  translator=str.maketrans("","",string.punctuation)
  return text.translate(translator)

In [None]:
#get values without punctuation
x_data=[]
for text in x:
  without_punctuation=remove_punctuation(str(text))
  x_data.append(without_punctuation.lower())


In [None]:
x_data

['dalits are lowlives',
 'gay people are a burden to our society',
 'arabs are not welcome',
 'im not saying we should actually eliminate heebs i just wish they naturally became extinct',
 'bananas are for black people',
 'asian people are the shit',
 'women can not reproduce kids but men can in all cases',
 'i n wearing black in the summer',
 'who cares what chinese people think',
 'i try to punch a letter box when i see one',
 'i feel sorry for the poor little half caste boy down the street hes neither one thing nor the other',
 'this drink sucks its awful',
 'gays are purposeless',
 'the thought of a world free of blacks makes me so happy',
 'its such a rape culture with these immigrants women dont even see it as rape',
 'discrimination against r3t4rdz is not wrong and disgusting its good',
 'politicians are arseholes',
 'get rid of all the hindus',
 'jewish people look violent',
 'this shitty camera is awful in taking pictures',
 'christianity is a massive problem in this country i

In [None]:
#function to remove numbers
def remove_numbers(text):
    text_without_numbers = re.sub(r'\d+', '', text)
    return text_without_numbers

In [None]:
#get values without numbers
x_data_1=[]
for text in x_data:
  without_num=remove_numbers(text)
  x_data_1.append(without_num)


In [None]:
x_data_1

['dalits are lowlives',
 'gay people are a burden to our society',
 'arabs are not welcome',
 'im not saying we should actually eliminate heebs i just wish they naturally became extinct',
 'bananas are for black people',
 'asian people are the shit',
 'women can not reproduce kids but men can in all cases',
 'i n wearing black in the summer',
 'who cares what chinese people think',
 'i try to punch a letter box when i see one',
 'i feel sorry for the poor little half caste boy down the street hes neither one thing nor the other',
 'this drink sucks its awful',
 'gays are purposeless',
 'the thought of a world free of blacks makes me so happy',
 'its such a rape culture with these immigrants women dont even see it as rape',
 'discrimination against rtrdz is not wrong and disgusting its good',
 'politicians are arseholes',
 'get rid of all the hindus',
 'jewish people look violent',
 'this shitty camera is awful in taking pictures',
 'christianity is a massive problem in this country it 

In [None]:
x=x_data_1 #getting data into x variable

In [None]:

x_train,x_test,y_train,y_test=tts(x,y,test_size=.3) #split data into train & test variables

# Model Selection & Data Preparation

In [None]:
#get the pretrained transformer model
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name) #get the tokenizer
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
#function for tokenization
def tokenize_text(texts, max_length=128):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='tf'
        )

       
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

 #concatenates the tokenized input IDs and attention masks along the specified axis 
    input_ids = tf.concat(input_ids, axis=0)
    attention_masks = tf.concat(attention_masks, axis=0)

    return input_ids, attention_masks


In [None]:
#tokenize the train test IDs and attention mask
input_ids_train, attention_masks_train = tokenize_text(x_train)
input_ids_test, attention_masks_test = tokenize_text(x_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices(((input_ids_train, attention_masks_train), y_train)).shuffle(100).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices(((input_ids_test, attention_masks_test), y_test)).batch(16)

In [None]:
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')


In [None]:
# Check for GPU availability
if tf.test.gpu_device_name():
    print("GPU found")
    device_name = tf.test.gpu_device_name()
else:
    print("No GPU found, using CPU")
    device_name = "/cpu:0"

GPU found


# Model building

In [45]:
# build the model
input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32)
attention_masks = tf.keras.layers.Input(shape=(128,), dtype=tf.int32)
with tf.device(device_name):
    bert_output = model([input_ids, attention_masks])[0]
    output = tf.keras.layers.Dense(2, activation='softmax')(bert_output)

classifier = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output)

classifier.compile(optimizer=optimizer, loss=loss, metrics=[metric]) #compile model


In [30]:
#train model
history = classifier.fit(
    train_dataset,
    epochs=3,
    validation_data=test_dataset
)

Epoch 1/3


  output, from_logits = _get_logits(


Epoch 2/3
Epoch 3/3


In [None]:
classifier.save('/content/drive/MyDrive/Sentiment_classification/sentiment_model.h5') #save model

In [46]:
test_loss, test_accuracy = classifier.evaluate(test_dataset)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")


  output, from_logits = _get_logits(


Test Loss: 0.4593393802642822, Test Accuracy: 0.7897359132766724


In [34]:
# Predictions
predictions = classifier.predict(test_dataset)
predicted_labels = tf.argmax(predictions, axis=1)
true_labels = y_test

# Print classification report
report = classification_report(true_labels, predicted_labels, target_names=["negative", "positive"])
print(report)

              precision    recall  f1-score   support

    negative       0.81      0.79      0.80      6576
    positive       0.77      0.79      0.78      5768

    accuracy                           0.79     12344
   macro avg       0.79      0.79      0.79     12344
weighted avg       0.79      0.79      0.79     12344



# Prediction

In [48]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def predict_sentiment(text):
    # Tokenize and prepare the input text
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        pad_to_max_length=True,
        return_tensors="tf"
    )

    # Perform prediction using the loaded model
    predictions = classifier.predict([inputs["input_ids"], inputs["attention_mask"]])

    # Get the predicted label (0 or 1)
    predicted_label = np.argmax(predictions, axis=1)[0]

    # Return the predicted label and the corresponding sentiment
    if predicted_label == 0:
        sentiment = "Negative"
    else:
        sentiment = "Positive"

    return sentiment

In [69]:
predict_sentiment("peoples are not bad always")



'Positive'