In [1]:
import pandas as pd

df = pd.read_csv('/content/Twitter_Data.csv')

print("Shape of dataset:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nNull values:\n", df.isnull().sum())
print("\nData types:\n", df.dtypes)

print("\nSample rows:")
print(df.head())


Shape of dataset: (162980, 2)

Columns: ['clean_text', 'category']

Null values:
 clean_text    4
category      7
dtype: int64

Data types:
 clean_text     object
category      float64
dtype: object

Sample rows:
                                          clean_text  category
0  when modi promised “minimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0


In [2]:
df = df.dropna(subset=['clean_text', 'category'])
df['category'] = df['category'].astype(int)
df['category'] = df['category'].map({0: "Neutral", -1: "Negative", 1: "Positive"})


In [3]:
print("Missing values before cleaning:\n", df.isnull().sum())

df = df.dropna()

print("\nMissing values after cleaning:\n", df.isnull().sum())

Missing values before cleaning:
 clean_text    0
category      0
dtype: int64

Missing values after cleaning:
 clean_text    0
category      0
dtype: int64


In [4]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    words = [word for word in text.split() if word not in stop_words]
    return " ".join(words)

df['clean_text'] = df['clean_text'].astype(str).apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
df['text_length'] = df['clean_text'].apply(lambda x: len(x.split()))

In [6]:
X = df['clean_text']
y = df['category']

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [8]:
vocab_size = 5000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X)

X_sequences = tokenizer.texts_to_sequences(X)

In [9]:
max_length = max([len(x) for x in X_sequences])
X_padded = pad_sequences(X_sequences, maxlen=max_length, padding='pre')

In [10]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)

In [12]:
embedding_dim = 64

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()



In [13]:
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, y_test),
    verbose=1
)

Epoch 1/5
[1m2038/2038[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 95ms/step - accuracy: 0.7690 - loss: 0.5776 - val_accuracy: 0.9058 - val_loss: 0.3166
Epoch 2/5
[1m2038/2038[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 98ms/step - accuracy: 0.9083 - loss: 0.3060 - val_accuracy: 0.9057 - val_loss: 0.3104
Epoch 3/5
[1m2038/2038[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 94ms/step - accuracy: 0.9110 - loss: 0.2856 - val_accuracy: 0.9084 - val_loss: 0.3035
Epoch 4/5
[1m2038/2038[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 98ms/step - accuracy: 0.9171 - loss: 0.2605 - val_accuracy: 0.9096 - val_loss: 0.3052
Epoch 5/5
[1m2038/2038[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 97ms/step - accuracy: 0.9196 - loss: 0.2438 - val_accuracy: 0.9046 - val_loss: 0.3185


In [14]:
import numpy as np

y_pred = model.predict(X_test)

y_pred_normalized = (y_pred == y_pred.max(axis=1, keepdims=True)).astype(int)

print("Sample prediction probabilities:\n", y_pred[:5])
print("\nNormalized predictions:\n", y_pred_normalized[:5])

[1m1019/1019[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 22ms/step
Sample prediction probabilities:
 [[2.2945207e-02 8.1843585e-01 1.5861893e-01]
 [3.2476105e-02 4.7943604e-04 9.6704447e-01]
 [8.4607112e-01 4.8940810e-03 1.4903466e-01]
 [8.1387305e-01 2.8125212e-02 1.5800172e-01]
 [2.9817346e-01 4.6708623e-01 2.3474029e-01]]

Normalized predictions:
 [[0 1 0]
 [0 0 1]
 [1 0 0]
 [1 0 0]
 [0 1 0]]


In [15]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_test_labels = np.argmax(y_test, axis=1)
y_pred_labels = np.argmax(y_pred_normalized, axis=1)

accuracy = accuracy_score(y_test_labels, y_pred_labels)
print("Accuracy:", accuracy)

print("\nClassification Report:\n", classification_report(y_test_labels, y_pred_labels, target_names=label_encoder.classes_))

print("\nConfusion Matrix:\n", confusion_matrix(y_test_labels, y_pred_labels))

Accuracy: 0.9046450266920292

Classification Report:
               precision    recall  f1-score   support

    Negative       0.87      0.84      0.85      7152
     Neutral       0.89      0.96      0.92     11067
    Positive       0.93      0.90      0.91     14375

    accuracy                           0.90     32594
   macro avg       0.90      0.90      0.90     32594
weighted avg       0.91      0.90      0.90     32594


Confusion Matrix:
 [[ 5972   485   695]
 [  204 10591   272]
 [  659   793 12923]]
