In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

In [None]:
# Load the training CSV from /content and display the resulting frame.
# NOTE(review): read_csv treats the first row as the header by default, and a later
# cell overwrites all column names. If this file has no header row, the first record
# is being lost and header=None should be passed — confirm against the raw file.
train = pd.read_csv('/content/twitter_training.csv')
train

In [None]:
# Load the validation CSV from /content and display the resulting frame.
# NOTE(review): read_csv treats the first row as the header by default, and a later
# cell overwrites all column names. If this file has no header row, the first record
# is being lost and header=None should be passed — confirm against the raw file.
test = pd.read_csv('/content/twitter_validation.csv')
test

In [None]:
# Give both frames the same four column names, replacing whatever header read_csv inferred.
test.columns = ['Header1', 'company','labels','text']
train.columns = ['Header1', 'company','labels','text']

In [None]:
# Plain-text dump of both frames, separated by a dashed rule.
print(
    train,
    "---------------------------------------------------------------------",
    test,
    sep="\n",
)

In [None]:
# Only the label and text columns are used from here on; drop the other two from each frame in place.
for frame in (train, test):
    frame.drop(columns=["Header1","company"], inplace=True)

In [None]:
# Preview the first rows of each frame after the column drop (shown as a tuple of two frames).
train.head(),test.head()

In [None]:
# Stack the training and validation rows into one frame with a fresh 0..n-1 index;
# the train/test split used for modelling is created later with train_test_split.
sentiment=pd.concat([train,test],ignore_index=True)
sentiment

In [None]:
# Column dtypes and non-null counts of the combined frame.
sentiment.info()

In [None]:
# Number of missing values per column.
sentiment.isnull().sum()

In [None]:
# Remove every row that has a missing value, then re-check the per-column null counts.
sentiment.dropna(inplace=True)
sentiment.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
sentiment.duplicated().sum()

In [None]:
# Drop the repeated rows in place (the first occurrence is kept), then re-check the duplicate count.
sentiment.drop_duplicates(inplace=True)
sentiment.duplicated().sum()

In [None]:
# Display the frame after null and duplicate removal.
sentiment

In [None]:
# Boolean mask marking rows whose text length is not zero.
# The mask is only displayed here; it is not assigned or used to filter the frame.
sentiment['text'].str.len()!=0

In [None]:
# Force the text column to str and lowercase it in a single chained assignment, then show it.
sentiment['text'] = sentiment['text'].astype(str).str.lower()
sentiment['text']

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
# Download the WordNet and OMW 1.4 corpora into an explicit directory instead of NLTK's default.
# NOTE(review): the CSVs in this notebook are read from /content while this cell writes to
# /kaggle/working — confirm which environment the notebook is meant to run in.
nltk.download('wordnet', "/kaggle/working/nltk_data/")
nltk.download('omw-1.4', "/kaggle/working/nltk_data/")
# Extract the downloaded archives next to the zip files using the shell's unzip.
! unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora
! unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora
# Add the custom directory to NLTK's resource search path.
nltk.data.path.append("/kaggle/working/nltk_data/")

In [None]:
# Lazily-built (lemmatizer, stop-word set) pair shared by every process_text call.
_TEXT_RESOURCES = None


def _get_text_resources():
    """Create the lemmatizer and English stop-word set on first use and reuse them afterwards."""
    global _TEXT_RESOURCES
    if _TEXT_RESOURCES is None:
        # Built lazily (not at definition time) because the NLTK corpora may be
        # downloaded in a cell that runs after this function is defined.
        _TEXT_RESOURCES = (WordNetLemmatizer(), frozenset(stopwords.words("english")))
    return _TEXT_RESOURCES


def process_text(text):
    """Normalise one piece of text into a list of distinct, cleaned tokens.

    Steps: collapse whitespace, replace non-word characters with spaces, drop
    isolated single letters, strip everything that is not an ASCII letter or
    whitespace, lowercase, tokenize, lemmatize, remove English stop words,
    keep only tokens longer than three characters, and drop repeated tokens.

    Args:
        text: The raw text. Any value is accepted; it is converted with ``str``
            before any regex is applied.

    Returns:
        list[str]: The surviving tokens in order of first appearance, each
        appearing once. Empty if nothing survives the filters.
    """
    # Coerce first so a non-string value (e.g. NaN) cannot crash the first regex call.
    text = str(text)

    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    text = re.sub(r'\W', ' ', text)  # Replace special characters with spaces
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove isolated single letters
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove anything that isn't alphabetical
    text = text.lower()

    lemmatizer, stop_words = _get_text_resources()

    # Lemmatize before filtering, matching the original order of operations.
    lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
    kept = [token for token in lemmas if token not in stop_words and len(token) > 3]

    # dict preserves insertion order, so this keeps the first occurrence of each token.
    return list(dict.fromkeys(kept))

In [None]:
# Split the cleaned frame into the text column (features) and the labels column (targets), then show both.
x = sentiment.text
y = sentiment.labels
x,y

In [None]:
# Materialise the text column as a plain Python list, in row order.
texts = list(x)

In [None]:
# Preview only the first few entries: a bare Python list is echoed in full,
# which would write every text in the dataset into the cell output.
texts[:10]

In [None]:
import nltk
# Fetch the 'punkt_tab' and 'stopwords' NLTK resources into NLTK's default download location.
# process_text calls word_tokenize and stopwords.words, so this cell must run before process_text is called.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Clean every text; each entry becomes a list of tokens, in the same order as `texts`.
cleaned_text = [process_text(text) for text in texts]

In [None]:
# Spot-check the first ten token lists.
cleaned_text[:10]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for testing; random_state is fixed so the split is repeatable.
# cleaned_text and y are paired by position (both derive from the rows of `sentiment`).
x_train, x_test, y_train, y_test = train_test_split(cleaned_text,y, test_size = 0.2, random_state =42)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Vocabulary cap handed to the tokenizer as num_words.
max_vocab = 20000

# Build the word index from the training split only.
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(x_train)

# v is the number of entries in the fitted word index; the model cell sizes its embedding from it.
word_idx = tokenizer.word_index
v = len(word_idx)
print("the size of vocab = ", v)

# Replace both splits with their integer-sequence encodings.
x_train, x_test = (
    tokenizer.texts_to_sequences(split) for split in (x_train, x_test)
)

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Fixed sequence length fed to the model; it is also used as the Input shape.
maxlen = 100

# Bring every encoded sequence in both splits to exactly maxlen entries.
x_train, x_test = (
    pad_sequences(encoded, maxlen=maxlen) for encoded in (x_train, x_test)
)

In [None]:
# Class distribution of the labels across the whole (pre-split) dataset.
y.value_counts()

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense,Input,GlobalMaxPooling1D,Dropout,Bidirectional
from keras.models import Model
# Embedding dimension, passed to the Embedding layer in the model cell.
D=100
# Model input: one sequence of maxlen entries per sample.
# NOTE(review): this name shadows Python's built-in input(); renaming it also
# requires updating the model cell, which refers to it.
input=Input(shape=(maxlen,))

In [None]:
from keras.optimizers import Adam

In [None]:
# Learning rate handed to the Adam optimizer below.
lr = 0.0001

# Layer stack: embedding -> dropout -> bidirectional LSTM -> dense ReLU -> 4-way softmax.
# Each intermediate tensor gets its own name instead of reusing `x`: an earlier cell
# binds `x` to the text column, and rebinding it here would break re-running that cell.
# v + 1 rows in the embedding: tokenizer word indices start at 1 and 0 is the padding value.
embedded = Embedding(v+1, D)(input)
regularized = Dropout(0.5)(embedded)
encoded = Bidirectional(LSTM(150))(regularized)
hidden = Dense(32, activation='relu')(encoded)
outputs = Dense(4, activation='softmax')(hidden)

model = Model(input, outputs)

optimizer = Adam(learning_rate=lr)

# categorical_crossentropy expects one-hot targets; accuracy is reported under the
# key "acc" (and "val_acc"), which the plotting cells read from the history.
model.compile(optimizer=optimizer, loss='categorical_crossentropy',
              metrics=["acc"])


In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps the label values to integer class ids.
le = LabelEncoder()

In [None]:
# Learn the label -> integer mapping from the training labels only, then apply that
# same mapping to the test labels. Refitting on the test split could produce a
# different mapping if its set of labels differs; transform() instead raises on a
# label that was never seen in training.
y_train_en = le.fit_transform(y_train)
y_test_en = le.transform(y_test)

In [None]:
# Fix the one-hot width from the training labels and use it for both splits.
# Without num_classes, to_categorical sizes each matrix from that array's own
# largest label, so the two matrices could end up with different column counts.
num_classes = int(np.max(y_train_en)) + 1
y_train_one_hot = tf.keras.utils.to_categorical(y_train_en, num_classes=num_classes)
y_test_one_hot = tf.keras.utils.to_categorical(y_test_en, num_classes=num_classes)


In [None]:
# Train for 40 epochs. The held-out split is passed as validation_data, so the
# per-epoch val_* metrics are computed on the same data that is evaluated afterwards.
history = model.fit(x_train, y_train_one_hot, epochs =40,
                    validation_data = (x_test, y_test_one_hot))

In [None]:
# Training vs. validation accuracy per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
for metric_key in ('acc', 'val_acc'):
    ax.plot(history.history[metric_key])
ax.set_title("Model Accuracy")
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(['Train', 'Test'], loc='upper left')
plt.show()



In [None]:
# Training vs. validation loss per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
for metric_key in ('loss', 'val_loss'):
    ax.plot(history.history[metric_key])
ax.set_title("Model Loss")
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Final loss and accuracy on the held-out split (the same data used as validation_data during fit).
loss, accuracy = model.evaluate(x_test, y_test_one_hot)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Predicted class = index of the largest softmax output; true class = index of the 1 in the one-hot row.
y_pred = model.predict(x_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test_one_hot, axis=1)

# Class ids follow the LabelEncoder's integer encoding; the tick text is "Class <id>".
class_ids = [0, 1, 2, 3]
tick_labels = ['Class 0', 'Class 1', 'Class 2', 'Class 3']

# Store the result under its own name: rebinding `confusion_matrix` would shadow the
# imported sklearn function and make this cell fail with a TypeError when re-run.
# Passing labels pins the matrix to 4x4 so it always lines up with the tick labels.
cm = confusion_matrix(y_true_labels, y_pred_labels, labels=class_ids)

plt.figure(figsize=(10,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels,
            yticklabels=tick_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()