In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from that mount point below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the tweet/label CSV from the mounted Drive and discard the "Unnamed: 0"
# column (presumably an index saved alongside the data -- verify against the file).
csv_path = "/content/drive/MyDrive/Redback_A/chatgpt.csv"
data = pd.read_csv(csv_path).drop(columns=["Unnamed: 0"])

In [None]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219294 entries, 0 to 219293
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tweets  219294 non-null  object
 1   labels  219294 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB


In [None]:
# Share of tweets per label: count the rows in each label group, then divide by
# the total number of rows.
tweets_per_label = data.groupby("labels")["tweets"].count()
score_cnt = tweets_per_label / len(data)
score_cnt

labels
bad        0.491559
good       0.255415
neutral    0.253026
Name: tweets, dtype: float64

In [None]:
# Encode the string labels as integer class ids (bad=0, neutral=1, good=2).
# The mapped column is assigned back in a single step instead of using chained
# indexing (`data['labels'][mask] = value`): chained assignment writes through an
# intermediate object, raises SettingWithCopyWarning, and is a silent no-op when
# pandas Copy-on-Write is enabled.
LABEL_TO_ID = {"bad": 0, "neutral": 1, "good": 2}

# Skip the mapping when the column is already integer-coded so that re-running
# this cell does not turn the encoded labels into NaN.
if not pd.api.types.is_integer_dtype(data["labels"]):
    # Labels missing from LABEL_TO_ID become NaN, which makes the int64 cast raise
    # instead of letting invalid targets reach the model.
    data["labels"] = data["labels"].map(LABEL_TO_ID).astype("int64")

data.head()

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,1
1,"Try talking with ChatGPT, our new AI system wh...",2
2,ChatGPT: Optimizing Language Models for Dialog...,1
3,"THRILLED to share that ChatGPT, our new model ...",2
4,"As of 2 minutes ago, @OpenAI released their ne...",0


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

In [None]:
# Inputs are the tweet texts; targets are the values of the label column.
X, y = data["tweets"], data["labels"]

# Keep 80% of the rows for training and the rest for testing, stratified on `y`
# and seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)

In [None]:
## Parameters for tokenization, padding and the embedding layer

# Vocabulary and embedding
vocab_size = 20_000      # Tokenizer `num_words` and Embedding input dimension
embedding_dim = 32       # Embedding output dimension
oov_tok = "<OOV>"        # token the Tokenizer uses for out-of-vocabulary words

# Sequence shaping: every sequence is padded/truncated at its end to `max_length`
max_length = 100
trunc_type = padding_type = 'post'

# Not referenced by the cells shown in this notebook section.
training_size = 20_000

In [None]:
# Build the tokenizer, capped at `vocab_size` words with `oov_tok` as the
# out-of-vocabulary token, and fit it on the training texts only.
tokenizer = Tokenizer(
    oov_token=oov_tok,
    num_words=vocab_size,
)
tokenizer.fit_on_texts(X_train)

# Word -> integer index mapping produced by the fit.
word_index = tokenizer.word_index

In [None]:
def _pad_to_max_length(sequences):
    """Pad or truncate `sequences` to `max_length` using the configured padding/truncation modes."""
    return pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )


# Convert the texts to integer sequences with the tokenizer fitted on the training set.
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Bring both splits to the same fixed length.
X_train_padded = _pad_to_max_length(X_train_sequences)
X_test_padded = _pad_to_max_length(X_test_sequences)

In [None]:
# Create the model: embedding -> two stacked bidirectional LSTMs -> dense head.
keras_layers = tf.keras.layers

layer_stack = [
    keras_layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # First recurrent layer returns the full sequence so the second LSTM can consume it.
    keras_layers.Bidirectional(keras_layers.LSTM(64, return_sequences=True)),
    keras_layers.Bidirectional(keras_layers.LSTM(32)),
    keras_layers.Dense(64, activation='relu'),
    keras_layers.Dropout(0.5),
    # Output layer: 3 units with softmax, one per label class.
    keras_layers.Dense(3, activation='softmax'),
]
model = tf.keras.Sequential(layer_stack)

# Compile with categorical cross-entropy, which expects one-hot encoded targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
num_epochs = 1

# One-hot encode the integer labels of both splits (3 classes) to match the
# categorical cross-entropy loss.
y_train_onehot, y_test_onehot = (
    tf.keras.utils.to_categorical(split_labels, num_classes=3)
    for split_labels in (y_train, y_test)
)

# Train the model. Note that the test split is also passed as validation data here.
history = model.fit(
    X_train_padded,
    y_train_onehot,
    batch_size=32,
    epochs=num_epochs,
    validation_data=(X_test_padded, y_test_onehot),
)




In [None]:
# Score the trained model on the test split; evaluate() returns the loss followed
# by the compiled metric (accuracy).
evaluation = model.evaluate(X_test_padded, y_test_onehot, verbose=2)
test_loss, test_acc = evaluation
print("Test Accuracy: ", test_acc)

1371/1371 - 74s - loss: 0.2542 - accuracy: 0.9123 - 74s/epoch - 54ms/step
Test Accuracy:  0.9122642874717712
