In [1]:
import os
import warnings

import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense
from sklearn.model_selection import train_test_split

In [2]:
# 1. Data Preprocessing

# Load the dataset.
# The CSV location can be overridden with the REDDIT_DATA_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original local path is used, so existing runs are unaffected.
DATA_PATH = os.environ.get(
    "REDDIT_DATA_CSV",
    r"C:\Users\rohit\Downloads\archive\Reddit_Data.csv",
)
data = pd.read_csv(DATA_PATH)

In [3]:
# Keep only the rows whose 'clean_comment' value is present (not NaN).
has_comment = data['clean_comment'].notna()
data = data[has_comment]

In [4]:
# Hyper-parameters for tokenising and padding; tune both for your dataset.
max_words = 10_000  # value handed to the Tokenizer as num_words
maxlen = 100  # value handed to pad_sequences as the per-comment length

In [5]:
# Fit the tokenizer on every remaining comment, then encode those same
# comments as sequences of integers.
comments = data['clean_comment']
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)

In [6]:
# Report how many distinct tokens the fitted tokenizer recorded.
# NOTE(review): the printed count is the size of the whole word_index mapping
# and is larger than max_words, so presumably num_words only takes effect when
# texts are converted to sequences — confirm against the Keras Tokenizer docs.
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

Found 54719 unique tokens.


In [7]:
# Bring every encoded comment to exactly `maxlen` entries.
# padding/truncating are written out with pad_sequences' default values
# ('pre'), so the result is the same as calling it with maxlen alone.
data_pad = pad_sequences(
    sequences,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)

In [8]:
# Targets: one 'category' value per comment, aligned row-for-row with data_pad.
labels = np.asarray(data['category'])
print('Shape of data tensor:', data_pad.shape)
print('Shape of label tensor:', labels.shape)

# The network built later in this notebook ends in a single sigmoid unit and
# is trained with binary cross-entropy, which is only meaningful for 0/1
# targets. Surface any other label values now instead of letting training run
# silently on an ill-posed objective. This is a warning, not an error, so the
# notebook still runs end to end.
label_values = np.unique(labels)
if not np.isin(label_values, (0, 1)).all():
    warnings.warn(
        f"'category' contains values {label_values.tolist()}, which are not all "
        "in {0, 1}; a single sigmoid output with binary_crossentropy expects "
        "binary targets. Remap the labels or switch to a softmax output with "
        "sparse_categorical_crossentropy."
    )

Shape of data tensor: (37149, 100)
Shape of label tensor: (37149,)


In [9]:
# Split the data into a training set (80%) and a validation set (20%).
# random_state pins the shuffle so the split — and therefore the metrics
# reported further down — is the same on every Restart & Run All.
x_train, x_val, y_train, y_val = train_test_split(
    data_pad,
    labels,
    test_size=0.2,
    random_state=42,
)

In [10]:
# 2. Model Building

embedding_dim = 100

# Embedding -> GRU -> single sigmoid unit, declared as one ordered layer list.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=maxlen),
    GRU(units=32),
    Dense(units=1, activation='sigmoid'),  # Use 'sigmoid' for binary classification
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 gru (GRU)                   (None, 32)                12864     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 1,012,897
Trainable params: 1,012,897
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Configure training: RMSprop optimiser, binary cross-entropy loss, and
# accuracy tracked under the metric name 'acc'.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [12]:
# Fit for 10 epochs in mini-batches of 32, scoring the validation split after
# each epoch; the returned History object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
# 3. Evaluation

# Evaluate the model on the validation split. This is the same (x_val, y_val)
# that was passed as validation_data during training — the notebook holds out
# no separate test set — so the score is reported as validation accuracy
# rather than test accuracy.
loss, accuracy = model.evaluate(x_val, y_val)
print(f'Validation accuracy: {accuracy}')

Test accuracy: 0.6917900443077087
