In [1]:
!pip install tensorflow



In [2]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [3]:
# 1. Data Preprocessing

# Load the dataset
data = pd.read_csv(r"C:\Users\rohit\Downloads\archive\Reddit_Data.csv")

In [4]:
# Drop rows where 'clean_comment' is NaN
data = data.dropna(subset=['clean_comment'])

In [5]:
# Tokenize the "clean_comment" column
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(data['clean_comment'])
sequences = tokenizer.texts_to_sequences(data['clean_comment'])

In [6]:
# Pad the sequences
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

In [7]:
# Convert categories to one-hot encoding
labels = to_categorical(data['category'] + 1)  # Adding 1 to make categories 0, 1, and 2

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [9]:
# 2. Model Building

model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=100),
    LSTM(64, return_sequences=True),
    Dropout(0.5),
    LSTM(64),
    Dense(3, activation='softmax')
])

In [10]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
# 3. Training

model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test), batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x241df89cdf0>

In [12]:
# 4. Evaluation

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

Test Accuracy: 49.18%


# Changing the Hyperparameters

### Changing the epochs to 10

In [13]:
# 3. Training

model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x241df9624c0>

In [14]:
# 4. Evaluation

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

Test Accuracy: 92.27%


### Changing optimizer to rmsprop

In [15]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [16]:
# 3. Training

model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x241df9c2af0>

In [17]:
# 4. Evaluation

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

Test Accuracy: 90.46%


### Tokenizer hyperparameter num_words = 4000

In [32]:
# Tokenize the "clean_comment" column
tokenizer = Tokenizer(num_words=4000, oov_token='<OOV>')
tokenizer.fit_on_texts(data['clean_comment'])
sequences = tokenizer.texts_to_sequences(data['clean_comment'])

In [33]:
# Pad the sequences
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

In [34]:
# Convert categories to one-hot encoding
labels = to_categorical(data['category'] + 1)  # Adding 1 to make categories 0, 1, and 2

In [35]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [36]:
# 2. Model Building

model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=100),
    LSTM(64, return_sequences=True),
    Dropout(0.5),
    LSTM(64),
    Dense(3, activation='softmax')
])

In [37]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [38]:
# 3. Training

model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x241e943a6d0>

In [39]:
# 4. Evaluation

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

Test Accuracy: 91.95%
