In [62]:
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# NOTE(review): this star import does not provide the Keras names used below and
# may shadow Python builtins (e.g. print/range) with TensorFlow ops; consider removing it.
from tensorflow import *
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical


In [63]:

# Read the dataset and keep only the columns whose name does not start with "Unnamed"
df = pd.read_csv("./Final/final_dataset")
unnamed_mask = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~unnamed_mask]

# Shared NLTK helpers used by normalize_document
stop_words = nltk.corpus.stopwords.words('english')
wpt = nltk.WordPunctTokenizer()

# Deletion table for the punctuation set. A raw string is used so the backslash
# is a literal character instead of the invalid escape sequence '\,'.
_PUNC_TABLE = str.maketrans("", "", r'''!()-[]}{;:'"\,<>./?@#$%^&*_~''')


def remove_punc(comment):
    """Return `comment` with every punctuation character removed.

    The characters removed are: ! ( ) - [ ] { } ; : ' " \\ , < > . / ? @ # $ % ^ & * _ ~

    Args:
        comment: The text to clean (a str).

    Returns:
        A new string with the punctuation characters deleted; all other
        characters (letters, digits, whitespace, other symbols) are kept.
    """
    # A single translate pass replaces the original per-character replace loop,
    # which rescanned the whole string once for every character.
    return comment.translate(_PUNC_TABLE)

def normalize_document(doc):
    """Normalize one document for tokenization.

    Steps: delete every character that is not an ASCII letter or whitespace,
    lowercase, strip surrounding whitespace, tokenize with the module-level
    `wpt` tokenizer, drop tokens found in the module-level `stop_words`, and
    join the remaining tokens with single spaces.

    Args:
        doc: The text to normalize (a str).

    Returns:
        The normalized text as a single space-separated string.
    """
    # `flags` must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so the original call capped the number of removals
    # at the integer value of the flags and applied no flags at all.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    tokens = wpt.tokenize(doc)
    return ' '.join(token for token in tokens if token not in stop_words)

In [64]:
# Clean the comments: strip punctuation first, then normalize the remaining text
df["Comments"] = df["Comments"].apply(remove_punc).apply(normalize_document)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X, y = df["Comments"], df["Type"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)


In [65]:
# Tokenizing and padding sequences
max_words = 5000  # tokenizer keeps only the most frequent words below this index
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(X_train)  # vocabulary is learned from the training split only
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# word_index lists every word seen during fitting, but texts_to_sequences only
# emits indices below num_words, so the embedding table never needs more rows
# than that (+1 accounts for index 0, which is used for padding).
vocab_size = min(len(tokenizer.word_index) + 1, max_words)
max_len = 150
X_train = pad_sequences(X_train, padding="post", maxlen=max_len)
X_test = pad_sequences(X_test, padding="post", maxlen=max_len)

# One-hot encode labels
# assumes df["Type"] holds integer codes in [0, num_classes) — TODO confirm
num_classes = 2
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

In [66]:
def build_model(vocab_size, max_len, embedding_dim=100, lstm_units=128, dropout_rate=0.2,
                num_classes=2):
    """Build and compile a two-layer LSTM classifier over token-id sequences.

    Architecture: Embedding -> LSTM(lstm_units, full sequence) -> Dropout ->
    LSTM(lstm_units // 2) -> Dense(32, relu) -> Dense(num_classes, softmax).

    Args:
        vocab_size: Number of rows in the embedding table (largest token id + 1).
        max_len: Length of each input sequence.
        embedding_dim: Size of each embedding vector.
        lstm_units: Units in the first LSTM layer; the second uses half as many.
        dropout_rate: Dropout rate applied between the two LSTM layers.
        num_classes: Width of the softmax output layer (defaults to 2, the
            value that was previously hard-coded).

    Returns:
        A compiled Sequential model using the Adam optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    model = Sequential()
    # The input shape is declared with an explicit Input layer rather than the
    # Embedding `input_length` argument, which newer Keras releases deprecate.
    model.add(Input(shape=(max_len,)))
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
    model.add(LSTM(lstm_units, return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(lstm_units // 2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [67]:
# Train the model
# The notebook defines build_model(vocab_size, max_len); `lstm_model` does not exist.
model = build_model(vocab_size, max_len)
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate model accuracy on the held-out test split
y_pred = model.predict(X_test)
y_test_ = np.argmax(y_test, axis=1)        # one-hot rows back to class indices
y_pred_labels = np.argmax(y_pred, axis=1)  # most probable class per sample

print(f"Accuracy: {accuracy_score(y_test_, y_pred_labels)}")

# Test with actual data
# `class_labels` was never defined in the notebook. The labels were one-hot
# encoded straight from the codes in df["Type"], so the class index is the
# code itself. TODO: replace with human-readable names for each code.
class_labels = [str(code) for code in np.arange(num_classes)]

sentence = input("Enter any sentence: ")
# Apply the same cleaning used on the training text before tokenizing;
# otherwise the model sees differently-formatted input at prediction time.
sentence = normalize_document(remove_punc(sentence))
sentence_seq = tokenizer.texts_to_sequences([sentence])
sentence_seq = pad_sequences(sentence_seq, padding='post', maxlen=max_len)

prediction = model.predict(sentence_seq)
predicted_class = np.argmax(prediction, axis=1)[0]
print(f"Predicted class: {class_labels[predicted_class]}")



Epoch 1/10
[1m  16/1467[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:21[0m 97ms/step - accuracy: 0.5137 - loss: 0.6947

KeyboardInterrupt: 