In [None]:
!pip install datasets
!pip install scikit-learn
!pip install tensorflow

In [1]:
import os

from datasets import load_dataset

# Directory holding the two raw text files. The default is the original
# author's location; set the NLP_DA_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get("NLP_DA_DATA_DIR", r"C:\Users\HP\Desktop\NLP_DA\q1_dataset")

train_file_path = os.path.join(DATA_DIR, "train_150k.txt")
test_file_path = os.path.join(DATA_DIR, "test_62k.txt")

# Fail early with an actionable message rather than deep inside the loader.
for _data_file in (train_file_path, test_file_path):
    if not os.path.isfile(_data_file):
        raise FileNotFoundError(
            f"Data file not found: {_data_file!r}. "
            "Set NLP_DA_DATA_DIR to the folder containing the dataset."
        )

# The "text" builder yields one row per line in a single column named "text".
train_dataset = load_dataset("text", data_files=train_file_path)
train_data = train_dataset['train']

# When `data_files` is a plain path the loader files it under the default
# "train" split, which is why the test file is also read back via ['train'].
test_dataset = load_dataset("text", data_files=test_file_path)
test_data = test_dataset['train']

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report



# Number of rows taken from the start of each split. Kept small so the cell
# runs quickly; raise it for a more meaningful evaluation.
N_SAMPLES = 100

# Slice the rows first and select the column second, so only N_SAMPLES rows
# are read instead of materialising the whole text column before slicing.
train_lines = train_data[:N_SAMPLES]['text']
test_lines = test_data[:N_SAMPLES]['text']

# The first character of every line is used as the integer class label.
y_train = [int(line[0]) for line in train_lines]
y_test = [int(line[0]) for line in test_lines]

# Remove the label character (and the separator whitespace after it) so the
# features are built from the message text only and never from the target.
X_train = [line[1:].strip() for line in train_lines]
X_test = [line[1:].strip() for line in test_lines]

# Bag-of-words counts: the vocabulary is learned on the training texts only
# and then reused unchanged for the test texts.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Multinomial Naive Bayes on the raw term counts.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vec, y_train)

# Evaluate on the held-out rows.
nb_predictions = nb_classifier.predict(X_test_vec)
accuracy = accuracy_score(y_test, nb_predictions)
print("Naive Bayes Classifier Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, nb_predictions))



Naive Bayes Classifier Accuracy: 0.52
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.57      0.56        53
           1       0.49      0.47      0.48        47

    accuracy                           0.52       100
   macro avg       0.52      0.52      0.52       100
weighted avg       0.52      0.52      0.52       100



In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


# --- TF-IDF features ---------------------------------------------------------
# Vocabulary and IDF weights come from the training texts only; the test texts
# are projected onto that same fitted vocabulary.
# X_train / X_test / y_train / y_test are the variables built in the earlier
# Naive Bayes cell.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# --- Linear-kernel SVM -------------------------------------------------------
# `fit` returns the estimator itself, so construction and training are chained.
svm_classifier = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# --- Evaluation --------------------------------------------------------------
svm_predictions = svm_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, svm_predictions)
svm_report = classification_report(y_test, svm_predictions)
print("SVM Classifier Accuracy:", accuracy)
print("Classification Report:\n", svm_report)


SVM Classifier Accuracy: 0.51
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.57      0.55        53
           1       0.48      0.45      0.46        47

    accuracy                           0.51       100
   macro avg       0.51      0.51      0.51       100
weighted avg       0.51      0.51      0.51       100



In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
import numpy as np


from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable from one run of this cell to the next.
set_random_seed(42)

# Number of rows taken from the start of each split. Kept small so the cell
# runs quickly; raise it for a more meaningful evaluation.
N_SAMPLES = 100

# Slice the rows first and select the column second, so only N_SAMPLES rows
# are read instead of materialising the whole text column before slicing.
train_lines = train_data[:N_SAMPLES]['text']
test_lines = test_data[:N_SAMPLES]['text']

# The first character of every line is used as the integer class label.
y_train = np.array([int(line[0]) for line in train_lines])
y_test = np.array([int(line[0]) for line in test_lines])

# Strip the label character before tokenising. The Keras Tokenizer keeps
# single-character tokens, so leaving the label in the text would hand the
# target to the network as its very first input token (label leakage).
X_train = [line[1:].strip() for line in train_lines]
X_test = [line[1:].strip() for line in test_lines]

# Word index is learned from the training texts only; words that appear only
# in the test texts are dropped by texts_to_sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad every sequence with trailing zeros up to the longest sequence seen.
max_len = max(len(seq) for seq in X_train_seq + X_test_seq)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

model = Sequential()
# +1 because index 0 is reserved for padding. mask_zero=True makes the LSTM
# skip those padded positions instead of treating them as real time steps.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64,
                    input_length=max_len, mask_zero=True))
model.add(LSTM(64))
# Single sigmoid unit: probability of class 1.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): the test rows double as validation data here, so the reported
# val_accuracy is not an independent estimate if it is used for model tuning.
# Keeping the History object allows the curves to be inspected later and stops
# its repr from being echoed as the cell output.
history = model.fit(X_train_padded, y_train, epochs=5, validation_data=(X_test_padded, y_test))



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1f2dcf70f70>