In [1]:
!pip install pandas scikit-learn nltk matplotlib tensorflow imbalanced-learn




In [None]:
import pandas as pd
from sklearn.utils import shuffle
import string
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from imblearn.under_sampling import NearMiss
from sklearn.feature_extraction.text import CountVectorizer

# Read data
# Each CSV holds one class of articles; the label is added below.
fake = pd.read_csv("data/Fake - Imbalanced.csv")
true = pd.read_csv("data/True.csv")

# Adding labels to the data
fake['target'] = 0  # 'fake' = 0
true['target'] = 1  # 'true' = 1

# Concatenate and shuffle data
# The shuffle is seeded so that row order -- and therefore the seeded
# train/test split performed later -- is the same on every run.
data = pd.concat([fake, true], ignore_index=True)
data = shuffle(data, random_state=42).reset_index(drop=True)

# Drop unnecessary columns
data = data.drop(columns=["title", "date"])

# Preprocess text
# The stopword corpus is a separate NLTK download; fetch it on first use so a
# fresh environment does not fail with LookupError.
try:
    stop = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop = set(stopwords.words('english'))

# A single translation table removes every ASCII punctuation character in one pass.
_punct_table = str.maketrans('', '', string.punctuation)


def _clean_text(text):
    """Lower-case *text*, strip ASCII punctuation, and drop English stopwords."""
    lowered = text.lower().translate(_punct_table)
    return ' '.join(word for word in lowered.split() if word not in stop)


# Missing articles become empty strings instead of crashing on a NaN float.
data['text'] = data['text'].fillna('').apply(_clean_text)

# Vectorize text for undersampling
# NearMiss works on numeric features, so the bag-of-words counts are used only
# to decide which rows to keep.
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(data['text'])
y = data['target']

# Undersample the data to balance the classes
nm = NearMiss(version=1)
X_nm, y_nm = nm.fit_resample(X_vectorized, y)

# Recover the text of the selected rows
# inverse_transform() would return only the distinct vocabulary terms of each
# document, discarding word order and repetition that the sequence model
# trained afterwards depends on. Index the cleaned text with the row positions
# NearMiss kept instead; they are in the same order as X_nm / y_nm.
X_nm_strings = data['text'].iloc[nm.sample_indices_].tolist()

# Create a balanced dataset
balanced_dataset = pd.DataFrame({'text': X_nm_strings, 'target': y_nm})

# Hold out 20% of the balanced rows for testing (fixed seed for the split)
X_train, X_test, y_train, y_test = train_test_split(
    balanced_dataset['text'],
    balanced_dataset['target'],
    test_size=0.2,
    random_state=42,
)

# Vocabulary cap and fixed sequence length used by the tokenizer and padding
max_len = 500
max_words = 10000

# The tokenizer is fitted on the training texts only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Texts -> integer sequences, then pad every sequence to max_len
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train_seq, X_test_seq)
)

# LSTM classifier hyperparameters
lstm_out = 196
embedding_dim = 128

# Embedding -> spatial dropout -> one LSTM layer -> single sigmoid unit
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),  # Binary classification
])

# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Training configuration
epochs = 5
batch_size = 32

# Fit the network. The test split is also passed as validation data, so the
# per-epoch validation metrics are computed on that same split.
history = model.fit(
    X_train_pad,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test_pad, y_test),
    verbose=2,
)

# Evaluate on the test split; the result unpacks into the loss followed by the
# accuracy metric requested at compile time.
test_metrics = model.evaluate(X_test_pad, y_test, verbose=2)
score, acc = test_metrics
print("Test accuracy:", acc)

# Draw the per-epoch accuracy curves recorded during fitting
for metric_key, curve_label in (('accuracy', 'Train'), ('val_accuracy', 'Test')):
    plt.plot(history.history[metric_key], label=curve_label)

# Axis labels, legend, and display
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
