# Long Short Term Memory Implementation

Importing necessary libraries

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# Download stopwords (only need to run once)
nltk.download('stopwords')
nltk.download('punkt')

# Load and preprocess the train data
train_path = "/content/train.csv"
train_data = pd.read_csv(train_path)


# Combine 'title', 'author', and 'text' columns to create a single input text
train_data['input_text'] = train_data['title'].fillna('') + ' ' + train_data['author'].fillna('') + ' ' + train_data['text'].fillna('')

# Define stopwords, punctuations, and stemmer
stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)
stemmer = PorterStemmer()

# Function to preprocess the text
def preprocess_text(text):
    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords and punctuations, and lowercase the words
    words = [word.lower() for word in words if word.lower() not in stop_words and word not in punctuations]

    # Apply stemming
    words = [stemmer.stem(word) for word in words]

    # Join the words back into a single string
    preprocessed_text = ' '.join(words)

    return preprocessed_text

# Apply text preprocessing to the input_text column
train_data['input_text'] = train_data['input_text'].apply(preprocess_text)

# Define input and target variables
X = train_data['input_text'].values
y = train_data['label'].values

# Tokenize the text data with a limited vocabulary size
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)

# Convert text to sequences of integers
X_sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to ensure uniform length for LSTM input
max_sequence_length = 100  # Adjust this value as needed
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length, padding='post')

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Get the vocabulary size (number of unique words in the tokenizer)
vocab_size = len(tokenizer.word_index) + 1

# Build the LSTM model
model = Sequential()
model.add(Embedding(vocab_size, 64, input_length=max_sequence_length))  # Lower the embedding dimension (e.g., 64)
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))  # Lower the LSTM dimension (e.g., 32)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Data generator for training to load smaller batches
def data_generator(X, y, batch_size):
    steps = len(X) // batch_size
    while True:
        for i in range(steps):
            batch_X = X[i * batch_size: (i + 1) * batch_size]
            batch_y = y[i * batch_size: (i + 1) * batch_size]
            yield batch_X, batch_y

# Train the model using the data generator
batch_size = 32
epochs = 5
steps_per_epoch = len(X_train) // batch_size
model.fit(data_generator(X_train, y_train, batch_size), steps_per_epoch=steps_per_epoch,
          validation_data=(X_val, y_val), epochs=epochs)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x791fa8540880>

In [8]:
# Load the submit data
submit_path = "/content/submit.csv"
submit_data = pd.read_csv(submit_path)

# Load and preprocess the test data
test_path = "/content/test.csv"
test_data = pd.read_csv(test_path)

# Combine 'title', 'author', and 'text' columns to create a single input text
test_data['input_text'] = test_data['title'].fillna('') + ' ' + test_data['author'].fillna('') + ' ' + test_data['text'].fillna('')

# Apply the same text preprocessing to the test data
X_test = test_data['input_text'].values
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

# Make predictions on the test data
y_test_pred = model.predict(X_test_padded)
y_test_pred_labels = np.round(y_test_pred).astype(int).flatten()

# Store the results in a DataFrame
results_df = pd.DataFrame({'id': test_data['id'], 'label': y_test_pred_labels, 'original_label': submit_data['label']})

# Save the results to a new CSV file named 'results.csv'
results_path = "/content/results.csv"
results_df.to_csv(results_path, index=False)

# Compare the predictions with the values in submit.csv
accuracy = np.mean(results_df['label'] == submit_data['label'])

print("Model Accuracy on Test Data:", accuracy)


Model Accuracy on Test Data: 0.5478846153846154


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the submit data (ground truth labels)
submit_path = "/content/submit.csv"
submit_data = pd.read_csv(submit_path)

# Compare the predictions with the ground truth labels
accuracy = accuracy_score(submit_data['label'], results_df['label'])
precision = precision_score(submit_data['label'], results_df['label'])
recall = recall_score(submit_data['label'], results_df['label'])
f1 = f1_score(submit_data['label'], results_df['label'])

print("Model Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)


Model Accuracy: 0.5478846153846154
Precision: 0.5493039443155452
Recall: 0.9930094372596994
F1-Score: 0.707332254450392


In [13]:
# Save the model
model_path = "/content/model.LSTM"
model.save(model_path)

# Save the tokenizer
import pickle

tokenizer_path = "/content/tokenizer.LSTM"
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [11]:
# Load the model
loaded_model = tf.keras.models.load_model(model_path)

# Load the tokenizer
with open(tokenizer_path, 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)




In [18]:
from google.colab import files
files.download('/content/model.LSTM')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the model
model_path = "/content/model.LSTM"
loaded_model = tf.keras.models.load_model(model_path)

# Load the tokenizer
tokenizer_path = "/content/tokenizer.LSTM"
with open(tokenizer_path, 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

# Define the new text
new_text = ""

# Preprocess the new text
preprocessed_text = preprocess_text(new_text)  # using the same preprocess_text function you defined during training

# Convert text to sequences of integers
new_text_sequence = loaded_tokenizer.texts_to_sequences([preprocessed_text])

# Pad sequences to ensure uniform length for LSTM input
max_sequence_length = 100  # same as during training
new_text_padded = pad_sequences(new_text_sequence, maxlen=max_sequence_length, padding='post')

# Make a prediction
prediction = loaded_model.predict(new_text_padded)

# Print the result
if prediction >= 0.5:
    print("This text is likely to be fake.")
else:
    print("This text is likely to be real.")




This text is likely to be fake.


In [20]:
!tar czvf model.tar.gz /content/model.h5
!tar czvf tokenizer.tar.gz /content/tokenizer.pickle

# Download the compressed files
files.download('/content/model.tar.gz')
files.download('/content/tokenizer.tar.gz')


tar: Removing leading `/' from member names
/content/model.h5
tar: Removing leading `/' from member names
/content/tokenizer.pickle


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>