In [1]:
!pip install gensim nltk keras tensorflow scikit-learn

# Download GloVe vectors
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

import pandas as pd
from google.colab import files

# Upload the CSV file
uploaded = files.upload()  # Opens a file dialog for file upload
if not uploaded:
    # An empty result (e.g. the dialog was dismissed) would otherwise surface
    # as a bare StopIteration from next(iter(...)) with no explanation.
    raise ValueError("No file was uploaded; please select a CSV file and re-run this cell.")
csv_name = next(iter(uploaded))  # name of the first uploaded file
df = pd.read_csv(csv_name)  # Load into DataFrame

import numpy as np
from nltk.tokenize import word_tokenize
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.metrics import classification_report

# Step 2: Download NLTK resources
# Newer NLTK releases load the 'punkt_tab' resource for word_tokenize, while
# older ones load 'punkt'. NLTK is installed unpinned above, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

# Step 3: Tokenize content and map labels
df['tokens'] = df['content'].apply(lambda x: word_tokenize(x.lower()))
label_mapping = {'F': 1, 'NF': 0}
df['labels'] = df['RequirementType'].map(label_mapping)

# Series.map leaves NaN for any value missing from label_mapping. Fail here with
# a clear message rather than letting NaN labels reach training and evaluation.
unmapped_types = df.loc[df['labels'].isna(), 'RequirementType'].unique()
if len(unmapped_types) > 0:
    raise ValueError(
        f"Unexpected RequirementType values (expected 'F' or 'NF'): {list(unmapped_types)}"
    )

# Step 4: Load the 100-dimensional GloVe vectors
# embeddings_index maps each word to its float32 coefficient vector.
embeddings_index = {}
# The GloVe text file is UTF-8 encoded; state it explicitly so that reading does
# not depend on the platform's default encoding.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line holds a word followed by its whitespace-separated coefficients.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f"Loaded {len(embeddings_index)} word vectors.")

# Step 5: Prepare tokenizer and embedding matrix
# Fit the Keras tokenizer on the raw text column to obtain the word -> id map.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['content'])
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

# Create the embedding matrix: one row per token id, plus one extra row
# (len(word_index) + 1 rows in total).
embedding_dim = 100
num_tokens = 1 + len(word_index)
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # No GloVe entry for this token: its row stays all zeros.
        continue
    embedding_matrix[row] = glove_vector

# Step 6: Convert texts to sequences and pad them
# Turn every text into a list of token ids using the fitted tokenizer ...
token_id_sequences = tokenizer.texts_to_sequences(df['content'])
# ... then pad or truncate each list to exactly 100 entries.
X = pad_sequences(token_id_sequences, maxlen=100)
# Targets are the numeric labels produced by the label mapping.
y = df['labels']

# Step 7: Split data into training (60%), validation (20%), and test (20%) sets
from sklearn.model_selection import train_test_split

split_seed = 42  # same seed for both splits so the partition is repeatable

# First hold out 40% of the samples ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=split_seed
)
# ... then cut the held-out part in half: validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed
)

# Step 8: Build the Keras Model
from tensorflow.keras.initializers import Constant

model = Sequential()
# Frozen embedding layer seeded with the GloVe matrix. The matrix is supplied
# through `embeddings_initializer` and the sequence length is inferred from the
# data: `input_length` is deprecated in Keras 3, and the `weights=` constructor
# argument is not accepted by every Keras 3 release.
model.add(Embedding(input_dim=num_tokens,
                    output_dim=embedding_dim,
                    embeddings_initializer=Constant(embedding_matrix),
                    trainable=False))  # GloVe embeddings are not trainable
# 128 convolution filters over windows of 5 consecutive token embeddings.
model.add(Conv1D(128, 5, activation='relu'))
# Keep the maximum activation of each filter across the sequence.
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit for the binary label.
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 9: Train the Model
# Fit on the training split; the validation split is passed so Keras reports
# val_loss / val_accuracy alongside the training metrics for each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=10,
)

# Step 10: Evaluate the Model
# Calculate performance on all sets and print classification reports
def evaluate_model(model, X, y, set_name, threshold=0.5):
    """Print a classification report for `model` on one data split.

    Args:
        model: Object exposing `predict(X)` that returns one score per sample
            (for the network in this notebook, the sigmoid output).
        X: Model inputs for the split.
        y: True binary labels aligned with `X` (0 = 'NF', 1 = 'F').
        set_name: Split name shown in the printed header.
        threshold: Scores strictly greater than this value are predicted as
            class 1. Defaults to 0.5.

    Returns:
        None. The report is printed.
    """
    # Flatten so the predictions are 1-D like `y`; the single-unit output layer
    # used in this notebook yields one column per sample.
    scores = np.asarray(model.predict(X)).ravel()
    y_pred = (scores > threshold).astype("int32")
    # Pass `labels` explicitly: without it, classification_report raises when a
    # split contains only one class, because the two target_names no longer
    # match the number of classes it finds in the data.
    report = classification_report(
        y,
        y_pred,
        labels=[0, 1],
        target_names=['NF', 'F'],
        digits=4,
    )
    print(f"Classification Report for {set_name} Set:\n")
    print(report)

# Evaluate on the training, validation, and test sets, in that order.
evaluation_splits = (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Testing", X_test, y_test),
)
for split_name, split_inputs, split_labels in evaluation_splits:
    evaluate_model(model, split_inputs, split_labels, split_name)


--2024-09-22 05:20:14--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-09-22 05:20:14--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2024-09-22 05:22:53 (5.18 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews.csv


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Loaded 400000 word vectors.
Found 13086 unique tokens.




Epoch 1/10
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step - accuracy: 0.6657 - loss: 0.5996 - val_accuracy: 0.8055 - val_loss: 0.4046
Epoch 2/10
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 42ms/step - accuracy: 0.8310 - loss: 0.3791 - val_accuracy: 0.8735 - val_loss: 0.2962
Epoch 3/10
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 36ms/step - accuracy: 0.9122 - loss: 0.2179 - val_accuracy: 0.8968 - val_loss: 0.2473
Epoch 4/10
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - accuracy: 0.9549 - loss: 0.1164 - val_accuracy: 0.8816 - val_loss: 0.2776
Epoch 5/10
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 42ms/step - accuracy: 0.9732 - loss: 0.0787 - val_accuracy: 0.8655 - val_loss: 0.3765
Epoch 6/10
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - accuracy: 0.9887 - loss: 0.0384 - val_accuracy: 0.8812 - val_loss: 0.3546
Epoch 7/10
[1m235/2