<a href="https://colab.research.google.com/github/sudip2k17/TextPreprocessingWithSpacy/blob/main/SequentialSentenceClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Sequential Sentence Classification Project**

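**Objective**: Build a deep learning model for sequential sentence classification that labels each sentence of an abstract with its role (BACKGROUND, OBJECTIVE, METHODS, RESULTS or CONCLUSIONS), so that "harder to read" text can be presented as "easier to read" text.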

Model Overview

1. Baseline Model: Naïve Bayes + TF-IDF
2. Deep Learning Models:


*   Conv1D with Token Embedding
*   Pretrained Feature Extractor (GloVe, FastText)
*   Conv1D with Character Embedding
*   LLM-Based Approach (BERT as Feature Extractor)

Step 1: Install & Import Dependencies

In [None]:
!pip install transformers datasets torch tensorflow keras nltk spacy

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [8]:
import nltk
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf
import torch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel

Step 2: Load & Preprocess Data (PubMed 20k RCT)

In [9]:
# Define dataset paths
# The directory is named once so that running outside this layout only
# requires changing DATA_DIR; the resulting paths are unchanged.
DATA_DIR = "/content/dataset"
train_path = f"{DATA_DIR}/train.txt"
test_path = f"{DATA_DIR}/test.txt"

# Function to load dataset
def load_pubmed_dataset(file_path):
    """Read a ``LABEL<TAB>sentence`` text file into two parallel lists.

    Blank lines are ignored. Tab-less lines starting with ``###`` are skipped
    silently (presumably the per-abstract ID markers of the PubMed RCT
    files - confirm against the dataset). Any other line without a tab is
    skipped and counted, and a single summary warning is printed at the end
    instead of one message per line.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A ``(sentences, labels)`` tuple of equally long lists of stripped
        strings. Only the first tab splits a line, so sentences may
        themselves contain tabs.
    """
    sentences = []
    labels = []
    skipped_count = 0
    first_skipped = None

    with open(file_path, "r", encoding="utf-8") as file:
        # Iterate the handle lazily rather than loading every line with readlines().
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue

            label, tab, sentence = line.partition("\t")
            if tab:
                sentences.append(sentence.strip())
                labels.append(label.strip())
            elif stripped.startswith("###"):
                # Expected structural line, not a malformed record: no warning.
                continue
            else:
                skipped_count += 1
                if first_skipped is None:
                    first_skipped = stripped

    if skipped_count:
        print(
            f"Warning: Skipped {skipped_count} line(s) without tab, "
            f"first one: {first_skipped}"
        )

    return sentences, labels

# Read the train and test splits from disk
train_sentences, train_labels = load_pubmed_dataset(train_path)
test_sentences, test_labels = load_pubmed_dataset(test_path)

# Learn the label vocabulary from the training split, then apply it to both splits
label_encoder = LabelEncoder().fit(train_labels)
train_labels_encoded = label_encoder.transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# Class name -> integer id, in the encoder's own class order
label_mapping = {
    class_name: class_id
    for class_name, class_id in zip(
        label_encoder.classes_, label_encoder.transform(label_encoder.classes_)
    )
}
print("Label Mapping:", label_mapping)

Streaming output truncated to the last 5000 lines.
Label Mapping: {'BACKGROUND': 0, 'CONCLUSIONS': 1, 'METHODS': 2, 'OBJECTIVE': 3, 'RESULTS': 4}


Step 3: Baseline Model (Naïve Bayes + TF-IDF)

In [10]:
# Apply preprocessing (SpaCy for stopword removal, lemmatization)
# The dependency parser and NER are disabled: the preprocessing below only
# reads token.lemma_, token.is_stop and token.is_punct, so running those two
# components on every sentence would be wasted work.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess_text(text):
    """Lower-case ``text`` and return its lemmas joined by single spaces.

    Tokens that the spaCy pipeline ``nlp`` flags as stop words or
    punctuation are left out of the result.
    """
    kept_lemmas = []
    for tok in nlp(text.lower()):
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

# Preprocess data: both splits go through the same cleaning function
train_sentences_clean = [preprocess_text(text) for text in train_sentences]
test_sentences_clean = [preprocess_text(text) for text in test_sentences]

# TF-IDF Vectorization over uni-, bi- and tri-grams.
# The vocabulary and IDF weights are fitted on the training split only and
# then applied unchanged to the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_train_tfidf = vectorizer.fit_transform(train_sentences_clean)
X_test_tfidf = vectorizer.transform(test_sentences_clean)

# Train Naïve Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, train_labels_encoded)

# Evaluate: score() returns the mean accuracy on the given data
nb_accuracy = nb_model.score(X_test_tfidf, test_labels_encoded)
print(f"Naïve Bayes Accuracy: {nb_accuracy * 100:.2f}%")

Naïve Bayes Accuracy: 63.75%


Step 4: Conv1D with Token Embedding

In [1]:
# Sizes shared by the tokenizer, the padding step and the Embedding layer are
# named once here instead of being repeated as bare literals.
VOCAB_SIZE = 10000
EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = 100

# Tokenize Text: the vocabulary is fitted on the training sentences only
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Apply Padding so every sequence has exactly MAX_SEQUENCE_LENGTH ids
train_padded = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_padded = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Conv1D Model
# The sequence length is declared with an explicit Input layer; the Embedding
# layer's `input_length` argument is deprecated in Keras 3.
conv1d_model = Sequential([
    Input(shape=(MAX_SEQUENCE_LENGTH,)),
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(len(label_mapping), activation='softmax')
])

# Labels are integer class ids, hence the sparse variant of the loss
conv1d_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation numbers are not an independent estimate - consider a
# separate dev split.
conv1d_model.fit(
    train_padded,
    np.asarray(train_labels_encoded),
    epochs=5,
    batch_size=32,
    validation_data=(test_padded, np.asarray(test_labels_encoded)),
)

# Evaluate: evaluate() returns [loss, accuracy] for the metrics compiled above
conv1d_accuracy = conv1d_model.evaluate(test_padded, np.asarray(test_labels_encoded))[1]
print(f"Conv1D Accuracy: {conv1d_accuracy * 100:.2f}%")

NameError: name 'Tokenizer' is not defined

Step 5: Pretrained Feature Extractor (GloVe)

In [None]:
# Load GloVe: every line is a token followed by its vector components
glove_path = "glove.6B.100d.txt"
GLOVE_DIM = 100  # vector width of the file named above
embedding_index = {}
with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line would otherwise raise IndexError on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = coefs

# Create Embedding Matrix
# Row i holds the GloVe vector of the word with tokenizer index i; words
# missing from GloVe (and unused rows) stay all-zero. The row count is read
# from the tokenizer so it cannot drift from the vocabulary size used there.
vocab_size = tokenizer.num_words
embedding_matrix = np.zeros((vocab_size, GLOVE_DIM))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Conv1D Model with Pretrained Embeddings
# A separate model is built instead of overwriting the embedding weights of
# the already-trained conv1d_model: its convolution and dense layers were
# fitted to the old embeddings, so swapping them in place would leave that
# model inconsistent and untrained for the new inputs.
glove_embedding = Embedding(input_dim=vocab_size, output_dim=GLOVE_DIM, trainable=False)
glove_conv1d_model = Sequential([
    Input(shape=(MAX_SEQUENCE_LENGTH,)),
    glove_embedding,
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(len(label_mapping), activation='softmax')
])
# The Input layer builds the model, so the Embedding weights exist and can be
# filled with the GloVe matrix before compiling; the layer is frozen from the start.
glove_embedding.set_weights([embedding_matrix])

glove_conv1d_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the layers on top of the frozen GloVe embeddings
glove_conv1d_model.fit(
    train_padded,
    np.asarray(train_labels_encoded),
    epochs=5,
    batch_size=32,
    validation_data=(test_padded, np.asarray(test_labels_encoded)),
)

# Evaluate
glove_accuracy = glove_conv1d_model.evaluate(test_padded, np.asarray(test_labels_encoded))[1]
print(f"Conv1D + GloVe Accuracy: {glove_accuracy * 100:.2f}%")

Step 6: Hybrid Model (BERT + Conv1D)

In [None]:
# Load BERT: tokenizer and encoder are taken from the same checkpoint
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

# Extract BERT Embeddings
def extract_bert_embeddings(texts, batch_size=32):
    """Return one BERT feature vector per input text.

    Texts are encoded in mini-batches so that the whole corpus is never
    tokenized and pushed through the model as one tensor. For each text the
    hidden state at position 0 of the last layer is kept.

    Args:
        texts: A single string or an iterable of strings.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A NumPy array with one row per text and one column per hidden unit;
        zero rows if ``texts`` is empty.
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Inputs follow the model, wherever the caller has placed it.
    device = next(bert_model.parameters()).device

    batch_features = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pad only to the longest text of the batch (still capped at 128
        # tokens); the attention mask hides the padding from the model.
        encodings = bert_tokenizer(batch, padding=True, truncation=True, max_length=128, return_tensors="pt")
        with torch.no_grad():
            outputs = bert_model(
                input_ids=encodings["input_ids"].to(device),
                attention_mask=encodings["attention_mask"].to(device),
            )
        batch_features.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    if not batch_features:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)
    return np.concatenate(batch_features, axis=0)

# One feature vector per sentence for each split
train_bert_embeddings = extract_bert_embeddings(train_sentences)
test_bert_embeddings = extract_bert_embeddings(test_sentences)

# Reshape for Conv1D: append a trailing axis of size 1
train_bert_reshaped = train_bert_embeddings[..., np.newaxis]
test_bert_reshaped = test_bert_embeddings[..., np.newaxis]

# Hybrid Model: a small convolutional classifier on top of the BERT features
hybrid_model = Sequential([
    Input(shape=(train_bert_embeddings.shape[1], 1)),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(units=64, activation='relu'),
    Dense(units=len(label_mapping), activation='softmax'),
])

hybrid_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train
hybrid_model.fit(
    train_bert_reshaped,
    np.array(train_labels_encoded),
    validation_data=(test_bert_reshaped, np.array(test_labels_encoded)),
    batch_size=32,
    epochs=5,
)

# Evaluate: index 1 of the result is the accuracy metric compiled above
hybrid_accuracy = hybrid_model.evaluate(
    test_bert_reshaped,
    np.array(test_labels_encoded),
)[1]
print(f"Hybrid Model (BERT + Conv1D) Accuracy: {hybrid_accuracy * 100:.2f}%")