In [1]:
# RNN for Educational Text Classification and Next Word Generation

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# -------------------- PHASE 1: DATASET CREATION --------------------

# Sample synthetic dataset for classification
# Each sample keeps its sentence and its subject label side by side so the
# two parallel lists derived below cannot drift out of alignment.
_LABELED_SAMPLES = [
    ("Algebra teaches us about variables and equations.", "Math"),
    ("The cell is the basic unit of life.", "Science"),
    ("World War II ended in 1945.", "History"),
    ("Photosynthesis is essential for plant life.", "Science"),
    ("Calculus helps in finding areas under curves.", "Math"),
    ("Newton discovered gravity.", "Science"),
    ("The French Revolution changed Europe.", "History"),
    ("Multiplication is repeated addition.", "Math"),
    ("The water cycle involves evaporation and precipitation.", "Science"),
    ("Pythagorean theorem is used in right-angle triangles.", "Math"),
]
texts = [sentence for sentence, _ in _LABELED_SAMPLES]
labels = [subject for _, subject in _LABELED_SAMPLES]

# Single-paragraph History corpus used to train the next-word generator.
# The leading and trailing newlines are part of the value.
history_corpus = (
    "\n"
    "The French Revolution was a period of radical social and political change in France. "
    "It began in 1789 when revolutionaries stormed the Bastille, a state prison in Paris. "
    "The revolution led to the end of monarchy and rise of democracy. "
    "Important figures like Robespierre and Napoleon played major roles. "
    "The revolution introduced concepts of liberty, equality, and fraternity, "
    "inspiring future movements around the world."
    "\n"
)

# -------------------- PHASE 2: TEXT PREPROCESSING --------------------

# Classification Preprocessing
# Map the subject names to integer class ids.
label_enc = LabelEncoder()
encoded_labels = label_enc.fit_transform(labels)

# Build the word index from the classification sentences; one extra slot is
# added to the vocabulary size on top of the number of indexed words.
tokenizer_cls = Tokenizer()
tokenizer_cls.fit_on_texts(texts)
vocab_size_cls = 1 + len(tokenizer_cls.word_index)

# Turn every sentence into a list of word ids, then pad at the end so all
# rows share the same length.
sequences = tokenizer_cls.texts_to_sequences(texts)
padded_cls = pad_sequences(sequences, padding='post')

# Hold out 20% of the samples for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    padded_cls,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

# Generation Preprocessing
# Word index for the generator, fitted on the history corpus only.
tokenizer_gen = Tokenizer()
tokenizer_gen.fit_on_texts([history_corpus])
total_words = len(tokenizer_gen.word_index) + 1

# Split the corpus into sentence-like pieces on "." and, for each piece,
# collect every prefix containing at least two tokens.
corpus = history_corpus.lower().split(".")
input_sequences = []
for tokens in tokenizer_gen.texts_to_sequences(corpus):
    input_sequences.extend(
        tokens[:end] for end in range(2, len(tokens) + 1)
    )

# Left-pad all prefixes to the length of the longest one.
max_seq_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# Everything but the last column is the model input; the last column is the
# word to predict, one-hot encoded over the whole vocabulary.
next_word_ids = input_sequences[:, -1]
X_gen = input_sequences[:, :-1]
y_gen = tf.keras.utils.to_categorical(next_word_ids, num_classes=total_words)

# -------------------- PHASE 3: MODEL BUILDING --------------------

# Classification Model
# Size the output layer from the fitted label encoder instead of hard-coding
# the number of subjects, so adding a new label does not silently break the
# softmax layer.
num_classes = len(label_enc.classes_)

model_cls = Sequential([
    # Declare the input shape with an explicit Input layer; the Embedding
    # `input_length` argument is deprecated in recent Keras releases.
    tf.keras.Input(shape=(padded_cls.shape[1],)),
    # The sentences are padded with zeros at the end. mask_zero=True lets the
    # RNN skip those padded steps so its final state reflects the last real
    # word rather than trailing padding.
    Embedding(input_dim=vocab_size_cls, output_dim=32, mask_zero=True),
    SimpleRNN(32),
    Dense(num_classes, activation='softmax'),
])

# Labels are integer class ids, hence the sparse variant of the loss.
model_cls.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Generation Model
model_gen = Sequential([
    # Declare the input shape with an explicit Input layer; the Embedding
    # `input_length` argument is deprecated in recent Keras releases.
    # Each input row is a padded prefix without its final (target) token.
    tf.keras.Input(shape=(max_seq_len - 1,)),
    Embedding(total_words, 32),
    SimpleRNN(64),
    # One output unit per vocabulary entry: a distribution over the next word.
    Dense(total_words, activation='softmax'),
])

# Targets are one-hot vectors, hence the non-sparse categorical loss.
model_gen.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# -------------------- PHASE 4: TRAINING AND EVALUATION --------------------

# Train Classification
# Fit the subject classifier on the training split, then score it on the
# held-out sentences.
model_cls.fit(x=X_train, y=y_train, epochs=20, verbose=1)
loss, accuracy = model_cls.evaluate(x=X_test, y=y_test)
print("\nClassification Accuracy:", accuracy)

# Fit the next-word model on the padded prefixes and their one-hot targets.
model_gen.fit(x=X_gen, y=y_gen, epochs=50, verbose=1)

# Text Generation
def generate_text(seed_text, next_words=20):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    On every step the current text is tokenised with ``tokenizer_gen``,
    left-padded (and, if too long, truncated) to ``max_seq_len - 1`` ids and
    fed to ``model_gen``; the highest-scoring word is appended.

    Args:
        seed_text: Text to start from.
        next_words: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early if the model predicts an id that has no word
        (for example the padding id 0).
    """
    for _ in range(next_words):
        token_list = tokenizer_gen.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')
        probabilities = model_gen.predict(token_list, verbose=0)
        # predict() returns one row per input; take that single row's argmax
        # as a plain int instead of comparing a length-1 array to ints.
        predicted = int(np.argmax(probabilities, axis=-1)[0])
        # Direct id -> word lookup instead of scanning word_index every step.
        output_word = tokenizer_gen.index_word.get(predicted, "")
        if not output_word:
            # An empty word would leave the token list unchanged, so every
            # remaining step would repeat the same prediction and only add
            # trailing spaces. Stop instead.
            break
        seed_text += " " + output_word
    return seed_text

# Example generation
# Run the generator once from a History-themed prompt and show the result.
seed = "The French Revolution began with a major"
generated = generate_text(seed)
print("\nGenerated Text:\n", generated)


Epoch 1/20




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.1250 - loss: 1.1063
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.2500 - loss: 1.0685
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.5000 - loss: 1.0318
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.7500 - loss: 0.9956
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.8750 - loss: 0.9596
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 1.0000 - loss: 0.9233
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 1.0000 - loss: 0.8866
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 1.0000 - loss: 0.8494
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms