In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models.fasttext import load_facebook_model
from indicnlp.tokenize import indic_tokenize
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
from collections import Counter


In [2]:
train_df=pd.read_csv(r"C:\Users\SHAM\OneDrive\Desktop\project\train_extractive.csv",encoding='utf-8')
test_df=pd.read_csv(r"C:\Users\SHAM\OneDrive\Desktop\project\test_extractive.csv",encoding='utf-8')
valid_df=pd.read_csv(r"C:\Users\SHAM\OneDrive\Desktop\project\val_extractive.csv", encoding='utf-8')

In [3]:

train_articles, train_summaries = train_df['extractive_summary1'].tolist(), train_df['targets'].tolist()
valid_articles, valid_summaries = valid_df['extractive_summary1'].tolist(), valid_df['targets'].tolist()
test_articles, test_summaries = test_df['extractive_summary1'].tolist(), test_df['targets'].tolist()


In [4]:

# Subset data
train_articles = train_articles[:300]
train_summaries = train_summaries[:300]
valid_articles = valid_articles[:100]
valid_summaries = valid_summaries[:100]
test_articles = test_articles[:100]
test_summaries = test_summaries[:100]


In [5]:

# Add <start> and <end> tokens to summaries
train_summaries = ['<start> ' + text + ' <end>' for text in train_summaries]
valid_summaries = ['<start> ' + text + ' <end>' for text in valid_summaries]
test_summaries = ['<start> ' + text + ' <end>' for text in test_summaries]


In [6]:

# Custom tokenizer with indic_tokenize
def custom_tokenize(text):
    if not isinstance(text, str) or not text.strip():
        return ['<unk>']
    return indic_tokenize.trivial_tokenize(text, lang='te')


In [7]:

class IndicTokenizer:
    def __init__(self, oov_token='<unk>'):
        self.word_index = {}
        self.index_word = {}
        self.oov_token = oov_token
        self.word_counts = {}

    def fit_on_texts(self, texts):
        word_counts = Counter()
        for text in texts:
            tokens = custom_tokenize(text)
            word_counts.update(tokens)
        self.word_index = {self.oov_token: 1, '<pad>': 0, '<start>': 2, '<end>': 3}
        for i, (word, _) in enumerate(word_counts.most_common()):
            self.word_index[word] = i + 4
        self.index_word = {i: w for w, i in self.word_index.items()}
        self.word_counts = word_counts

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            tokens = custom_tokenize(text)
            seq = [self.word_index.get(token, self.word_index[self.oov_token]) for token in tokens]
            sequences.append(seq)
        return sequences


In [8]:

tokenizer = IndicTokenizer(oov_token='<unk>')
tokenizer.fit_on_texts(train_articles + train_summaries)
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocab_size}")


Vocabulary size: 8168


In [9]:

# Convert texts to sequences
train_article_seq = tokenizer.texts_to_sequences(train_articles)
train_summary_seq = tokenizer.texts_to_sequences(train_summaries)
valid_article_seq = tokenizer.texts_to_sequences(valid_articles)
valid_summary_seq = tokenizer.texts_to_sequences(valid_summaries)
test_article_seq = tokenizer.texts_to_sequences(test_articles)
test_summary_seq = tokenizer.texts_to_sequences(test_summaries)


In [10]:

# Calculate maximum lengths
max_article_len = min(max([len(seq) for seq in train_article_seq]), 1000)
max_summary_len = min(max([len(seq) for seq in train_summary_seq]), 50)
print(f"Max article length: {max_article_len}")
print(f"Max summary length: {max_summary_len}")


Max article length: 207
Max summary length: 21


In [11]:

# Pad sequences
train_article_padded = pad_sequences(train_article_seq, maxlen=max_article_len, padding='post')
train_summary_padded = pad_sequences(train_summary_seq, maxlen=max_summary_len, padding='post')
valid_article_padded = pad_sequences(valid_article_seq, maxlen=max_article_len, padding='post')
valid_summary_padded = pad_sequences(valid_summary_seq, maxlen=max_summary_len, padding='post')
test_article_padded = pad_sequences(test_article_seq, maxlen=max_article_len, padding='post')
test_summary_padded = pad_sequences(test_summary_seq, maxlen=max_summary_len, padding='post')


In [12]:

# Load FastText embeddings
embedding_dim = 300
ft_model = load_facebook_model('C:/Users/SHAM/OneDrive/Desktop/wiki.te/wiki.te.bin')
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        try:
            embedding_vector = ft_model.wv[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.random.normal(0, 0.1, embedding_dim)
print(f"Embedding matrix shape: {embedding_matrix.shape}")


Embedding matrix shape: (8168, 300)


In [13]:

# Parameters
latent_dim = 256
num_encoder_layers = 2
batch_size = 4


In [14]:

# Prepare decoder input/output
train_decoder_input = train_summary_padded[:, :-1]
train_decoder_output = train_summary_padded[:, 1:]
valid_decoder_input = valid_summary_padded[:, :-1]
valid_decoder_output = valid_summary_padded[:, 1:]


In [15]:

# Reshape decoder output
train_decoder_output = train_decoder_output[..., tf.newaxis]
valid_decoder_output = valid_decoder_output[..., tf.newaxis]

# Create tf.data.Dataset
train_dataset = tf.data.Dataset.from_tensor_slices(
    ((train_article_padded, train_decoder_input), train_decoder_output)
).batch(batch_size, drop_remainder=True)

valid_dataset = tf.data.Dataset.from_tensor_slices(
    ((valid_article_padded, valid_decoder_input), valid_decoder_output)
).batch(batch_size, drop_remainder=True)


In [16]:

# Encoder
encoder_inputs = Input(shape=(max_article_len,))
enc_emb_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True)
enc_emb = enc_emb_layer(encoder_inputs)
enc_emb_projected = TimeDistributed(Dense(latent_dim * 2))(enc_emb)

encoder_outputs = enc_emb_projected
encoder_states = []
for i in range(num_encoder_layers):
    encoder_lstm = Bidirectional(
        LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4),
        merge_mode='concat', name=f'bidirectional_{i}'
    )
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_outputs)
    encoder_states.append([forward_h, forward_c, backward_h, backward_c])

state_h = Concatenate(name='concatenate_h')([encoder_states[-1][0], encoder_states[-1][2]])
state_c = Concatenate(name='concatenate_c')([encoder_states[-1][1], encoder_states[-1][3]])
encoder_states_final = [state_h, state_c]


In [17]:

# Attention with Weight Normalization
class AttentionWithWeightNorm(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(AttentionWithWeightNorm, self).__init__(**kwargs)

    def build(self, input_shape):
        hidden_dim = input_shape[0][-1]
        self.W_s = self.add_weight(
            name='W_s', shape=(hidden_dim, hidden_dim), initializer='glorot_normal',
            constraint=tf.keras.constraints.UnitNorm(axis=0)
        )
        self.W_h = self.add_weight(
            name='W_h', shape=(hidden_dim, hidden_dim), initializer='glorot_normal',
            constraint=tf.keras.constraints.UnitNorm(axis=0)
        )
        self.v = self.add_weight(
            name='v', shape=(hidden_dim, 1), initializer='glorot_normal',
            constraint=tf.keras.constraints.UnitNorm(axis=0)
        )
        super(AttentionWithWeightNorm, self).build(input_shape)

    def call(self, inputs):
        encoder_outputs, decoder_outputs = inputs
        score_s = tf.tensordot(encoder_outputs, self.W_s, axes=[[2], [0]])
        score_h = tf.tensordot(decoder_outputs, self.W_h, axes=[[2], [0]])
        score_s = tf.expand_dims(score_s, axis=1)
        score_h = tf.expand_dims(score_h, axis=2)
        score = tf.nn.tanh(score_s + score_h)
        score = tf.tensordot(score, self.v, axes=[[3], [0]])
        score = tf.squeeze(score, axis=-1)
        attention_weights = tf.nn.softmax(score, axis=-1)
        context_vector = tf.matmul(attention_weights, encoder_outputs)
        return context_vector, attention_weights


In [18]:

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)
dec_emb_projected = TimeDistributed(Dense(latent_dim * 2))(dec_emb)

decoder_lstm = LSTM(latent_dim * 2, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(dec_emb_projected, initial_state=encoder_states_final)

attention_layer = AttentionWithWeightNorm()
context_vector, attention_weights = attention_layer([encoder_outputs, decoder_outputs])

decoder_combined_context = Concatenate(name='concatenate_context')([decoder_outputs, context_vector])
decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_combined_context)

# Define and compile model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)





In [20]:

# Train model with increased epochs and regularization
history = model.fit(
    train_dataset,
    epochs=50,  # Increased epochs for better training
    validation_data=valid_dataset,
    callbacks=[early_stopping, checkpoint]
)


Epoch 1/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 806ms/step - accuracy: 0.5706 - loss: 3.2069 - val_accuracy: 0.6530 - val_loss: 3.0282
Epoch 2/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 752ms/step - accuracy: 0.6906 - loss: 2.2305 - val_accuracy: 0.6750 - val_loss: 3.1174
Epoch 3/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 733ms/step - accuracy: 0.6940 - loss: 2.0685 - val_accuracy: 0.6810 - val_loss: 3.2812
Epoch 4/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 760ms/step - accuracy: 0.6975 - loss: 1.9447 - val_accuracy: 0.6690 - val_loss: 3.3147
Epoch 5/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 733ms/step - accuracy: 0.7012 - loss: 1.7879 - val_accuracy: 0.6875 - val_loss: 3.3905
Epoch 6/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 731ms/step - accuracy: 0.7072 - loss: 1.6867 - val_accuracy: 0.6930 - val_loss: 3.5479
Epoch 7/50
[1m75/75[

In [21]:

# Encoder model
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

# Decoder model
decoder_inputs = Input(shape=(None,))
decoder_encoder_outputs = Input(shape=(max_article_len, latent_dim * 2))
state_h_input = Input(shape=(latent_dim * 2,))
state_c_input = Input(shape=(latent_dim * 2,))
decoder_states_inputs = [state_h_input, state_c_input]

dec_emb = dec_emb_layer(decoder_inputs)
dec_emb_projected = TimeDistributed(Dense(latent_dim * 2))(dec_emb)
decoder_outputs, dec_h, dec_c = decoder_lstm(dec_emb_projected, initial_state=decoder_states_inputs)
context_vector, attention_weights = attention_layer([decoder_encoder_outputs, decoder_outputs])
decoder_combined_context = Concatenate(name='concatenate_context')([decoder_outputs, context_vector])
decoder_outputs = decoder_dense(decoder_combined_context)

decoder_model = Model(
    [decoder_inputs, decoder_encoder_outputs] + decoder_states_inputs,
    [decoder_outputs, attention_weights, dec_h, dec_c]
)


In [22]:

# Updated generate_summary with robust beam search
def generate_summary(input_article, tokenizer, encoder_model, decoder_model, max_summary_len, vocab_size, beam_width=3):
    try:
        if not isinstance(input_article, str) or not input_article.strip():
            return "<empty_input>"

        # Tokenize and pad input
        article_seq = tokenizer.texts_to_sequences([input_article])
        article_padded = pad_sequences(article_seq, maxlen=max_article_len, padding='post')
        article_padded = tf.convert_to_tensor(article_padded, dtype=tf.int32)

        # Validate indices
        if article_padded.numpy().max() >= vocab_size:
            return f"<invalid_index: max {article_padded.numpy().max()}>"

        # Encoder forward pass
        encoder_outputs, state_h, state_c = encoder_model(article_padded)

        # Initialize beam search
        start_token_id = tokenizer.word_index['<start>']
        end_token_id = tokenizer.word_index['<end>']
        beams = [(np.array([start_token_id]), 0.0, state_h, state_c)]  # (sequence, score, h, c)
        completed = []
        min_length = 5  # Enforce minimum summary length

        for step in range(max_summary_len):
            new_beams = []
            for seq, score, h, c in beams:
                if seq[-1] == end_token_id and len(seq) >= min_length:
                    completed.append((seq, score))
                    continue

                target_seq = tf.convert_to_tensor([[seq[-1]]], dtype=tf.int32)
                outputs, attn_weights, next_h, next_c = decoder_model([target_seq, encoder_outputs, h, c])
                probs = outputs[0, 0]  # [vocab_size]
                probs = tf.nn.softmax(probs)
                probs = probs / tf.reduce_sum(probs)  # Normalize to avoid numerical issues
                top_probs, top_indices = tf.math.top_k(probs, k=beam_width)

                # Penalize repetitive tokens
                seen_tokens = set(seq)
                for prob, idx in zip(top_probs.numpy(), top_indices.numpy()):
                    if idx in seen_tokens and idx not in [start_token_id, end_token_id]:
                        prob *= 0.7  # Reduce probability for repeated tokens
                    if idx >= vocab_size:  # Cap invalid indices
                        idx = tokenizer.word_index['<unk>']
                    new_seq = np.append(seq, idx)
                    new_score = score + np.log(prob + 1e-10)
                    new_beams.append((new_seq, new_score, next_h, next_c))

            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
            if len(completed) >= beam_width:
                break

        # Select best sequence
        if completed:
            best_seq, best_score = max(completed, key=lambda x: x[1])
        else:
            best_seq, best_score = max(beams, key=lambda x: x[1])

        # Debug generated sequence
        print(f"Generated indices: {best_seq}")
        # Decode sequence to text
        summary = [tokenizer.index_word.get(idx, '<unk>') for idx in best_seq
                   if idx not in [start_token_id, end_token_id, tokenizer.word_index.get('<pad>', 0)]]
        return ' '.join(summary) if summary else "<no_output>"

    except Exception as e:
        print(f"Error summarizing text: {input_article[:50]}... | Error: {str(e)}")
        return f"<error: {str(e)}>"


In [23]:

# Evaluate summaries
def evaluate_metrics(articles, summaries, tokenizer, encoder_model, decoder_model, max_summary_len, vocab_size):
    rouge = Rouge()
    bleu_scores, rouge_scores = [], []
    for article, target in zip(articles, summaries):
        generated = generate_summary(article, tokenizer, encoder_model, decoder_model, max_summary_len, vocab_size)
        target = target.replace('<start> ', '').replace(' <end>', '')
        if generated and target and not any(x in generated for x in ['<error', '<invalid', '<empty', '<no_output']):
            bleu = sentence_bleu([target.split()], generated.split(), weights=(0.25, 0.25, 0.25, 0.25))
            rouge_score = rouge.get_scores(generated, target, avg=True)['rouge-l']['f']
            bleu_scores.append(bleu)
            rouge_scores.append(rouge_score)
    return (np.mean(bleu_scores) if bleu_scores else 0.0, np.mean(rouge_scores) if rouge_scores else 0.0)

# Test inference
bleu, rouge = evaluate_metrics(test_articles, test_summaries, tokenizer, encoder_model, decoder_model, max_summary_len, vocab_size)
print(f"Final Evaluation -> BLEU: {bleu:.4f}, ROUGE-L: {rouge:.4f}")


Error summarizing text: తాడేపల్లి బైపాస్ కుంచనపల్లి కూడలి టీడీపీ, జనసేన నా... | Error: too many values to unpack (expected 2)
Error summarizing text: ఏపీ సీఎం వైఎస్ జగన్, సతీమణి భారతితో కలిసి లండన్కు ... | Error: too many values to unpack (expected 2)
Error summarizing text: రైతులు, వరిసాగుపై వివాదస్పద వ్యాఖ్యలు సిద్దిపేట కల... | Error: too many values to unpack (expected 2)
Error summarizing text: పోలీసు అమరవీరుల సంస్మరణ దినం జరుపుకోవడానికి ప్రధాన... | Error: too many values to unpack (expected 2)
Error summarizing text: ప్రభుత్వం తరఫున మంచి పనులెన్నో చేశాం ప్రజలకు వివరి... | Error: too many values to unpack (expected 2)
Error summarizing text: కచ్చితమైన బౌలింగ్కు గ్లెన్ మెక్గ్రాత్ పెట్టింది పే... | Error: too many values to unpack (expected 2)
Error summarizing text: నగరంలో శ్రీవారి లడ్డూ ప్రసాద విక్రయాలు ముగిశాయి లక... | Error: too many values to unpack (expected 2)
Error summarizing text: ముఖ్యమంత్రి కేసీఆర్ పైనా, టీఆర్ఎస్ నేతలపైనా బీజేపీ... | Error: too many values to unpack (expe

KeyboardInterrupt: 

In [None]:

# Example summaries
for i, article in enumerate([
    "సిద్దిపేట ఘటనపై ఎమ్మెల్యే రఘునందన్ రావు హైకోర్టును ఆశ్రయించారు 18 లక్షలు దొరికాయంటూ కట్టు కథ అల్లారని రఘునందన్ రావు పిటిషన్లో పేర్కొన్నారు ఎమ్మెల్యేలపై క్రిమినల్ కేసులను సీజే ధర్మాసనం విచారిస్తుందని జస్టిస్ లక్ష్మణ్ బెంచ్ చెప్పింది ప్రధాన న్యాయమూర్తి ధర్మాసనానికి బదిలీ చేయాలని రిజిస్టీకి న్యాయమూర్తి ఆదేశించారు",
    "ఏపీఎస్ఆర్టీసీ ఉద్యోగులకు సీఎం జగన్ శుభవార్త చెప్పారు 1 168 మందికి కారుణ్య నియామకాల ఉద్యోగాలు ఇచ్చేందుకు అంగీకరించారు",
    "పార్లమెంట్లో సమావేశమైన విపక్ష నేతలు మృతులకు కుటుంబాలకు ప్రగాఢ సానుభూతి తెలిపారు విపక్ష నేతలు ఉగ్రవాదుల దాడిని తీవ్రంగా ఖండించారు అనంతరం ఉప రాష్ట్రపతి అభ్యర్థిపై విపక్ష నేతలు చర్చిస్తున్నారు ఉప రాష్ట్రపతి అభ్యర్థిని ఏకగ్రీవంగా ఎన్నుకునే అవకాశం ఉంది"
]):
    summary = generate_summary(article, tokenizer, encoder_model, decoder_model, max_summary_len, vocab_size)
    print(f"Article {i+1}: {article[:50]}...")
    print(f"Generated Summary: {summary}\n")