In [1]:
import unicodedata

import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Mount Google Drive so files stored there are reachable under /content/drive
drive.mount('/content/drive')

# Load the dataset from Google Drive.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError on
# train.csv -- confirm both files exist at these exact paths (the drive root
# folder may be exposed as 'MyDrive' rather than 'My Drive'; verify in Colab).
train_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/train.csv')
test_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/test.csv')

# Use only the first 100 rows of each dataset. head() returns a slice of the
# loaded frame, so take an explicit copy: the column assignments performed
# further down then write to an independent frame instead of a slice, which
# avoids pandas' SettingWithCopyWarning.
train_df = train_df.head(100).copy()
test_df = test_df.head(100).copy()

# Check the columns
print(train_df.columns)
print(test_df.columns)

# Preprocessing
def preprocess_text(text):
    """Return *text* lower-cased with punctuation and symbols removed.

    Characters are dropped when their Unicode general category is a
    punctuation ("P*") or symbol ("S*") class. Letters, combining marks,
    digits and whitespace are kept, so non-Latin scripts (for example
    Devanagari, including its vowel signs) survive intact.

    The previous implementation called ``str.replace`` with a regex-looking
    pattern; ``str.replace`` matches literal substrings only, so nothing was
    ever removed.

    Args:
        text: The value to clean. Anything that is not a ``str`` (``None``,
            NaN floats, numbers, ...) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    # The first letter of the category code is the major class:
    # "P" = punctuation, "S" = symbol.
    return "".join(
        ch for ch in lowered if unicodedata.category(ch)[0] not in ("P", "S")
    )

def _prepare_text_columns(frame):
    """Clean *frame* in place and add the derived model text columns.

    Missing values in 'headline' and 'article' are replaced by empty strings,
    then 'input_text' is derived from 'article' and 'summary_text' from
    'headline' by running each value through ``preprocess_text``.
    """
    # Fill NaN values with empty strings
    for column in ('headline', 'article'):
        frame[column] = frame[column].fillna("")

    # Adjust the column names as per the actual CSV file
    frame['input_text'] = frame['article'].apply(preprocess_text)
    frame['summary_text'] = frame['headline'].apply(preprocess_text)


# Both splits get exactly the same treatment.
_prepare_text_columns(train_df)
_prepare_text_columns(test_df)

# Tokenization and padding
# Fixed lengths (in tokens) that every article / headline sequence is brought to.
max_input_len, max_summary_len = 100, 20

# Article side: build the vocabulary from the training articles, convert each
# article to a list of word indices, then pad at the end up to max_input_len.
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(train_df['input_text'])
input_sequences = pad_sequences(
    input_tokenizer.texts_to_sequences(train_df['input_text']),
    maxlen=max_input_len,
    padding='post',
)

# Headline side: a separate tokenizer, so the summary vocabulary is independent
# of the article vocabulary; padded at the end up to max_summary_len.
summary_tokenizer = Tokenizer()
summary_tokenizer.fit_on_texts(train_df['summary_text'])
summary_sequences = pad_sequences(
    summary_tokenizer.texts_to_sequences(train_df['summary_text']),
    maxlen=max_summary_len,
    padding='post',
)

# Define the model
latent_dim = 300

# Vocabulary sizes for the embedding and output layers. The +1 accounts for
# index 0, which the Keras Tokenizer never assigns to a word and which
# pad_sequences uses as the padding value.
input_vocab_size = len(input_tokenizer.word_index) + 1
summary_vocab_size = len(summary_tokenizer.word_index) + 1

# The training arrays fed to the decoder are the summaries with one position
# sliced off (input = all but the last token, target = all but the first), so
# the decoder sees max_summary_len - 1 time steps. Declaring the Input with
# max_summary_len steps would not match those arrays when fitting.
decoder_seq_len = max_summary_len - 1

# Encoder: embed the article tokens and run a bidirectional LSTM over them.
encoder_inputs = tf.keras.Input(shape=(max_input_len,))
encoder_embedding = tf.keras.layers.Embedding(input_vocab_size, latent_dim, trainable=True)(encoder_inputs)
encoder_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
# Join the forward and backward final states so they can seed the decoder,
# whose LSTM is therefore twice as wide (latent_dim * 2).
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])

# Decoder: embed the summary tokens and run an LSTM initialised with the
# encoder's final states.
decoder_inputs = tf.keras.Input(shape=(decoder_seq_len,))
decoder_embedding = tf.keras.layers.Embedding(summary_vocab_size, latent_dim, trainable=True)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(latent_dim * 2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Attention Layer: decoder outputs are the query, encoder outputs the value.
attention = tf.keras.layers.AdditiveAttention()
attention_output = attention([decoder_outputs, encoder_outputs])
decoder_concat_input = tf.keras.layers.Concatenate(axis=-1)([decoder_outputs, attention_output])

# Dense layer: per-time-step softmax over the summary vocabulary.
dense = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(summary_vocab_size, activation='softmax'))
decoder_outputs = dense(decoder_concat_input)

# Compile model
# NOTE(review): padding positions (index 0) are not masked, so they count
# towards the loss and the reported accuracy -- confirm this is intended.
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare data for training.
# The decoder input is every summary with its last position dropped, and the
# target is the same summary with its first position dropped, so at each step
# the model is trained to predict the next token.
# NOTE(review): the summaries carry no explicit start/end markers, so the first
# word of a summary is never a prediction target -- confirm this is intended.
decoder_input_data = summary_sequences[:, :-1]

# summary_sequences already has exactly max_summary_len columns, so both
# slices are max_summary_len - 1 wide; re-padding them to that same length was
# a no-op and has been removed. The trailing axis gives the targets the shape
# (samples, max_summary_len - 1, 1).
decoder_target_data = np.expand_dims(summary_sequences[:, 1:], -1)

# Train the model: article sequences feed the encoder, shifted summaries feed
# the decoder; 20% of the samples are held out for validation.
training_inputs = [input_sequences, decoder_input_data]
model.fit(
    training_inputs,
    decoder_target_data,
    batch_size=20,
    epochs=100,
    validation_split=0.2,
)

# Save the model (HDF5 file in the current working directory)
model.save('hindi_text_summarization.h5')

print("Model trained and saved successfully.")


Mounted at /content/drive


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Colab Notebooks/train.csv'