In [None]:
# Step 1: Install required libraries
!pip install tensorflow pandas nltk

# Import necessary libraries
import os
import zipfile
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention, Concatenate
from tensorflow.keras.models import Model
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Fetch the NLTK tokenizer data ('punkt' and 'punkt_tab') used for tokenization below
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Step 2: Extract and Load the Dataset
uploaded_file_path = '/content/archive.zip'  # Path to the uploaded zip file
extracted_path = '/content/dataset/'  # Path to extract the zip file

# Extract the zip file
with zipfile.ZipFile(uploaded_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)

# List files in the extracted folder to identify dataset
extracted_files = os.listdir(extracted_path)
print(f"Files in extracted folder: {extracted_files}")

# Automatically detect the dataset file (a tab-separated .txt file).
# os.listdir() yields entries in arbitrary order and never looks inside
# subfolders, so walk the extracted tree instead: the top-level folder is
# visited first, and names are sorted so the same file is picked on every run.
dataset_file = None
for folder, subfolders, filenames in os.walk(extracted_path):
    subfolders.sort()  # in-place sort fixes the order in which os.walk descends
    txt_files = sorted(name for name in filenames if name.lower().endswith('.txt'))
    if txt_files:
        dataset_file = os.path.join(folder, txt_files[0])
        break

if not dataset_file:
    raise FileNotFoundError("No txt file found in the extracted folder. Please check the uploaded ZIP file.")

# Load the dataset: one conversation pair per line, the two columns separated
# by a tab and no header row.
# NOTE(review): read_csv applies its default double-quote handling here; if the
# dialog text can contain stray '"' characters, consider quoting=csv.QUOTE_NONE
# -- confirm against the actual data.
df = pd.read_csv(dataset_file, delimiter='\t', header=None, names=['input','response'])
print("Dataset loaded successfully!")
print(df.head())

# Step 3: Data Preprocessing
# Both conversation columns must be present before any text cleaning happens
missing_columns = {'input', 'response'} - set(df.columns)
if missing_columns:
    raise ValueError("Dataset must contain 'input' and 'response' columns.")

# Cleaning the text (lowercasing, tokenizing)
def preprocess_text(text):
    """Lowercase *text* and re-join its NLTK word tokens with single spaces.

    The value is coerced with ``str()`` first, so a non-string input is
    tokenized by its string form.
    """
    lowered = str(text).lower()
    return ' '.join(word_tokenize(lowered))

# A row with a missing side (e.g. a line that has no tab separator) would be
# turned into the literal word "nan" by the str() coercion in preprocess_text
# and end up in the training text, so drop incomplete pairs before cleaning.
incomplete_rows = df[['input', 'response']].isna().any(axis=1)
if incomplete_rows.any():
    print(f"Dropping {int(incomplete_rows.sum())} rows with a missing input or response.")
    # reset_index keeps a contiguous positional index after the rows are removed
    df = df.loc[~incomplete_rows].reset_index(drop=True)

df['input'] = df['input'].apply(preprocess_text)
df['response'] = df['response'].apply(preprocess_text)

# Hold out 20% of the pairs as test data (fixed seed so the split is repeatable)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Build one shared vocabulary from every input and response in the dataset
tokenizer = tf.keras.preprocessing.text.Tokenizer()
all_texts = [*df['input'], *df['response']]
tokenizer.fit_on_texts(all_texts)

# Encode the training pairs as integer id sequences and bring each one to
# max_sequence_len entries, with padding added at the end
max_sequence_len = 50
_pad = tf.keras.preprocessing.sequence.pad_sequences
input_sequences = _pad(
    tokenizer.texts_to_sequences(train_data['input']),
    maxlen=max_sequence_len,
    padding='post',
)
response_sequences = _pad(
    tokenizer.texts_to_sequences(train_data['response']),
    maxlen=max_sequence_len,
    padding='post',
)
