#Movie Script Generator

In [None]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import re

def preprocess_script(text):
    """
    Clean a raw script and mark lines that look like character names.

    Every line is trimmed of surrounding whitespace and blank lines are dropped.
    A line that is entirely upper-case and has at most five whitespace-separated
    words is wrapped in <CHARACTER>...</CHARACTER>. The remaining lines are
    returned joined by newlines.
    """
    def tag_if_character(entry):
        # Short all-caps lines are treated as character cues
        looks_like_name = entry.isupper() and len(entry.split()) <= 5
        return f"<CHARACTER>{entry}</CHARACTER>" if looks_like_name else entry

    # Trim lazily, then keep only the non-empty lines
    trimmed_lines = (raw_line.strip() for raw_line in text.split('\n'))
    return '\n'.join(tag_if_character(entry) for entry in trimmed_lines if entry)

# Locations of the raw script and of the cleaned copy
file_path = '/content/inglorious_basterds_script.txt'  # Replace with your script file path
preprocessed_file_path = '/content/preprocessed_script.txt'  # Replace with your desired file path

# Load the raw script in full
with open(file_path, 'r', encoding='utf-8') as source_handle:
    entire_script_content = source_handle.read()

# Clean the text, then persist it so later cells can read it back
preprocessed_script = preprocess_script(entire_script_content)
with open(preprocessed_file_path, 'w', encoding='utf-8') as target_handle:
    target_handle.write(preprocessed_script)


In [None]:
import string
from collections import Counter


def tokenize_script(script_lines):
    """
    Convert script lines into lists of integer word indices.

    Each line is split on whitespace, every word has leading/trailing
    punctuation stripped, and empty tokens and English stop words (compared
    case-insensitively) are discarded. The surviving tokens form the
    vocabulary, ranked by descending frequency (ties keep first-seen order)
    and numbered from 1.

    Args:
        script_lines: iterable of strings, one per script line.

    Returns:
        A tuple (tokenized_script, word_to_index): tokenized_script has one
        list of indices per input line (possibly empty), and word_to_index
        maps each vocabulary token to its 1-based index.
    """
    stop_words = set(stopwords.words('english'))

    def clean_tokens(line):
        # One rule for both the vocabulary and the encoding: strip edge
        # punctuation first, then drop empty tokens and stop words.
        stripped = (word.strip(string.punctuation) for word in line.split())
        return [token for token in stripped if token and token.lower() not in stop_words]

    # Tokenise each line once and reuse it for counting and for encoding
    tokens_per_line = [clean_tokens(line) for line in script_lines]

    # NOTE(review): only edge punctuation is stripped, so markup such as
    # "<CHARACTER>NAME</CHARACTER>" yields tokens like "CHARACTER>NAME".
    # Confirm whether that is intended before changing the vocabulary.
    word_counts = Counter(token for tokens in tokens_per_line for token in tokens)

    # most_common() orders by descending count and keeps first-seen order on ties
    word_to_index = {
        word: index
        for index, (word, _) in enumerate(word_counts.most_common(), 1)  # index 0 stays unused
    }

    # Every cleaned token is in the vocabulary by construction
    tokenized_script = [[word_to_index[token] for token in tokens] for tokens in tokens_per_line]

    return tokenized_script, word_to_index

# Location of the cleaned script written by the preprocessing step
preprocessed_file_path = '/content/preprocessed_script.txt'  # Replace with your file path

# Load the cleaned script and break it into individual lines
with open(preprocessed_file_path, 'r', encoding='utf-8') as script_handle:
    script_content = script_handle.read()
script_lines = script_content.split('\n')

# Encode every line as a list of vocabulary indices
tokenized_script, word_to_index = tokenize_script(script_lines)

# Preview: the first five encoded lines and the first ten vocabulary entries
print(tokenized_script[:5])
print(dict(list(word_to_index.items())[:10]))


[[1112, 1113], [1114, 525, 611, 1524, 1525, 271, 16, 156, 1115, 401], [720, 464, 1526, 721], [402, 2373], [1527, 1528, 1529]]
{'CHARACTER>COL': 1, 'LANDA</CHARACTER': 2, 'CHARACTER>LT': 3, 'German': 4, 'CHARACTER>SHOSANNA:</CHARACTER': 5, 'Shosanna': 6, 'ALDO</CHARACTER': 7, 'one': 8, 'I’m': 9, 'back': 10}


In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# Assuming 'tokenized_script' and 'word_to_index' are available from your tokenization process

# Prepare data for training
def prepare_sequences(tokenized_script, sequence_length, num_classes=None):
    """
    Build next-word training pairs from the tokenized script.

    For every line, each prefix of two or more tokens becomes one sample: the
    prefix is left-padded (or left-truncated) to `sequence_length`, its last
    token is the target and the preceding `sequence_length - 1` positions are
    the input.

    Args:
        tokenized_script: list of lines, each a list of integer word indices.
        sequence_length: padded length of each sample, target included.
        num_classes: width of the one-hot targets. Defaults to
            len(word_to_index) + 1, read from the module-level vocabulary
            (the extra slot covers padding index 0).

    Returns:
        A tuple (X, y): X is the array of input windows and y the one-hot
        encoded targets.
    """
    if num_classes is None:
        # Backward-compatible default: fall back to the global vocabulary size
        num_classes = len(word_to_index) + 1

    # Every prefix with at least one input token and one target token
    prefixes = [
        line[:end]
        for line in tokenized_script
        for end in range(2, len(line) + 1)
    ]

    # Pad all prefixes in one call instead of once per prefix
    padded = pad_sequences(prefixes, maxlen=sequence_length, padding='pre')

    # Copy the slice so X is a standalone contiguous array
    X = np.array(padded[:, :-1])
    y = to_categorical(padded[:, -1], num_classes=num_classes)
    return X, y


In [None]:
# Define sequence length and prepare sequences.
# sequence_length is the padded length of every training sample: prepare_sequences
# uses the last position as the target and the preceding sequence_length - 1
# positions as the input. generate_text reads this same global to size its input.
sequence_length = 50  # You can adjust this
# X holds the input windows, y the one-hot encoded next-word targets
X, y = prepare_sequences(tokenized_script, sequence_length)

In [None]:
def generate_text(seed_text, num_words, model, word_to_index, index_to_word, max_input_len=None):
    """
    Greedily generate up to `num_words` words that continue `seed_text`.

    At each step the running text is encoded with `word_to_index`, left-padded
    to the model's input length, and the most probable next word is appended.

    Args:
        seed_text: starting text; words not found in the vocabulary are ignored.
        num_words: number of words to generate.
        model: object whose predict(batch, verbose=0) returns per-class scores.
        word_to_index: mapping from word to integer index.
        index_to_word: mapping from integer index back to word.
        max_input_len: length the encoded input is padded to. Defaults to the
            module-level `sequence_length - 1`.

    Returns:
        The generated words joined by single spaces (the seed is not included).
        May hold fewer than `num_words` words if no predicted index has a word.
    """
    import string  # local import keeps the function self-contained when copied into an app

    if max_input_len is None:
        max_input_len = sequence_length - 1

    def encode(text):
        # Try the raw word first, then the punctuation-stripped form, since
        # the vocabulary keys are built from stripped tokens.
        indices = []
        for word in text.split():
            if word not in word_to_index:
                word = word.strip(string.punctuation)
            if word in word_to_index:
                indices.append(word_to_index[word])
        return indices

    generated = []
    for _ in range(num_words):
        encoded = pad_sequences([encode(seed_text)], maxlen=max_input_len, padding='pre')
        y_pred = model.predict(encoded, verbose=0)

        # Get the index with the highest probability
        predicted_index = int(np.argmax(y_pred, axis=-1)[0])
        predicted_word = index_to_word.get(predicted_index)

        if predicted_word is None:
            # The best class has no word (e.g. padding index 0): fall back to
            # the most probable index that does, instead of raising KeyError.
            ranked = np.argsort(y_pred[0])[::-1]
            predicted_word = next(
                (index_to_word[int(idx)] for idx in ranked if int(idx) in index_to_word),
                None,
            )
            if predicted_word is None:
                break  # nothing can be emitted, so stop

        seed_text += ' ' + predicted_word
        generated.append(predicted_word)
    return ' '.join(generated)

In [None]:
import streamlit as st
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf

# Define the generate_text function here, together with the variables it relies on (word_to_index and sequence_length); model and index_to_word are created below.

# Streamlit interface
st.title('Movie Script Generation App')

# Text input for seed_text
seed_text = st.text_input('Seed Text', 'Enter your seed text here')

# Numeric input for num_words
num_words = st.number_input('Number of Words to Generate', min_value=1, max_value=100, value=25)


@st.cache_resource
def _load_model(model_path):
    """Load the trained Keras model once and reuse it across Streamlit reruns."""
    return tf.keras.models.load_model(model_path)


# Streamlit re-executes this script on every interaction; caching avoids
# reloading the model from disk each time.
model = _load_model('/content/my_model.keras')

# Button to generate text
if st.button('Generate Text'):
    # Invert the vocabulary so predicted indices map back to words
    index_to_word = {index: word for word, index in word_to_index.items()}
    generated_text = generate_text(seed_text, num_words, model, word_to_index, index_to_word)
    st.text("Generated Text:")
    st.write(generated_text)

ModuleNotFoundError: ignored