<a href="https://colab.research.google.com/github/sanjanabayya30/Generative_AI_2025/blob/main/GENAI_ProjectCode_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install language_tool_python

Collecting language_tool_python
  Downloading language_tool_python-2.9.0-py3-none-any.whl.metadata (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading language_tool_python-2.9.0-py3-none-any.whl (49 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: language_tool_python
Successfully installed language_tool_python-2.9.0


In [2]:
import re
import numpy as np
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, roc_curve, auc, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.translate.bleu_score import sentence_bleu
import language_tool_python

# Download NLTK data for BLEU score
nltk.download('punkt')

# --- Dataset Cleaning ---
def clean_word_frequency_data(file_path):
    """Cleans the English word frequency dataset."""
    df = pd.read_csv('/content/drive/MyDrive/archive.zip')
    df.columns = df.columns.str.strip().str.lower()
    df['word'] = df['word'].str.lower().str.replace(r'[^a-z]', '', regex=True)
    df = df[df['word'] != '']
    df = df.dropna()
    df = df.groupby('word', as_index=False)['count'].sum()
    df = df.sort_values('count', ascending=False)
    return df.set_index('word')['count'].to_dict()

# Update with your dataset path
word_freq_data = clean_word_frequency_data("english_word_frequency.csv")

# --- Model Setup ---
model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
grammar_tool = language_tool_python.LanguageTool('en-US')
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

# --- Generation Functions ---
def generate_story(prompt, max_new_tokens=500, temperature=0.7, top_k=50):
    """Generates a complete story using GPT-2 with controlled length."""
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2  # Reduce repetition
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def filter_common_words(text, word_freq_data, threshold=1000):
    """Filters text while preserving sentence structure."""
    sentences = re.split(r'(?<=[.!?]) +', text)
    filtered = []
    for sent in sentences:
        words = re.findall(r'\b\w+\b', sent.lower())
        filtered_words = [word for word in words if word_freq_data.get(word, 0) > threshold]
        if filtered_words:
            filtered.append(' '.join(filtered_words).capitalize())
    return '. '.join(filtered) + ('' if text.endswith(('.','!','?')) else '.')

def correct_grammar(text):
    """Corrects grammar while preserving original structure."""
    matches = grammar_tool.check(text)
    return language_tool_python.utils.correct(text, matches)

def count_words(text):
    return len(re.findall(r'\b\w+\b', text))

# --- User Interaction ---
user_prompt = input("Enter your story prompt: ").strip()[:100]
prompt_word_count = count_words(user_prompt)
print(f"\nYour Prompt ({prompt_word_count} words): {user_prompt}")

# --- Story Processing Pipeline ---
generated_story = generate_story(user_prompt)
raw_word_count = count_words(generated_story)

# Ensure story ends with proper punctuation
if not re.search(r'[.!?]$', generated_story):
    generated_story = generated_story.rsplit('.', 1)[0] + '.' if '.' in generated_story else generated_story + '.'

filtered_story = filter_common_words(generated_story, word_freq_data)
filtered_word_count = count_words(filtered_story)

corrected_story = correct_grammar(filtered_story)
corrected_word_count = count_words(corrected_story)

# --- Display Results ---
print(f"\nGenerated Story (Raw - {raw_word_count} words):\n{generated_story}")
print(f"\nFiltered Story ({filtered_word_count} words):\n{filtered_story}")
print(f"\nCorrected Story ({corrected_word_count} words):\n{corrected_story}")

# --- Save Results ---
with open("generated_stories.txt", "a") as f:
    f.write(f"\n\n{'='*50}")
    f.write(f"\nPrompt ({prompt_word_count} words): {user_prompt}")
    f.write(f"\n\nRaw Story ({raw_word_count} words):\n{generated_story}")
    f.write(f"\n\nFiltered Story ({filtered_word_count} words):\n{filtered_story}")
    f.write(f"\n\nCorrected Story ({corrected_word_count} words):\n{corrected_story}")
    f.write(f"\n{'='*50}")

# --- Evaluation Metrics ---
word_counts = {
    'prompt': prompt_word_count,
    'raw': raw_word_count,
    'filtered': filtered_word_count,
    'corrected': corrected_word_count
}

print("\nWord Count Metrics:")
for stage, count in word_counts.items():
    print(f"{stage.capitalize():<10}: {count} words")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading LanguageTool 6.5: 100%|██████████| 248M/248M [00:04<00:00, 55.0MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp11r4je5g.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-6.5.zip to /root/.cache/language_tool_python.


Enter your story prompt: As the clock struck midnight, a lone violinist's melody echoed through the deserted streets, weaving a spell of forgotten memories and hidden dreams.


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Your Prompt (16 words): As the clock struck midnight, a lone violinist's melody echoed through the deserted streets, weaving

Generated Story (Raw - 411 words):
As the clock struck midnight, a lone violinist's melody echoed through the deserted streets, weaving through a sea of shadows and dank darkness.

"Greetings, dear child," was her reply. "My name is the young girl who plays the violin." She was a child of seventeen, and her voice was soft and delicate. She smiled at the boy over her shoulder, the way she smiled when she spoke of her beloved, her mother. He smiled back, his dark eyes distant. The young violin played the melody, as always. But at midnight this time it was completely different. As the two fell into silence, she whispered, "Gee, I wonder if this is…what I think it is?"
 and the man smiled, smiling back. Her voice echoed, so softly, through this darkened alley. It was the sound of the piano, played by a woman who was thirty years older than me. I was fifteen years o