In [None]:
import pandas as pd
from collections import defaultdict

# Load data.
# Story i is paired with prompt i (the original pairing loop relied on the same
# assumption), so the two files must be filtered as *pairs*. Dropping blank
# lines from each file independently would shift every later story onto the
# wrong prompt.
with open("train.wp_source", "r", encoding="utf-8") as f:
    prompt_lines = [line.strip() for line in f]

with open("train.wp_target", "r", encoding="utf-8") as f:
    story_lines = [line.strip() for line in f]

if len(prompt_lines) != len(story_lines):
    # zip() below stops at the shorter file; surface the mismatch instead of
    # silently attaching the surplus stories to the last prompt.
    print(
        f"⚠️ Line count mismatch: {len(prompt_lines)} prompts vs "
        f"{len(story_lines)} stories; extra lines are ignored."
    )

# Keep only pairs where both the prompt and the story are non-empty.
aligned_pairs = [(p, s) for p, s in zip(prompt_lines, story_lines) if p and s]

# `prompts` / `full_stories` stay defined for any later cell that reads them.
prompts = [p for p, _ in aligned_pairs]
full_stories = [s for _, s in aligned_pairs]

# Create a mapping for one-to-many relationship (identical prompts share a key)
prompt_to_stories = defaultdict(list)
for prompt, story in aligned_pairs:
    prompt_to_stories[prompt].append(story)

# Create DataFrame: one row per (prompt, story), grouped by prompt in
# first-seen order, with a zero-padded running id.
data = []
for prompt, stories in prompt_to_stories.items():
    for story in stories:
        data.append({
            "story_id": f"{len(data):05d}",
            "prompt": prompt,
            "full_story": story
        })

df = pd.DataFrame(data)

# Save the merged dataset
df.to_csv("merged_writing_prompts.csv", index=False)

print(f"✅ Dataset successfully merged with {len(df)} entries!")


In [None]:
import pandas as pd

# Load the merged Writing Prompts CSV from the working directory.
# The merge cell writes the columns "story_id", "prompt" and "full_story".
df = pd.read_csv("merged_writing_prompts.csv")

# Display the first few rows
print(df.head())


In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read and write
# files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:

import pandas as pd

# Load the merged Writing Prompts CSV from Google Drive.
# NOTE: the Drive folder is named "Project " (with a trailing space).
# Presumably this is a copy of merged_writing_prompts.csv with the columns
# "story_id", "prompt" and "full_story" — verify the upload.
df = pd.read_csv("/content/drive/MyDrive/Project /merged_writing_prompts.csv")

# Display the first few rows
print(df.head())

In [None]:
# Configure git (globally, for this runtime) to use the "store" credential helper.
!git config --global credential.helper store

In [None]:
import getpass
import os

from huggingface_hub import login

# Never paste an access token into the notebook: it ends up in the saved file
# and in version control. Read it from the HF_TOKEN environment variable, or
# ask for it interactively (getpass does not echo the input).
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")

# Login using the token. A second `huggingface-cli login` is not needed once
# this call succeeds.
login(token=hf_token)

In [None]:
pip install --upgrade datasets

In [None]:
# Sanity check: column names and a preview of the frame loaded above.
print(df.columns)
print(df.head())

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import pipeline
import torch

# ✅ Optimized Zero-Shot Classifier Pipeline
# Builds a zero-shot classification pipeline on the facebook/bart-large-mnli
# checkpoint. Weights are requested in float16, device placement is left to
# device_map="auto", and the pipeline batches inputs 256 at a time.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device_map="auto",
    batch_size=256,
    model_kwargs={"torch_dtype": torch.float16}
)

# Genre options: the candidate labels every prompt is scored against.
genres = ["Horror", "Sci-Fi", "Mystery", "Fantasy", "Romance", "Adventure"]

# ✅ Step 1: Classify genres in batches for better speed
def classify_genre_batch(examples):
    """Assign one genre to every prompt in a batch.

    Args:
        examples: batch mapping with a 'prompt' entry holding the prompt texts.

    Returns:
        {"genre": [...]} with, per prompt, the first entry of the classifier's
        'labels' list (the pipeline is expected to rank labels best-first —
        confirm against the transformers docs).
    """
    predictions = classifier(examples['prompt'], candidate_labels=genres, truncation=True)
    top_labels = []
    for prediction in predictions:
        top_labels.append(prediction['labels'][0])
    return {"genre": top_labels}

# ✅ Step 2: Convert to Dataset for batch processing
# `df` is the frame loaded by an earlier cell; it must contain a "prompt" column.
dataset = Dataset.from_pandas(df)

# ✅ Step 3: Classify genres in batch
# Adds a "genre" column; classify_genre_batch receives up to 256 rows per call.
dataset = dataset.map(classify_genre_batch, batched=True, batch_size=256)

# ✅ Step 4: Reduce dataset size with balanced genre distribution
def balance_dataset(dataset, genres, n_samples=10000):
    """Down-sample `dataset` to an (at most) equal number of rows per genre.

    Args:
        dataset: a datasets.Dataset with a 'genre' column.
        genres: non-empty list of genre labels to keep.
        n_samples: total row budget; each genre gets n_samples // len(genres).

    Returns:
        A new Dataset with the selected rows of each genre concatenated in the
        order of `genres`. A genre with fewer rows than its quota contributes
        every row it has instead of raising an index error.
    """
    samples_per_genre = n_samples // len(genres)
    balanced_data = []
    for genre in genres:
        # Fixed seed keeps the selection reproducible between runs.
        genre_data = dataset.filter(lambda example: example['genre'] == genre).shuffle(seed=42)
        # Cap the quota at the rows actually available for this genre.
        take = min(samples_per_genre, len(genre_data))
        balanced_data.append(genre_data.select(range(take)))
    # Concatenate column by column across the per-genre subsets.
    return Dataset.from_dict({key: sum((d[key] for d in balanced_data), []) for key in balanced_data[0].features})

# Down-sample to an equal share of the default 10000-row budget per genre.
dataset = balance_dataset(dataset, genres)

# ✅ Step 5: Batch classify genre scores
def batch_classify(examples):
    """Render the classifier's label/score pairs for each prompt as text.

    Args:
        examples: batch mapping with a 'prompt' entry holding the prompt texts.

    Returns:
        {"genre_mix": [...]} where each entry looks like
        "Label: 12.3%, Other: 4.5%", in the order the classifier returned them.
    """
    predictions = classifier(examples['prompt'], candidate_labels=genres, truncation=True)
    genre_scores = []
    for prediction in predictions:
        parts = []
        for label, score in zip(prediction['labels'], prediction['scores']):
            parts.append(f"{label}: {score*100:.1f}%")
        genre_scores.append(', '.join(parts))
    return {"genre_mix": genre_scores}

# Adds a "genre_mix" column to the balanced dataset.
# NOTE(review): this is a second classifier pass over prompts that Step 3
# already classified.
dataset = dataset.map(batch_classify, batched=True, batch_size=256)

# ✅ Step 6: Convert back to DataFrame and display results
df1 = dataset.to_pandas()
print(df1[["prompt", "genre_mix"]].head())


In [None]:
# Persist the genre-annotated frame to Drive (folder name "Project " ends with a space).
df1.to_csv("/content/drive/MyDrive/Project /gen_dataset.csv",index=False)

In [None]:
import pandas as pd
import time
from transformers import pipeline

# Load Emotion Classifier with Longformer (handles longer sequences)
# NOTE(review): allenai/longformer-base-4096 looks like a base checkpoint, not
# one fine-tuned for emotion, so the predicted labels may be generic
# placeholders — confirm. A later cell switches to
# bhadresh-savani/bert-base-uncased-emotion.
emotion_classifier = pipeline("text-classification", model="allenai/longformer-base-4096")

# Classify a single (possibly long) story
def get_emotion(story):
    """Return the label of the first prediction for one story.

    The text is truncated to at most 4096 tokens before classification.
    """
    top_prediction = emotion_classifier(story, truncation=True, max_length=4096)[0]
    return top_prediction["label"]

# Measure how long the full pass takes (one classifier call per story).
start_time = time.time()

# Adds an "emotion" column holding get_emotion's label for each story.
df1["emotion"] = df1["full_story"].apply(get_emotion)

end_time = time.time()
print(f"Time taken to complete: {end_time - start_time:.2f} seconds")

# Display results
print(df1[["full_story", "emotion"]].head())

In [None]:
# A cell only renders its *last* expression, so the preview needs an explicit
# display() call; otherwise its output is silently discarded.
display(df1.head())
# Non-null count per column.
df1.count()

In [None]:
import pandas as pd1

# Reload the genre-annotated dataset saved to Drive by an earlier cell.
# NOTE: pandas is imported under the alias `pd1` in this cell, not the usual `pd`.
df2 = pd1.read_csv("/content/drive/MyDrive/Project /gen_dataset.csv")

# Display the first few rows
print(df2.head())

In [None]:
import pandas as pd1

# Reload the genre-annotated dataset from Drive into df2.
# NOTE(review): this cell repeats the previous cell verbatim.
df2 = pd1.read_csv("/content/drive/MyDrive/Project /gen_dataset.csv")

# Display the first few rows
print(df2.head())

In [None]:
import pandas as pd
import torch.multiprocessing as mp
from datasets import Dataset
from transformers import pipeline
import torch

# ✅ Set multiprocessing start method for CUDA compatibility
# force=True overrides a start method that was already set in this kernel.
mp.set_start_method('spawn', force=True)

# ✅ Optimized Zero-Shot Classifier Pipeline with Accelerate
# Same configuration as the earlier genre cell: bart-large-mnli, float16
# weights, automatic device placement, pipeline batch size 256.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device_map="auto",
    batch_size=256,
    model_kwargs={"torch_dtype": torch.float16}
)

# Genre options: the candidate labels every prompt is scored against.
genres = ["Horror", "Sci-Fi", "Mystery", "Fantasy", "Romance", "Adventure"]

# ✅ Step 1: Classify genres in batches for better speed
def classify_genre_batch(examples):
    """Pick the first-listed genre for each prompt in the batch.

    The 'prompt' column is copied into a plain list before it is handed to the
    classifier. Returns {"genre": [...]} with one label per prompt.
    """
    prompts_batch = list(examples['prompt'])
    ranked = classifier(prompts_batch, candidate_labels=genres, truncation=True)
    return {"genre": list(map(lambda item: item['labels'][0], ranked))}

# ✅ Step 2: Convert to Dataset for batch processing
# df2 is the frame reloaded from gen_dataset.csv.
dataset = Dataset.from_pandas(df2)

# ✅ Step 3: Classify genres in batch
# Writes the "genre" column; num_proc=1 keeps the mapping in a single process.
dataset = dataset.map(classify_genre_batch, batched=True, batch_size=256, num_proc=1)

# ✅ Step 4: Reduce dataset size with balanced genre distribution
def balance_dataset(dataset, genres, n_samples=10000):
    """Build a genre-balanced subset of `dataset`.

    Each genre is allotted n_samples // len(genres) rows. The rows of a genre
    are shuffled with a fixed seed and the first `quota` are kept; a genre with
    fewer rows than the quota keeps all of them. The per-genre subsets are
    concatenated in the order of `genres` into a new Dataset.
    """
    quota = n_samples // len(genres)

    def _take_for(genre):
        pool = dataset.filter(lambda example: example['genre'] == genre).shuffle(seed=42)
        return pool.select(range(min(quota, len(pool))))

    per_genre = [_take_for(g) for g in genres]

    merged = {}
    for column in per_genre[0].features:
        merged[column] = sum((part[column] for part in per_genre), [])
    return Dataset.from_dict(merged)

# Keep at most an equal share of the default 10000-row budget per genre.
dataset = balance_dataset(dataset, genres)

# ✅ Step 5: Batch classify genre scores
def batch_classify(examples):
    """Describe every prompt's genre scores as a comma-separated string.

    Returns {"genre_mix": [...]} with one "Label: NN.N%" list per prompt, in
    the order the classifier reports the labels.
    """
    def _format(result):
        pairs = zip(result['labels'], result['scores'])
        return ', '.join(f"{label}: {score*100:.1f}%" for label, score in pairs)

    outputs = classifier(list(examples['prompt']), candidate_labels=genres, truncation=True)
    return {"genre_mix": [_format(result) for result in outputs]}

# ✅ Step 6: Final batch classification with optimized batch size
dataset = dataset.map(batch_classify, batched=True, batch_size=256, num_proc=1)

# Write the balanced, re-scored rows back to df2. The following cells inspect
# and save df2; without this assignment they would only see the frame as it
# was loaded from the CSV and the classification above would be discarded.
df2 = dataset.to_pandas()


In [None]:
# Number of rows per genre label in df2.
label_counts = df2['genre'].value_counts()
print(label_counts)

In [None]:
# Save df2 to Drive as gen_dataset_2.csv (row index not written).
df2.to_csv("/content/drive/MyDrive/Project /gen_dataset_2.csv",index=False)

In [None]:
import pandas as pd1

# Reload gen_dataset_2.csv from Drive into df3 (pandas alias here is `pd1`).
df3 = pd1.read_csv("/content/drive/MyDrive/Project /gen_dataset_2.csv")

# Display the first few rows
print(df3.head())

In [None]:
!pip install torch-xla -f https://storage.googleapis.com/libtpu-releases/index.html

In [None]:
import pandas as pd
import time
from transformers import pipeline

# Load Emotion Classifier with Longformer (handles longer sequences)
# NOTE(review): allenai/longformer-base-4096 looks like a base checkpoint, not
# one fine-tuned for emotion, so the predicted labels may be generic
# placeholders — confirm. The next cell replaces this classifier.
emotion_classifier = pipeline("text-classification", model="allenai/longformer-base-4096")

# Classify a single (possibly long) story
def get_emotion(story):
    """Classify one story and return the label of the first prediction.

    Input longer than 4096 tokens is truncated by the pipeline call.
    """
    predictions = emotion_classifier(story, truncation=True, max_length=4096)
    first = predictions[0]
    return first["label"]

# Measure how long the full pass takes (one classifier call per story).
start_time = time.time()

# Adds an "emotion" column holding get_emotion's label for each story.
df3["emotion"] = df3["full_story"].apply(get_emotion)

end_time = time.time()
print(f"Time taken to complete: {end_time - start_time:.2f} seconds")

# Display results
print(df3[["full_story", "emotion"]].head())

In [None]:
from transformers import pipeline

# Load the emotion classifier for batched GPU inference.
# This rebinds `emotion_classifier`, replacing the Longformer pipeline above.
emotion_classifier = pipeline(
    "text-classification",
    model="bhadresh-savani/bert-base-uncased-emotion",
    device=0,  # Run on the first CUDA device
    batch_size=256,  # Inputs are fed to the model 256 at a time
    torch_dtype="auto"  # NOTE(review): "auto" is expected to follow the checkpoint's own dtype rather than force float16 — confirm
)

# Classify a list of stories in one pipeline call
def get_emotion(stories):
    """Return one emotion label per story, in input order.

    Each story is truncated to at most 512 tokens before classification.
    """
    labels = []
    for prediction in emotion_classifier(stories, truncation=True, max_length=512):
        labels.append(prediction['label'])
    return labels

# Apply to DataFrame: classify every story in a single get_emotion call and
# store the returned labels in the "emotion" column.
df3["emotion"] = get_emotion(df3["full_story"].tolist())

print(df3[["full_story", "emotion"]].head())


In [None]:
# Number of rows per emotion label in df3.
label_counts = df3['emotion'].value_counts()
print(label_counts)

In [None]:
# Save the emotion-annotated frame to Drive (row index not written).
df3.to_csv("/content/drive/MyDrive/Project /emo_dataset1.csv",index=False)

In [None]:
import pandas as pd1

# Reload the emotion-annotated dataset from Drive into df4 (pandas alias here is `pd1`).
df4 = pd1.read_csv("/content/drive/MyDrive/Project /emo_dataset1.csv")

# Display the first few rows
print(df4.head())

In [None]:
# Replace missing stories with empty strings and force the column to str.
# NOTE(review): a later cell reloads the same CSV into `df`, so this cleanup
# on df4 does not carry over — confirm whether it should be repeated there.
df4["full_story"] = df4["full_story"].fillna("").astype(str)

In [None]:
import os
import torch
from transformers import pipeline
import pandas as pd
from tqdm import tqdm
import unicodedata
import time
# Ask CUDA to launch kernels synchronously.
# NOTE(review): presumably a debugging aid; it is expected to slow GPU work — confirm it is still wanted.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# Allow TF32 for CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True
# Release cached GPU memory held by PyTorch, then run Python's garbage collector.
torch.cuda.empty_cache()
import gc
gc.collect()

In [None]:
import pandas as pd

# Reload the emotion-annotated dataset from Drive into `df`; this rebinds the
# name used by the story-processing cells below.
df = pd.read_csv("/content/drive/MyDrive/Project /emo_dataset1.csv")

# Display the first few rows
print(df.head())

In [None]:
# Check for empty or problematic entries
print(df['full_story'].isna().sum())  # Number of missing (NaN) stories
print(df['full_story'].apply(len).describe())  # Character-length distribution; len() raises on NaN, so the count above must be 0

In [None]:
# Summarization pipeline on CUDA device 0 (truncation enabled) and a GPT-2
# text-generation pipeline on CPU; both are read as globals by the
# generate_* helpers below.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=0, truncation=True)
path_generator = pipeline("text-generation", model="gpt2", device=-1)  # Offloaded to CPU

In [None]:
def generate_choices(story):
    """Turn a story summary into two choice strings.

    The story is summarized (100-200 tokens, no sampling); the first 30
    characters of the summary go into the "Investigate" choice and the rest
    into the "Ignore and move on" choice. Any exception is printed and a fixed
    pair of error choices is returned instead.
    """
    fallback = ["Investigate: Error", "Ignore and move on: Error"]
    try:
        text = summarizer(story, max_length=200, min_length=100, do_sample=False)[0]['summary_text']
        head, tail = text[:30], text[30:]
        choices = [
            f"Investigate: {head}...",
            f"Ignore and move on: {tail}..."
        ]
    except Exception as e:
        print(f"Error in generate_choices: {e}")
        choices = fallback
    return choices

In [None]:
def generate_next_story_path(story):
    """Generate two alternative continuations for a story.

    Args:
        story: story text; only its first 500 characters are used in the prompt.

    Returns:
        {"Choice 1": str, "Choice 2": str} with the two generated continuations,
        stripped of surrounding whitespace. If generation fails for any reason
        the error is printed and both values are "Path unavailable".
    """
    try:
        # Cap the story at 500 characters (characters, not tokens) to keep the prompt short.
        prompt = f"Generate two creative paths for this scene based on the story: {story[:500]}"

        # Generate paths with specific configurations
        response = path_generator(
            prompt,
            max_new_tokens=50,  # Limit the generated text length
            num_return_sequences=2,  # Generate two separate outputs
            do_sample=True,  # Distinct return sequences need sampling; make it explicit
            return_full_text=False,  # Return only the continuation, not the echoed prompt
            pad_token_id=50256  # Ensure proper padding for GPT-2
        )

        # Extract and format the generated outputs
        choice_1 = response[0]['generated_text']
        choice_2 = response[1]['generated_text']
        return {"Choice 1": choice_1.strip(), "Choice 2": choice_2.strip()}

    except Exception as e:
        # Best effort: report the failure and return placeholders so a batch run continues.
        print(f"Error in generate_next_story_path: {e}")
        return {"Choice 1": "Path unavailable", "Choice 2": "Path unavailable"}

In [None]:
def generate_scene_description(story):
    """Return a short (20-60 token) summary of the story.

    On any exception the error is printed and the text
    "Scene description unavailable" is returned instead.
    """
    description = "Scene description unavailable"
    try:
        outputs = summarizer(story, max_length=60, min_length=20, truncation=True, do_sample=False)
        description = outputs[0]['summary_text']
    except Exception as e:
        print(f"Error in generate_scene_description: {e}")
    return description

In [None]:
def clean_text(story, max_token_limit=1024):
    """Normalize a story and cap its length in whitespace-separated words.

    The text is stripped and NFKD-normalized. If it holds more than
    `max_token_limit` words, only the first `max_token_limit` are kept, joined
    by single spaces; otherwise the normalized text is returned as is.
    """
    normalized = unicodedata.normalize("NFKD", story.strip())
    words = normalized.split()
    if len(words) <= max_token_limit:
        return normalized
    return ' '.join(words[:max_token_limit])

In [None]:
# Smoke-test the three generators on one story (the row with index label 4).

# Test generate_choices
story = df.loc[4, "full_story"]
print("Choices:", generate_choices(story))

# Test generate_next_story_path
print("Next Story Path:", generate_next_story_path(story))

# Test generate_scene_description
print("Scene Description:", generate_scene_description(story))

In [None]:
from tqdm import tqdm
import torch

# Annotate every story with choices, next-story paths and a scene description.
# Rows are visited in small batches only so the GPU cache can be cleared
# between batches; each story is still processed on its own.
with tqdm(total=len(df), desc="Processing Stories", unit="story", dynamic_ncols=True) as pbar:
    batch_size = 4  # Adjust batch size based on hardware
    for i in range(0, len(df), batch_size):
        batch_df = df.iloc[i:i + batch_size].copy()  # Avoid SettingWithCopyWarning

        for index, row in batch_df.iterrows():
            story = row["full_story"]

            # Generate outputs and store them as strings
            df.loc[index, "choices"] = str(generate_choices(story))
            df.loc[index, "next_story_path"] = str(generate_next_story_path(story))
            df.loc[index, "scene_description"] = str(generate_scene_description(story))

        # Clear GPU cache after processing each batch
        torch.cuda.empty_cache()
        # Advance by the rows actually processed: the final batch may be
        # shorter than batch_size, and a fixed step would overshoot the total.
        pbar.update(len(batch_df))

In [None]:
# Save the fully annotated frame to Drive as final.csv (row index not written).
df.to_csv("/content/drive/MyDrive/Project /final.csv",index=False)

In [None]:
import pandas as pd

# Reload the fully annotated dataset (final.csv) from Drive into `df`.
df = pd.read_csv("/content/drive/MyDrive/Project /final.csv")


In [None]:
import os
# Sets WANDB_DISABLED; presumably this stops the Trainer from reporting to Weights & Biases.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# NOTE(review): no cell above this one in the visible notebook defines
# `trainer`; it is created in a later cell, so this fails on a fresh
# Restart & Run All.
trainer.train()

In [None]:
# Archive ./storytelling_model into storytelling_model.zip.
# NOTE(review): no visible cell writes ./storytelling_model — confirm the directory name.
!zip -r storytelling_model.zip ./storytelling_model


In [None]:
# Evaluate with the Trainer and print the returned metrics.
# NOTE(review): `trainer` is only created in a later cell of the visible notebook.
results = trainer.evaluate()
print("\nEvaluation Results:")
print(results)

In [None]:
import torch
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

import os
# Sets WANDB_DISABLED; presumably this stops the Trainer from reporting to Weights & Biases.
os.environ["WANDB_DISABLED"] = "true"


# ✅ Load the Mistral 7B tokenizer
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ Assign a padding token if missing (reuse the end-of-sequence token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ✅ Load the model in float16 with automatic device placement
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

# ✅ Load the annotated dataset from Drive
df = pd.read_csv("/content/drive/MyDrive/Project /final.csv")

# Drop the column that the training text below does not use
df = df.drop(columns=["scene_description"])

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# ✅ Split dataset into train & test (90/10, shuffled with a fixed seed)
split_dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

# ✅ Preprocessing function (builds fixed-length causal-LM training examples)
def preprocess_function(examples, max_length=1024):
    """Turn one dataset row into a fixed-length causal-LM example.

    A causal LM is trained on a *single* sequence whose labels are the same
    tokens as its inputs (the model shifts them internally). The conditioning
    text and the story are therefore concatenated into one sequence; tokenizing
    them separately into `input_ids` and `labels` pairs up unrelated tokens.

    Args:
        examples: one row with the keys 'prompt', 'genre_mix', 'emotion',
            'choices', 'full_story' and 'next_story_path'.
        max_length: fixed length of the returned sequences (default 1024).

    Returns:
        {"input_ids": [...], "labels": [...]}, both exactly `max_length` long.
        Labels are -100 (ignored by the loss) on the conditioning text and on
        padding, and equal to `input_ids` on the story tokens.
    """
    prompt_text = (
        f"Prompt: {examples['prompt']}\n"
        f"Genre Mix: {examples['genre_mix']}\n"
        f"Emotion: {examples['emotion']}\n"
        f"Choices: {examples['choices']}\n"
        f"Story:"
    )
    target_text = f"{examples['full_story']}\nNext Story Path: {examples['next_story_path']}"

    # Tokenize without padding; padding is added by hand below so it always
    # sits on the right, independent of the tokenizer's padding_side.
    prompt_ids = list(tokenizer(prompt_text, add_special_tokens=True)["input_ids"])
    target_ids = list(tokenizer(target_text, add_special_tokens=False)["input_ids"])
    if tokenizer.eos_token_id is not None:
        # Teach the model where a story ends.
        target_ids.append(tokenizer.eos_token_id)

    # Reserve at least half of the window for the story, so a long
    # conditioning text cannot leave an example with no supervised tokens.
    prompt_ids = prompt_ids[: max_length // 2]
    target_ids = target_ids[: max_length - len(prompt_ids)]

    input_ids = prompt_ids + target_ids
    labels = [-100] * len(prompt_ids) + target_ids

    # Right-pad to the fixed length (keeps every row the same size for Arrow).
    # Padding positions are excluded from the loss, and with causal attention
    # the real tokens never attend to tokens on their right.
    pad_count = max_length - len(input_ids)
    input_ids = input_ids + [tokenizer.pad_token_id] * pad_count
    labels = labels + [-100] * pad_count

    return {
        "input_ids": input_ids,
        "labels": labels
    }

# ✅ Apply preprocessing eagerly with `map()`, one row at a time (batched=False)
train_dataset = train_dataset.map(preprocess_function, batched=False)
test_dataset = test_dataset.map(preprocess_function, batched=False)

# ✅ Convert dataset format to tensors for PyTorch compatibility
# Only "input_ids" and "labels" are exposed to the Trainer.
train_dataset.set_format(type="torch", columns=["input_ids", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "labels"])

# ✅ LoRA Configuration (Efficient fine-tuning)
# NOTE(review): unlike the Gemma cell further down, no task_type is passed
# here — confirm whether it should be TaskType.CAUSAL_LM.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention layers
    lora_dropout=0.05
)

# ✅ Apply LoRA to model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# ✅ Training arguments
# Effective train batch per device = 2 x 4 accumulation steps.
# NOTE(review): fp16=True is combined with a model loaded in float16; some
# library versions reject mixed-precision training on fp16 weights — confirm.
training_args = TrainingArguments(
    output_dir="./mistral_story_gen",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=2,  # Adjust based on GPU memory
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="adamw_torch",
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    save_total_limit=2,
    fp16=True,  # Mixed precision for A100
    push_to_hub=False
)

# ✅ Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# ✅ Train the model
trainer.train()

# ✅ Save fine-tuned model
# NOTE(review): this writes to ./mistral_finetuned in the working directory,
# while the inference cells load from a Drive folder — confirm the copy step.
model.save_pretrained("./mistral_finetuned")
tokenizer.save_pretrained("./mistral_finetuned")


In [None]:
# Download the zipped fine-tuned model from Drive to the local machine (Colab only).
# NOTE(review): no visible cell creates mistral_finetuned.zip — confirm it exists.
from google.colab import files
files.download("/content/drive/MyDrive/mistral_finetuned.zip")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType

# NOTE: this cell rebinds the globals `model_name`, `tokenizer`, `model` and
# `lora_config`, replacing the Mistral objects created by the training cell.

# ✅ Step 1: Load Tokenizer
model_name = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ Step 2: Load Model with Auto Device Mapping (Fixes Meta Device Issue)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use FP16 for better efficiency
    device_map="auto"  # Automatically distributes model across GPU/CPU
)

# ✅ Step 3: Apply LoRA Configuration
# No target_modules are given; presumably PEFT falls back to its defaults for
# this architecture — verify.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # LoRA for causal language modeling
    r=16,  # Rank
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate
)

# Apply LoRA
model = get_peft_model(model, lora_config)

# ✅ Step 4: Ensure Model is in Training Mode
model.train()

# ✅ Step 5: Verify Device Allocation
print("Model device allocation:")
print(model.hf_device_map)

print("✅ Model is successfully loaded and ready for fine-tuning!")


In [None]:
import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

# ✅ Ensure correct model path
# The Drive folder used by every other path in this notebook is named
# "Project " (with a trailing space); without the space this points at a
# different directory.
model_path = os.path.join("/content/drive/MyDrive/Project ", "mistral_finetuned")

# ✅ Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# ✅ Load model efficiently
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_8bit=True  # Reduces VRAM usage
)

# ✅ Ensure padding token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ✅ Define test input
test_input = "A mysterious figure enters the ancient ruins, carrying an old map. What happens next?"

# ✅ Tokenize input and move the tensors to the model's device
inputs = tokenizer(test_input, return_tensors="pt", padding=True, truncation=True)
input_ids = inputs.input_ids.to(model.device)
attention_mask = inputs.attention_mask.to(model.device)

# ✅ Generate output with safe settings (sampling, up to 200 new tokens)
with torch.no_grad():
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=200,
        temperature=0.8,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id  # Avoids potential warnings
    )

# ✅ Decode and print result
generated_story = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("\nGenerated Story:\n", generated_story)


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# ✅ Define the correct local path for the fine-tuned model
model_path = "/content/drive/MyDrive/Project /mistral_finetuned"  # Ensure this is the correct path

# ✅ Load the fine-tuned model and tokenizer (float16, automatic device placement)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")

# ✅ Ensure the pad token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ✅ Define test input
test_input = "A mysterious figure enters the ancient ruins, carrying an old map. What happens next?"

# ✅ Tokenize the input
# Only input_ids are kept; no attention mask or pad_token_id is passed to generate() below.
input_ids = tokenizer(test_input, return_tensors="pt").input_ids.to(model.device)

# ✅ Generate output with max_new_tokens (instead of max_length)
with torch.no_grad():
    output_ids = model.generate(input_ids, max_new_tokens=200, temperature=0.8, top_p=0.9, do_sample=True)

# ✅ Decode and print generated story
generated_story = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("\nGenerated Story:\n", generated_story)


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# ✅ Base checkpoint and the directory holding the fine-tuned LoRA adapter
base_model_path = "mistralai/Mistral-7B-v0.1"  # Base model
lora_model_path = "/content/drive/MyDrive/Project /mistral_finetuned"  # Fine-tuned LoRA model

# ✅ Load the base model
model = AutoModelForCausalLM.from_pretrained(
    base_model_path, torch_dtype=torch.float16, device_map="auto"
)

# ✅ Load the fine-tuned LoRA model and merge
model = PeftModel.from_pretrained(model, lora_model_path)
model = model.merge_and_unload()  # Merges LoRA weights for inference

# ✅ Load tokenizer (from the base checkpoint, not from the adapter directory)
tokenizer = AutoTokenizer.from_pretrained(base_model_path)

# ✅ Ensure the pad token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ✅ Define test input
test_input = "A mysterious figure enters the ancient ruins, carrying an old map. What happens next?"

# ✅ Tokenize the input
# Only input_ids are kept; no attention mask or pad_token_id is passed to generate() below.
input_ids = tokenizer(test_input, return_tensors="pt").input_ids.to(model.device)

# ✅ Generate output with max_new_tokens (instead of max_length)
with torch.no_grad():
    output_ids = model.generate(input_ids, max_new_tokens=200, temperature=0.8, top_p=0.9, do_sample=True)

# ✅ Decode and print generated story
generated_story = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("\nGenerated Story:\n", generated_story)
