In [10]:
import pandas as pd
import numpy as np
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, LeaveOneGroupOut, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saita\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saita\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Load dataset
df = pd.read_csv("data_stories_one_shot.csv")  # Adjust path if needed


In [13]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    tokens = tokenizer.tokenize(text)  # Use Treebank tokenizer instead of word_tokenize
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    
    return ' '.join(tokens)

In [14]:
# Apply preprocessing
df['Processed'] = df['Sentence'].apply(preprocess)

# Vectorize
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Processed'])
y = df['Stage']
groups = df['Plot_Name']


In [15]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(kernel='linear'),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

In [16]:
# Store results
results = {
    'Model': [],
    'Zero-Shot CV Accuracy': [],
    'Zero-Shot Leave-One-Plot-Out Accuracy': []
}

# Stratified Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate each model
for name, model in models.items():
    # CV accuracy
    cv_scores = cross_val_score(model, X, y, cv=cv)
    
    # Leave-One-Plot-Out accuracy
    logo = LeaveOneGroupOut()
    logo_scores = []
    for train_idx, test_idx in logo.split(X, y, groups):
        model.fit(X[train_idx], y[train_idx])
        preds = model.predict(X[test_idx])
        logo_scores.append(accuracy_score(y[test_idx], preds))
    
    # Record results
    results['Model'].append(name)
    results['Zero-Shot CV Accuracy'].append(np.mean(cv_scores))
    results['Zero-Shot Leave-One-Plot-Out Accuracy'].append(np.mean(logo_scores))

# Results table
results_df = pd.DataFrame(results)
print(results_df.round(3))




                 Model  Zero-Shot CV Accuracy  \
0  Logistic Regression                  0.738   
1                  SVM                  0.769   
2          Naive Bayes                  0.762   
3        Random Forest                  0.715   

   Zero-Shot Leave-One-Plot-Out Accuracy  
0                                  0.672  
1                                  0.758  
2                                  0.750  
3                                  0.662  
