# Importing Dataset

In [None]:
import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

In [None]:
from google.colab import drive

# Mount Google Drive so files stored there are reachable from this runtime.
DRIVE_MOUNT_POINT = '/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Load the script table (speaker name + spoken words per row) and preview it.
SIMPSONS_CSV = '/drive/My Drive/CS425_Project/colab/simpsons_dataset.csv'
csv_data = pd.read_csv(SIMPSONS_CSV)
csv_data.head()

Mounted at /drive


Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


# QA Pairs

In [None]:
# Character whose lines become the "response" side of the QA pairs.
persona = 'Homer Simpson'
# Total number of rows in the loaded script table.
length = len(csv_data)
print("length", length)

length 158314


In [None]:
# Everything the persona says: first as a Series (original row labels kept),
# then as a plain Python list.
is_persona_row = csv_data['raw_character_text'] == persona
homer_responses = csv_data.loc[is_persona_row, 'spoken_words']
homer_resp_list = homer_responses.to_numpy().tolist()

In [None]:
# Split the script into scenes: each scene is a list of (speaker, utterance)
# tuples. A row whose `spoken_words` is not a string (presumably NaN for an
# empty cell) ends the current scene and is not stored.
scenes = []
scene_open = False
# Walk both columns in lockstep by position. This avoids one label lookup per
# row and does not rely on the DataFrame having a default 0..n-1 index.
for currentSpeaker, utterance in zip(csv_data['raw_character_text'].to_numpy(),
                                     csv_data['spoken_words'].to_numpy()):
  if not scene_open:
    # First row, or first row after a scene break: start a new scene.
    # (If this row is itself a break, the new scene simply stays empty.)
    scenes.append([])
    scene_open = True

  if isinstance(utterance, str):
    scenes[-1].append((currentSpeaker, utterance))
  else:
    # end of scene
    scene_open = False

In [None]:
# Build (question, response) pairs for the persona: whenever the persona speaks
# right after a different speaker within the same scene, the previous line is
# the question and the persona's line is the response. A question text is only
# used once (its first occurrence wins).
questions = []
responses = []
# Mirrors `questions` so the duplicate check is a hash lookup instead of a
# scan over the whole list on every persona line.
seen_questions = set()
for scene in scenes:
  prev_speaker = None
  prev_utterance = None
  for speaker, utterance in scene:
    if (speaker == persona
        and prev_speaker is not None
        and prev_speaker != persona
        and prev_utterance not in seen_questions):
      questions.append(prev_utterance)
      responses.append(utterance)
      seen_questions.add(prev_utterance)
    prev_speaker = speaker
    prev_utterance = utterance

len(questions),len(responses)

(18005, 18005)

In [None]:
import unicodedata
def unicodeToAscii(s):
    """Return `s` in NFD form with every nonspacing mark (category 'Mn') removed."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # Combining accent split off by the NFD decomposition: drop it.
            continue
        kept.append(ch)
    return ''.join(kept)

def normalizeString(s):
    """Normalise one utterance.

    Lowercases and trims `s`, strips accents via `unicodeToAscii`, puts a
    space in front of each '.', '!' or '?', replaces every run of characters
    other than ASCII letters and those three marks with a single space, and
    finally collapses whitespace and trims again.
    """
    text = unicodeToAscii(s.lower().strip())
    # Applied in order; each step feeds the next.
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Normalised copies of both sides of every pair.
# NOTE(review): `pairs` below is built from the raw `questions`/`responses`,
# not from these cleaned lists -- confirm that is intended.
cleaned_questions = [normalizeString(question) for question in questions]
cleaned_responses = [normalizeString(response) for response in responses]

pairs = [(question, response) for question, response in zip(questions, responses)]
pairs[:5]

[('Mr. Bergstrom left today.', 'Oh.'),
 ("He's gone. Forever.", 'And?'),
 ("I didn't think you'd understand.",
  "Hey, just because I don't care doesn't mean I don't understand."),
 ("I'm glad I'm not crying because I would hate for you to think that what I'm about to say is based on emotion. But you, sir, are a baboon!",
  'Me?'),
 ('Yes, you! Baboon, baboon, baboon, baboon!',
  "I don't think you realize what you're saying.")]

In [None]:
PAIRS_OUTPUT = "qa_pairs.txt"
DELIM = "++++@++++"
# One pair per line: question, delimiter, response.
# The encoding is fixed to UTF-8 so the file's bytes do not depend on the
# platform's default text encoding.
with open(PAIRS_OUTPUT, 'w', encoding='utf-8') as f:
  for question, response in pairs:
    f.write(question + DELIM + response + "\n")

# QA Pairs with speaker IDs

In [None]:
# Preview the raw speaker-name column.
csv_data.loc[:, 'raw_character_text']

0                     Miss Hoover
1                    Lisa Simpson
2                     Miss Hoover
3                    Lisa Simpson
4         Edna Krabappel-Flanders
                   ...           
158309                Miss Hoover
158310                Miss Hoover
158311                Miss Hoover
158312               Ralph Wiggum
158313                      JANEY
Name: raw_character_text, Length: 158314, dtype: object

In [None]:
from sklearn.preprocessing import LabelEncoder

# Copy of the speaker column with missing names replaced by the placeholder
# 'NAN', so every row has a label to encode.
# The fillna() result is assigned back instead of calling it with inplace=True
# on `csv_data[...]`: the chained in-place form operates on a temporary under
# pandas Copy-on-Write and would leave the DataFrame column unfilled.
csv_data['clean_character_text'] = csv_data['raw_character_text'].fillna('NAN')

# Map every distinct speaker label to an integer ID; `le` stays fitted so the
# mapping is available through `le.classes_`.
le = LabelEncoder()
csv_data['speakerID'] = le.fit_transform(csv_data['clean_character_text'])

length = len(csv_data)
print(len(le.classes_))
print(csv_data.head())
print("length", length)

6759
        raw_character_text  ... speakerID
0              Miss Hoover  ...      4034
1             Lisa Simpson  ...      3567
2              Miss Hoover  ...      4034
3             Lisa Simpson  ...      3567
4  Edna Krabappel-Flanders  ...      1884

[5 rows x 4 columns]
length 158314


In [None]:
# Split the script into scenes again, this time keyed by integer speaker ID:
# each scene is a list of (speakerID, utterance) tuples. A row whose
# `spoken_words` is not a string (presumably NaN for an empty cell) ends the
# current scene and is not stored.
scenes = []
scene_open = False
# Walk both columns in lockstep by position. This avoids one label lookup per
# row and does not rely on the DataFrame having a default 0..n-1 index.
for currentSpeaker, utterance in zip(csv_data['speakerID'].to_numpy(),
                                     csv_data['spoken_words'].to_numpy()):
  if not scene_open:
    # First row, or first row after a scene break: start a new scene.
    # (If this row is itself a break, the new scene simply stays empty.)
    scenes.append([])
    scene_open = True

  if isinstance(utterance, str):
    scenes[-1].append((currentSpeaker, utterance))
  else:
    # end of scene
    scene_open = False

scenes[:10]

[[(4034,
   "No, actually, it was a little of both. Sometimes when a disease is in all the magazines and all the news shows, it's only natural that you think you have it."),
  (3567, "Where's Mr. Bergstrom?"),
  (4034,
   "I don't know. Although I'd sure like to talk to him. He didn't touch my lesson plan. What did he teach you?"),
  (3567, 'That life is worth living.'),
  (1884,
   "The polls will be open from now until the end of recess. Now, just in case any of you have decided to put any thought into this, we'll have our final statements. Martin?"),
  (3906, "I don't think there's anything left to say."),
  (1884, 'Bart?'),
  (648, 'Victory party under the slide!')],
 [(3567, 'Mr. Bergstrom! Mr. Bergstrom!'),
  (3467,
   'Hey, hey, he Moved out this morning. He must have a new job -- he took his Copernicus costume.'),
  (3567, 'Do you know where I could find him?'),
  (3467, "I think he's taking the next train to Capital City."),
  (3567, 'The train, how like him... traditional, ye

In [None]:
# Build (responder, question, response) triples for every speaker: whenever
# the speaker changes inside a scene, the previous line is the question, the
# new line is the response, and `speakers` records who gave the response.
# A question text is only used once (its first occurrence wins).
speakers = []
questions = []
responses = []
# Mirrors `questions` so the duplicate check is a hash lookup instead of a
# scan over the whole list on every change of speaker.
seen_questions = set()
for scene in scenes:
  prev_speaker = None
  prev_utterance = None
  for speaker, utterance in scene:
    if (prev_speaker is not None
        and prev_speaker != speaker
        and prev_utterance not in seen_questions):
      speakers.append(speaker)
      questions.append(prev_utterance)
      responses.append(utterance)
      seen_questions.add(prev_utterance)
    prev_speaker = speaker
    prev_utterance = utterance

len(speakers),len(questions),len(responses)

(87043, 87043, 87043)

In [None]:
import unicodedata
def unicodeToAscii(s):
    """Return `s` in NFD form with every nonspacing mark (category 'Mn') removed."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # Combining accent split off by the NFD decomposition: drop it.
            continue
        kept.append(ch)
    return ''.join(kept)

def normalizeString(s):
    """Normalise one utterance.

    Lowercases and trims `s`, strips accents via `unicodeToAscii`, puts a
    space in front of each '.', '!' or '?', replaces every run of characters
    other than ASCII letters and those three marks with a single space, and
    finally collapses whitespace and trims again.
    """
    text = unicodeToAscii(s.lower().strip())
    # Applied in order; each step feeds the next.
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Normalised copies of both sides of every pair.
# NOTE(review): `pairs` below is built from the raw `questions`/`responses`,
# not from these cleaned lists -- confirm that is intended.
cleaned_questions = [normalizeString(question) for question in questions]
cleaned_responses = [normalizeString(response) for response in responses]

pairs = [triple for triple in zip(speakers, questions, responses)]
pairs[:5]

[(3567,
  "No, actually, it was a little of both. Sometimes when a disease is in all the magazines and all the news shows, it's only natural that you think you have it.",
  "Where's Mr. Bergstrom?"),
 (4034,
  "Where's Mr. Bergstrom?",
  "I don't know. Although I'd sure like to talk to him. He didn't touch my lesson plan. What did he teach you?"),
 (3567,
  "I don't know. Although I'd sure like to talk to him. He didn't touch my lesson plan. What did he teach you?",
  'That life is worth living.'),
 (1884,
  'That life is worth living.',
  "The polls will be open from now until the end of recess. Now, just in case any of you have decided to put any thought into this, we'll have our final statements. Martin?"),
 (3906,
  "The polls will be open from now until the end of recess. Now, just in case any of you have decided to put any thought into this, we'll have our final statements. Martin?",
  "I don't think there's anything left to say.")]

In [None]:
PAIRS_OUTPUT = "qa_pairs_with_speaker.txt"
DELIM = "++++@++++"
# One triple per line: responder's speaker ID, question, response, separated
# by the delimiter.
# The encoding is fixed to UTF-8 so the file's bytes do not depend on the
# platform's default text encoding.
with open(PAIRS_OUTPUT, 'w', encoding='utf-8') as f:
  for speaker_id, question, response in pairs:
    f.write(str(speaker_id) + DELIM + question + DELIM + response + "\n")