In [56]:
import pandas as pd

In [57]:
# Column names for each TSV file. The files are read with header=None, so the
# names have to be supplied explicitly.
movie_titles_columns = [
    'movieID', 'movie_title', 'movie_year',
    'IMDB_rating', 'IMDB_votes', 'genres',
]
movie_characters_columns = [
    'characterID', 'character_name', 'movieID',
    'movie_title', 'gender', 'position',
]
movie_lines_columns = [
    'lineID', 'characterID', 'movieID',
    'character_name', 'utterance',
]
movie_conversations_columns = [
    'characterID_1', 'characterID_2', 'movieID', 'utterance_list',
]

# Read the title and conversation tables (tab-separated, ISO-8859-2 encoded).
movie_titles_df = pd.read_csv(
    '../data/movie_titles_metadata.tsv',
    sep='\t',
    header=None,
    names=movie_titles_columns,
    encoding='ISO-8859-2',
)
movie_conversations_df = pd.read_csv(
    '../data/movie_conversations.tsv',
    sep='\t',
    header=None,
    names=movie_conversations_columns,
    encoding='ISO-8859-2',
)

In [58]:
# Discard titles that have no genres value; genres is used as a grouping key
# when the per-character table is built.
movie_titles_df = movie_titles_df.dropna(subset=['genres'])

In [59]:
movie_titles_df.iloc[0].genres

"['comedy' 'romance']"

In [60]:
# Character and line tables. on_bad_lines='skip' makes pandas drop rows it
# cannot parse without raising, so the frames may hold fewer rows than the
# raw files.
movie_characters_df = pd.read_table(
    '../data/movie_characters_metadata.tsv',
    header=None,
    names=movie_characters_columns,
    encoding='ISO-8859-2',
    on_bad_lines='skip',
)
movie_lines_df = pd.read_csv(
    '../data/movie_lines.tsv',
    sep='\t',
    header=None,
    names=movie_lines_columns,
    encoding='ISO-8859-2',
    on_bad_lines='skip',
)

In [61]:
# Attach character metadata to every line, then the title and genres of the
# line's movie. Column names present on both sides of a merge receive the
# default _x (left) / _y (right) suffixes.
merged_df = (
    movie_lines_df
    .merge(movie_characters_df, on=['characterID', 'movieID'], how='left')
    .merge(movie_titles_df[['movieID', 'movie_title', 'genres']], on='movieID', how='left')
)

# Collect, for each character of each movie, the list of utterances and the
# matching list of speaker names, then flatten the group keys back into
# ordinary columns with friendlier names. groupby leaves out rows whose key
# is missing by default, so lines without a genres match do not appear.
grouped_df = (
    merged_df
    .groupby(['characterID', 'movieID', 'movie_title_y', 'genres'])[['utterance', 'character_name_x']]
    .agg(list)
    .reset_index()
    .rename(columns={'utterance': 'dialogue', 'character_name_x': 'speakers', 'movie_title_y': 'movie_title'})
)

In [62]:
grouped_df

Unnamed: 0,characterID,movieID,movie_title,genres,dialogue,speakers
0,u0,m0,10 things i hate about you,['comedy' 'romance'],"[They do not!, I hope so., Let's go., Okay -- ...","[BIANCA, BIANCA, BIANCA, BIANCA, BIANCA, BIANC..."
1,u1,m0,10 things i hate about you,['comedy' 'romance'],"[Just sent 'em through., Never, Didn't have yo...","[BRUCE, BRUCE, BRUCE]"
2,u10,m0,10 things i hate about you,['comedy' 'romance'],"[Absolutely not., Your daughters went to the p...","[SHARON, SHARON, SHARON, SHARON, SHARON, SHARO..."
3,u100,m6,8mm,['crime' 'mystery' 'thriller'],[She died in her sleep three days ago. It was...,"[AMY, AMY, AMY, AMY, AMY, AMY, AMY, AMY, AMY, ..."
4,u1000,m65,from dusk till dawn,['action' 'crime' 'horror' 'thriller'],[Yeah and I'm gonna be right back at it tomorr...,"[MCGRAW, MCGRAW, MCGRAW, MCGRAW, MCGRAW, MCGRA..."
...,...,...,...,...,...,...
8744,u995,m65,from dusk till dawn,['action' 'crime' 'horror' 'thriller'],"[Open the door. I'm coming aboard., You said i...","[BORDER GUARD, BORDER GUARD, BORDER GUARD, BOR..."
8745,u996,m65,from dusk till dawn,['action' 'crime' 'horror' 'thriller'],"[Vamanos!, Yeah follow us., It's hardly been u...","[CARLOS, CARLOS, CARLOS, CARLOS, CARLOS, CARLO..."
8746,u997,m65,from dusk till dawn,['action' 'crime' 'horror' 'thriller'],"[I meant me my son and my daughter., Oh that's...","[JACOB, JACOB, JACOB, JACOB, JACOB, JACOB, JAC..."
8747,u998,m65,from dusk till dawn,['action' 'crime' 'horror' 'thriller'],"[Everybody goes home!, I'm going for 'em!, I s...","[KATE, KATE, KATE, KATE, KATE, KATE, KATE, KAT..."


In [63]:
def _parse_genres(raw):
    """Turn a genres string such as "['comedy' 'romance']" into a list of names.

    The first and last characters (the brackets) are removed, every single
    and double quote is stripped, and the remainder is split on whitespace.
    Values that are not strings are returned unchanged, so running this cell
    a second time (when the column already holds lists) is a no-op instead of
    an AttributeError.
    """
    if not isinstance(raw, str):
        return raw
    return raw[1:-1].replace('\'', '').replace('"', '').split()


grouped_df['genres'] = grouped_df['genres'].apply(_parse_genres)

In [64]:
# Persist the per-character dialogue table; later cells reload it from this path.
grouped_df.to_json('../data/dialogs.json')

In [65]:
# Reload the table that was just written to disk.
grouped_df = pd.read_json('../data/dialogs.json')

In [4]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Tokenizer and classification model are loaded from the same pretrained checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint)

In [6]:
texts = [
    "I like you. I love you",
    "Fuck you shit! Mother fuck",
    "I fuck your ass",
]

# Tokenize the whole batch into PyTorch tensors (padding and truncation
# enabled), then run a forward pass with gradient tracking switched off.
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

print(logits)

tensor([[-4.3032,  4.6750],
        [ 2.8683, -2.3304],
        [ 2.1973, -1.8939]])


In [7]:
# Convert the logits tensor to a NumPy array so it can be passed to the
# SciPy/NumPy functions used in the following cells.
scores = logits.detach().numpy()

In [8]:
import pandas as pd
import scipy.stats

def ent(data, base=None):
    """Calculates entropy of the passed `pd.Series`

    The entropy is computed from the frequency of each distinct value.

    Parameters
    ----------
    data : pd.Series
        Values whose distribution is measured. Missing values are left out
        of the counts (the `value_counts` default).
    base : float, optional
        Logarithm base. ``None`` (the default) uses the natural logarithm,
        which matches the original behaviour; pass ``2`` for bits.

    Returns
    -------
    float
        Entropy of the value distribution.
    """
    counts = data.value_counts()  # occurrences of each distinct value
    # scipy normalises the counts to probabilities before computing entropy.
    return scipy.stats.entropy(counts, base=base)

In [9]:
from scipy.special import softmax
from scipy.stats import entropy
import numpy as np

In [10]:
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

In [11]:
entropy(softmax(scores, axis=1), axis=1)

array([0.0012584 , 0.03406714, 0.08385529], dtype=float32)

In [12]:
# Column index of the largest score in each row of `scores`.
predictions = scores.argmax(axis=1)

In [13]:
predictions

array([1, 0, 0])

In [14]:
[model.config.id2label[idx] for idx in predictions]

['POSITIVE', 'NEGATIVE', 'NEGATIVE']

In [15]:
entropy([0.5,0.5])

0.6931471805599453

In [16]:
np.argmax([0.5, 0.5])

0

In [17]:
t = [[{'label': 'anger', 'score': 0.004419783595949411},
  {'label': 'disgust', 'score': 0.0016119900392368436},
  {'label': 'fear', 'score': 0.0004138521908316761},
  {'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'neutral', 'score': 0.005764586851000786},
  {'label': 'sadness', 'score': 0.002092392183840275},
  {'label': 'surprise', 'score': 0.008528684265911579}]]

In [41]:
# Map every label to a one-element list holding its score, i.e. a
# column-oriented layout.
w = {entry['label']: [entry['score']] for entry in t[0]}

In [42]:
w

{'anger': [0.004419783595949411],
 'disgust': [0.0016119900392368436],
 'fear': [0.0004138521908316761],
 'joy': [0.9771687984466553],
 'neutral': [0.005764586851000786],
 'sadness': [0.002092392183840275],
 'surprise': [0.008528684265911579]}

In [44]:
pd.DataFrame(w)

Unnamed: 0,anger,disgust,fear,joy,neutral,sadness,surprise
0,0.00442,0.001612,0.000414,0.977169,0.005765,0.002092,0.008529


In [18]:
def find_max(scores):
    """Return the label of the highest-scoring entry.

    Parameters
    ----------
    scores : iterable of dict
        Each entry must provide a ``'label'`` and a ``'score'`` key.

    Returns
    -------
    The ``'label'`` of the entry with the greatest ``'score'``. When several
    entries tie, the first one wins. ``None`` is returned for an empty input.

    Unlike a running maximum seeded with 0, this also works when every score
    is zero or negative (for example raw logits).
    """
    best = max(scores, key=lambda row: row['score'], default=None)
    return None if best is None else best['label']

In [53]:
# Load the per-character dialogue table stored at ../data/dialogs.json.
dialogs_df = pd.read_json('../data/dialogs.json')

In [49]:
# Replace each list of speaker names with its first entry. Values that are no
# longer lists are left untouched, so re-running this cell does not shorten
# an already-collapsed name to its first character.
dialogs_df['speakers'] = dialogs_df['speakers'].apply(
    lambda names: names[0] if isinstance(names, list) else names
)

In [54]:
dialogs_df

Unnamed: 0,characterID,movieID,movie_title,genres,dialogue,speakers
0,u0,m0,10 things i hate about you,"[comedy, romance]","[They do not!, I hope so., Let's go., Okay -- ...","[BIANCA, BIANCA, BIANCA, BIANCA, BIANCA, BIANC..."
1,u1,m0,10 things i hate about you,"[comedy, romance]","[Just sent 'em through., Never, Didn't have yo...","[BRUCE, BRUCE, BRUCE]"
2,u10,m0,10 things i hate about you,"[comedy, romance]","[Absolutely not., Your daughters went to the p...","[SHARON, SHARON, SHARON, SHARON, SHARON, SHARO..."
3,u100,m6,8mm,"[crime, mystery, thriller]",[She died in her sleep three days ago. It was...,"[AMY, AMY, AMY, AMY, AMY, AMY, AMY, AMY, AMY, ..."
4,u1000,m65,from dusk till dawn,"[action, crime, horror, thriller]",[Yeah and I'm gonna be right back at it tomorr...,"[MCGRAW, MCGRAW, MCGRAW, MCGRAW, MCGRAW, MCGRA..."
...,...,...,...,...,...,...
8744,u995,m65,from dusk till dawn,"[action, crime, horror, thriller]","[Open the door. I'm coming aboard., You said i...","[BORDER GUARD, BORDER GUARD, BORDER GUARD, BOR..."
8745,u996,m65,from dusk till dawn,"[action, crime, horror, thriller]","[Vamanos!, Yeah follow us., It's hardly been u...","[CARLOS, CARLOS, CARLOS, CARLOS, CARLOS, CARLO..."
8746,u997,m65,from dusk till dawn,"[action, crime, horror, thriller]","[I meant me my son and my daughter., Oh that's...","[JACOB, JACOB, JACOB, JACOB, JACOB, JACOB, JAC..."
8747,u998,m65,from dusk till dawn,"[action, crime, horror, thriller]","[Everybody goes home!, I'm going for 'em!, I s...","[KATE, KATE, KATE, KATE, KATE, KATE, KATE, KAT..."


In [55]:
# Write the first 4000 rows to reduced.json (path is relative to the working directory).
dialogs_df[:4000].to_json('reduced.json')

In [None]:
dialogs_df['speakers']

In [19]:
find_max(t[0])

'joy'

In [20]:
import pandas as pd
from transformers import pipeline

# Read the dialogue table and keep a random subset of 1000 rows; the fixed
# random_state makes the subset the same on every run.
dialogs_df = (
    pd.read_json('../data/dialogs.json')
    .sample(n=1000, random_state=42)
)


def find_max(scores):
    """Return the label of the highest-scoring entry.

    NOTE(review): a function with this name is also defined in an earlier
    cell; whichever definition runs last is the one in effect.

    Parameters
    ----------
    scores : iterable of dict
        Each entry must provide a ``'label'`` and a ``'score'`` key.

    Returns
    -------
    The ``'label'`` of the entry with the greatest ``'score'``. When several
    entries tie, the first one wins. ``None`` is returned for an empty input.

    Unlike a running maximum seeded with 0, this also works when every score
    is zero or negative (for example raw logits).
    """
    best = max(scores, key=lambda row: row['score'], default=None)
    return None if best is None else best['label']


# Build the text-classification pipeline; the model and the tokenizer are
# both taken from the same checkpoint.
emotion_checkpoint = 'j-hartmann/emotion-english-distilroberta-base'
classifier = pipeline(
    'text-classification',
    model=emotion_checkpoint,
    tokenizer=emotion_checkpoint,
)


Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [21]:
# Keep only the entries of each dialogue list that are real strings; anything
# else would break the string join applied to this column next.
dialogs_df['dialogue'] = dialogs_df['dialogue'].map(
    lambda utterances: [item for item in utterances if isinstance(item, str)]
)

In [22]:
# Concatenate each character's utterances into one space-separated string.
dialogs_df['dialogue'] = dialogs_df['dialogue'].map(' '.join)

In [23]:
# Keep at most the first 512 characters of each dialogue string.
# NOTE(review): this limit counts characters, not model tokens.
dialogs_df['dialogue'] = dialogs_df['dialogue'].str.slice(stop=512)

In [24]:
# Run the classifier on every dialogue string and keep the best-scoring label.
results = dialogs_df['dialogue'].map(lambda text: find_max(classifier(text)))

In [27]:
# Store the predicted label for each row in a new 'emotion' column.
dialogs_df['emotion'] = results

In [28]:
dialogs_df

Unnamed: 0,characterID,movieID,movie_title,genres,dialogue,speakers,emotion
3910,u4614,m305,cool hand luke,"[crime, drama]",But I made this arrangement -- I don't think y...,"[ALIBI, ALIBI, ALIBI, ALIBI, ALIBI, ALIBI, ALI...",neutral
2777,u3592,m237,alien vs. predator,"[action, adventure, sci-fi]",Move it baby or they're gonna be chewin' on my...,"[GUTTIEREZ, GUTTIEREZ, GUTTIEREZ, GUTTIEREZ, G...",neutral
3355,u4113,m275,bloodmoon,"[action, thriller]",We're going to leave that up to Detective Will...,"[F.B.I. MAN, F.B.I. MAN, F.B.I. MAN]",neutral
3055,u3844,m254,badlands,"[crime, drama, romance, thriller]",Yes. Fine. Yes. Just thinking. Stay as long as...,"[RICH MAN, RICH MAN, RICH MAN, RICH MAN, RICH ...",neutral
6399,u696,m44,the cider house rules,"[drama, romance]",You mean *swimmin'*. I ain't goin' in that vat...,"[JACK, JACK, JACK, JACK, JACK]",disgust
...,...,...,...,...,...,...,...
3059,u3848,m255,the adventures of buckaroo banzai across the 8...,"[adventure, romance, comedy, sci-fi]",Then that's it! You mean the guy from the old ...,"[BILLY, BILLY, BILLY, BILLY, BILLY]",surprise
4961,u5649,m374,the grapes of wrath,[drama],Well there's plenty of work for you about fort...,"[SPENCER, SPENCER, SPENCER, SPENCER]",neutral
1659,u2586,m168,rebel without a cause,"[drama, romance]",Good luck Buzz. Feel okay? Buzzie--we better g...,"[JUDY, JUDY, JUDY, JUDY, JUDY, JUDY, JUDY, JUD...",joy
177,u1160,m76,gladiator,"[action, adventure, drama]",Is this Rome? Are we just going to be execute...,"[JUBA, JUBA, JUBA, JUBA]",fear


In [46]:
dialogs_df[dialogs_df['movie_title'] == 'bloodmoon']

Unnamed: 0,characterID,movieID,movie_title,genres,dialogue,speakers,emotion
3355,u4113,m275,bloodmoon,"[action, thriller]",We're going to leave that up to Detective Will...,"[F.B.I. MAN, F.B.I. MAN, F.B.I. MAN]",neutral
3352,u4110,m275,bloodmoon,"[action, thriller]",See you Tuesday Frank. Frank I have something ...,"[BEN, BEN, BEN, BEN, BEN, BEN, BEN, BEN, BEN]",neutral
3365,u4122,m275,bloodmoon,"[action, thriller]",Okay I'm goin'. You'll see. I'll get you. He's...,"[YELLOW MAN, YELLOW MAN, YELLOW MAN, YELLOW MA...",disgust


In [29]:
# Distinct genre names found in the sampled dialogs. Sorted so the list (and
# therefore the order of the checkboxes built from it) is the same on every
# run; iterating a bare set of strings gives an order that can change
# between interpreter sessions.
genres = sorted({genre for genre_list in dialogs_df['genres'].tolist() for genre in genre_list})

In [30]:
import gradio as gr

# Function to calculate emotion statistics based on genre
def calculate_emotion_statistics(genres, exact=True):
    """Count predicted emotions for the dialogs that match the selected genres.

    Reads the module-level ``dialogs_df`` (columns ``'genres'`` and
    ``'emotion'``).

    Parameters
    ----------
    genres : iterable of str or None
        Selected genre names. ``None`` is treated as an empty selection.
    exact : bool, default True
        ``True`` keeps rows whose genre set is exactly the selection (the
        original behaviour). ``False`` keeps rows whose genres contain every
        selected genre; with an empty selection that is every row.

    Returns
    -------
    dict
        Emotion label -> number of matching rows. Empty when nothing matches.
    """
    # Build the selection set once instead of once per row.
    selected = set(genres or [])

    if exact:
        mask = dialogs_df['genres'].map(lambda row_genres: set(row_genres) == selected)
    else:
        mask = dialogs_df['genres'].map(lambda row_genres: selected <= set(row_genres))

    return dialogs_df.loc[mask, 'emotion'].value_counts().to_dict()

# Interface using Gradio: a checkbox group of genres in, the emotion counts
# out as text. Components come from the top-level namespace because the
# `gr.inputs` / `gr.outputs` namespaces are deprecated.
iface = gr.Interface(
    fn=calculate_emotion_statistics,
    inputs=gr.CheckboxGroup(choices=genres),
    outputs=gr.Textbox(),
)

# Run the interface
iface.launch()

  super().__init__(


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [32]:
# Save the sampled dialogs, including the 'emotion' column, to labeled-emotions.json
# (path is relative to the working directory).
dialogs_df.to_json('labeled-emotions.json')

3910     True
2777    False
3355    False
3055    False
6399    False
5317    False
7110    False
1385    False
4067    False
6481    False
Name: genres, dtype: bool