# Label subtitles data

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from tokenizers import Tokenizer, normalizers
from tokenizers.normalizers import BertNormalizer, Replace, Strip
from transformers import pipeline
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


## Import files

In [2]:
files = Path('./data/csv/').glob('**/*.csv')
dfs = []

for f in files:
  print(f)
  dfs.append(pd.read_csv(f))

data = pd.concat(dfs, ignore_index=True)
data

data/csv/apichatpong weerasethakul/Memoria.csv
data/csv/apichatpong weerasethakul/Uncle Boonmee Who Can Recall His Past Lives.csv
data/csv/kar wai wong/Chungking Express.csv
data/csv/kar wai wong/The Grandmaster.csv
data/csv/kar wai wong/Fallen Angels.csv
data/csv/kar wai wong/Happy Together.csv
data/csv/kar wai wong/Days of Being Wild.csv
data/csv/rajkumar hirani/3 Idiots.csv
data/csv/rajkumar hirani/PK.csv
data/csv/chang dong lee/Burning.csv
data/csv/chang dong lee/Peppermint Candy.csv
data/csv/chang dong lee/Oasis.csv
data/csv/chang dong lee/Secret Sunshine.csv
data/csv/chang dong lee/jeon tae-il.csv
data/csv/christopher nolan/Insomnia.csv
data/csv/christopher nolan/The Dark Knight Rises.csv
data/csv/christopher nolan/The Prestige.csv
data/csv/christopher nolan/Batman Begins.csv
data/csv/christopher nolan/Following.csv
data/csv/christopher nolan/Tenet.csv
data/csv/christopher nolan/Dunkirk.csv
data/csv/christopher nolan/Memento.csv
data/csv/christopher nolan/Inception.csv
data/csv/c

Unnamed: 0,id,startAt,endAt,text
0,8399288,"00:00:06,000","00:00:12,074",Watch Online Movies and Series for FREE\nwww.o...
1,8399288,"00:07:51,875","00:07:53,750","- Hello, there.\n- Hi!"
2,8399288,"00:07:53,792","00:07:54,917",Hm.
3,8399288,"00:07:56,458","00:07:59,125",- It's so nice to see you here.\n- How are you...
4,8399288,"00:07:59,167","00:08:00,292",Hmm...
...,...,...,...,...
41069,2331143,"01:56:21,808","01:56:22,650",Yes.
41070,2331143,"01:56:34,221","01:56:36,531","- Let's go inside.\n- Yes, let's go inside."
41071,2331143,"01:56:40,494","01:56:43,498",Did you know Spiderman\nreally is a spider?
41072,2331143,"01:56:46,199","01:56:47,542",I had no idea.


## Clean non dialog text

In [3]:
list(data[data['text'].str.contains("www.|opensubtitle|subtitles by", na = False, case = False, regex = True)].text)

['Watch Online Movies and Series for FREE\nwww.osdb.link/lm',
 'Watch Online Movies and Series for FREE\nwww.osdb.link/lm',
 'Use the free code JOINNOW at\nâ\x80¨www.playships.eu',
 'Watch Online Movies and Series for FREE\nwww.osdb.link/lm',
 'Advertise your product or brand here\ncontact www.OpenSubtitles.org today',
 'Watch Online Movies and Series for FREE\nwww.osdb.link/lm',
 'Watch Online Movies and Series for FREE\nwww.osdb.link/lm',
 'Use the free code JOINNOW at\nâ\x80¨www.playships.eu',
 'Watch Online Movies and Series for FREE\nwww.osdb.link/lm',
 'Support us and become VIP member\nto remove all ads from www.OpenSubtitles.org',
 'Support us and become VIP member\nto remove all ads from www.OpenSubtitles.org',
 'Advertise your product or brand here\ncontact www.OpenSubtitles.org today',
 'Watch Online Movies and Series for FREE\nwww.osdb.link/lm',
 'Support us and become VIP member\nto remove all ads from www.OpenSubtitles.org',
 'Watch Online Movies and Series for FREE\nwww.

In [4]:
data = data[
    ~data['text'].isna() &
    ~data['text'].str.contains("www.|opensubtitle|subtitles by", na = False, case = False, regex = True)
].reset_index(drop = True)
data

Unnamed: 0,id,startAt,endAt,text
0,8399288,"00:07:51,875","00:07:53,750","- Hello, there.\n- Hi!"
1,8399288,"00:07:53,792","00:07:54,917",Hm.
2,8399288,"00:07:56,458","00:07:59,125",- It's so nice to see you here.\n- How are you...
3,8399288,"00:07:59,167","00:08:00,292",Hmm...
4,8399288,"00:08:01,375","00:08:03,458",- Sleepy.\n- Mm.
...,...,...,...,...
41002,2331143,"01:56:19,539","01:56:20,381",Come on.
41003,2331143,"01:56:21,808","01:56:22,650",Yes.
41004,2331143,"01:56:34,221","01:56:36,531","- Let's go inside.\n- Yes, let's go inside."
41005,2331143,"01:56:40,494","01:56:43,498",Did you know Spiderman\nreally is a spider?


## Normalize text

In [5]:
normalizer = normalizers.Sequence([
  BertNormalizer(),
  Replace('<i>', ''),
  Replace('</i>', ''),
  Replace('-', ''),
  Strip()
])
normalizer.normalize_str(data.loc[0]['text'])

'hello, there.  hi!'

In [6]:
data['normalizedText'] = data['text'].apply(normalizer.normalize_str)
data

Unnamed: 0,id,startAt,endAt,text,normalizedText
0,8399288,"00:07:51,875","00:07:53,750","- Hello, there.\n- Hi!","hello, there. hi!"
1,8399288,"00:07:53,792","00:07:54,917",Hm.,hm.
2,8399288,"00:07:56,458","00:07:59,125",- It's so nice to see you here.\n- How are you...,it's so nice to see you here. how are you doing?
3,8399288,"00:07:59,167","00:08:00,292",Hmm...,hmm...
4,8399288,"00:08:01,375","00:08:03,458",- Sleepy.\n- Mm.,sleepy. mm.
...,...,...,...,...,...
41002,2331143,"01:56:19,539","01:56:20,381",Come on.,come on.
41003,2331143,"01:56:21,808","01:56:22,650",Yes.,yes.
41004,2331143,"01:56:34,221","01:56:36,531","- Let's go inside.\n- Yes, let's go inside.","let's go inside. yes, let's go inside."
41005,2331143,"01:56:40,494","01:56:43,498",Did you know Spiderman\nreally is a spider?,did you know spiderman really is a spider?


In [7]:
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

In [8]:
data.loc[660]['normalizedText']

'no sign of tools, or they have decomposed.'

## Word count

In [9]:
data['wordCount'] = data['normalizedText'].apply(lambda str : len(str.split()))
data

Unnamed: 0,id,startAt,endAt,text,normalizedText,wordCount
0,8399288,"00:07:51,875","00:07:53,750","- Hello, there.\n- Hi!","hello, there. hi!",3
1,8399288,"00:07:53,792","00:07:54,917",Hm.,hm.,1
2,8399288,"00:07:56,458","00:07:59,125",- It's so nice to see you here.\n- How are you...,it's so nice to see you here. how are you doing?,11
3,8399288,"00:07:59,167","00:08:00,292",Hmm...,hmm...,1
4,8399288,"00:08:01,375","00:08:03,458",- Sleepy.\n- Mm.,sleepy. mm.,2
...,...,...,...,...,...,...
41002,2331143,"01:56:19,539","01:56:20,381",Come on.,come on.,2
41003,2331143,"01:56:21,808","01:56:22,650",Yes.,yes.,1
41004,2331143,"01:56:34,221","01:56:36,531","- Let's go inside.\n- Yes, let's go inside.","let's go inside. yes, let's go inside.",7
41005,2331143,"01:56:40,494","01:56:43,498",Did you know Spiderman\nreally is a spider?,did you know spiderman really is a spider?,8


## Sentimental classification

In [10]:
# Huggingface
# classifier = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", tokenizer="bert-base-uncased")

nltk.download(["stopwords", "vader_lexicon"])

analyser = SentimentIntensityAnalyzer()

def classifier(text):
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    
    result = analyser.polarity_scores(' '.join(filtered_tokens))

    if (result['neu'] > result['pos']) & (result['neu'] > result['neg']):
        return { 'label': 'neutral', 'score': result['neu'] }
    elif (result['pos'] > result['neu']) & (result['pos'] > result['neg']):
        return { 'label': 'positive', 'score': result['pos'] }
    else:
        return { 'label': 'negative', 'score': result['neg'] }

[nltk_data] Downloading package stopwords to /home/lkz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/lkz/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [11]:
# data['sentiment'] = data['normalizedText'].apply(lambda str: classifier(str)[0])
data['sentiment'] = data['normalizedText'].apply(classifier)
data

Unnamed: 0,id,startAt,endAt,text,normalizedText,wordCount,sentiment
0,8399288,"00:07:51,875","00:07:53,750","- Hello, there.\n- Hi!","hello, there. hi!",3,"{'label': 'neutral', 'score': 1.0}"
1,8399288,"00:07:53,792","00:07:54,917",Hm.,hm.,1,"{'label': 'neutral', 'score': 1.0}"
2,8399288,"00:07:56,458","00:07:59,125",- It's so nice to see you here.\n- How are you...,it's so nice to see you here. how are you doing?,11,"{'label': 'positive', 'score': 0.583}"
3,8399288,"00:07:59,167","00:08:00,292",Hmm...,hmm...,1,"{'label': 'neutral', 'score': 1.0}"
4,8399288,"00:08:01,375","00:08:03,458",- Sleepy.\n- Mm.,sleepy. mm.,2,"{'label': 'neutral', 'score': 1.0}"
...,...,...,...,...,...,...,...
41002,2331143,"01:56:19,539","01:56:20,381",Come on.,come on.,2,"{'label': 'neutral', 'score': 1.0}"
41003,2331143,"01:56:21,808","01:56:22,650",Yes.,yes.,1,"{'label': 'positive', 'score': 1.0}"
41004,2331143,"01:56:34,221","01:56:36,531","- Let's go inside.\n- Yes, let's go inside.","let's go inside. yes, let's go inside.",7,"{'label': 'neutral', 'score': 0.748}"
41005,2331143,"01:56:40,494","01:56:43,498",Did you know Spiderman\nreally is a spider?,did you know spiderman really is a spider?,8,"{'label': 'neutral', 'score': 1.0}"


In [12]:
data = data.join(pd.json_normalize(data['sentiment'])).drop('sentiment', axis='columns')
data

Unnamed: 0,id,startAt,endAt,text,normalizedText,wordCount,label,score
0,8399288,"00:07:51,875","00:07:53,750","- Hello, there.\n- Hi!","hello, there. hi!",3,neutral,1.000
1,8399288,"00:07:53,792","00:07:54,917",Hm.,hm.,1,neutral,1.000
2,8399288,"00:07:56,458","00:07:59,125",- It's so nice to see you here.\n- How are you...,it's so nice to see you here. how are you doing?,11,positive,0.583
3,8399288,"00:07:59,167","00:08:00,292",Hmm...,hmm...,1,neutral,1.000
4,8399288,"00:08:01,375","00:08:03,458",- Sleepy.\n- Mm.,sleepy. mm.,2,neutral,1.000
...,...,...,...,...,...,...,...,...
41002,2331143,"01:56:19,539","01:56:20,381",Come on.,come on.,2,neutral,1.000
41003,2331143,"01:56:21,808","01:56:22,650",Yes.,yes.,1,positive,1.000
41004,2331143,"01:56:34,221","01:56:36,531","- Let's go inside.\n- Yes, let's go inside.","let's go inside. yes, let's go inside.",7,neutral,0.748
41005,2331143,"01:56:40,494","01:56:43,498",Did you know Spiderman\nreally is a spider?,did you know spiderman really is a spider?,8,neutral,1.000


In [13]:
data.to_csv('./data/labelled_data.csv', index=False)