# Label subtitles data

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from tokenizers import Tokenizer, normalizers
from tokenizers.normalizers import BertNormalizer, Replace, Strip
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


## Import files

In [2]:
files = Path('scripts/subtitles/data/csv/').glob('**/*.csv')
dfs = []

for f in files:
  print(f)
  dfs.append(pd.read_csv(f))

data = pd.concat(dfs, ignore_index=True)
data

scripts/subtitles/data/csv/apichatpong weerasethakul/Memoria.csv
scripts/subtitles/data/csv/apichatpong weerasethakul/Uncle Boonmee Who Can Recall His Past Lives.csv
scripts/subtitles/data/csv/rajkumar hirani/3 Idiots.csv
scripts/subtitles/data/csv/rajkumar hirani/PK.csv
scripts/subtitles/data/csv/christopher nolan/Insomnia.csv
scripts/subtitles/data/csv/christopher nolan/The Prestige.csv
scripts/subtitles/data/csv/christopher nolan/Batman Begins.csv
scripts/subtitles/data/csv/christopher nolan/Following.csv
scripts/subtitles/data/csv/christopher nolan/Tenet.csv
scripts/subtitles/data/csv/christopher nolan/Dunkirk.csv
scripts/subtitles/data/csv/christopher nolan/Memento.csv
scripts/subtitles/data/csv/christopher nolan/Inception.csv
scripts/subtitles/data/csv/christopher nolan/Interstellar.csv
scripts/subtitles/data/csv/hirokazu koreeda/Shoplifters.csv
scripts/subtitles/data/csv/hirokazu koreeda/_The Makanai_ Cooking for the Maiko House_ Kaeru.csv
scripts/subtitles/data/csv/hirokazu kor

Unnamed: 0,id,startAt,endAt,text
0,8399288,"00:07:51,875","00:07:53,750","- Hello, there.\n- Hi!"
1,8399288,"00:07:53,792","00:07:54,917",Hm.
2,8399288,"00:07:56,458","00:07:59,125",- It's so nice to see you here.\n- How are you...
3,8399288,"00:07:59,167","00:08:00,292",Hmm...
4,8399288,"00:08:01,375","00:08:03,458",- Sleepy.\n- Mm.
...,...,...,...,...
28963,2331143,"01:56:19,539","01:56:20,381",Come on.
28964,2331143,"01:56:21,808","01:56:22,650",Yes.
28965,2331143,"01:56:34,221","01:56:36,531","- Let's go inside.\n- Yes, let's go inside."
28966,2331143,"01:56:40,494","01:56:43,498",Did you know Spiderman\nreally is a spider?


## Normalize text

In [3]:
normalizer = normalizers.Sequence([
  BertNormalizer(),
  Replace('<i>', ''),
  Replace('</i>', ''),
  Replace('-', ''),
  Strip()
])
normalizer.normalize_str(data.loc[0]['text'])

'hello, there.  hi!'

In [4]:
data['normalizedText'] = data['text'].apply(normalizer.normalize_str)
data

Unnamed: 0,id,startAt,endAt,text,normalizedText
0,8399288,"00:07:51,875","00:07:53,750","- Hello, there.\n- Hi!","hello, there. hi!"
1,8399288,"00:07:53,792","00:07:54,917",Hm.,hm.
2,8399288,"00:07:56,458","00:07:59,125",- It's so nice to see you here.\n- How are you...,it's so nice to see you here. how are you doing?
3,8399288,"00:07:59,167","00:08:00,292",Hmm...,hmm...
4,8399288,"00:08:01,375","00:08:03,458",- Sleepy.\n- Mm.,sleepy. mm.
...,...,...,...,...,...
28963,2331143,"01:56:19,539","01:56:20,381",Come on.,come on.
28964,2331143,"01:56:21,808","01:56:22,650",Yes.,yes.
28965,2331143,"01:56:34,221","01:56:36,531","- Let's go inside.\n- Yes, let's go inside.","let's go inside. yes, let's go inside."
28966,2331143,"01:56:40,494","01:56:43,498",Did you know Spiderman\nreally is a spider?,did you know spiderman really is a spider?


In [5]:
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

In [6]:
data.loc[660]['normalizedText']

'no sign of tools, or they have decomposed.'

## Word count

In [7]:
data['wordCount'] = data['normalizedText'].apply(lambda str : len(str.split()))
data

Unnamed: 0,id,startAt,endAt,text,normalizedText,wordCount
0,8399288,"00:07:51,875","00:07:53,750","- Hello, there.\n- Hi!","hello, there. hi!",3
1,8399288,"00:07:53,792","00:07:54,917",Hm.,hm.,1
2,8399288,"00:07:56,458","00:07:59,125",- It's so nice to see you here.\n- How are you...,it's so nice to see you here. how are you doing?,11
3,8399288,"00:07:59,167","00:08:00,292",Hmm...,hmm...,1
4,8399288,"00:08:01,375","00:08:03,458",- Sleepy.\n- Mm.,sleepy. mm.,2
...,...,...,...,...,...,...
28963,2331143,"01:56:19,539","01:56:20,381",Come on.,come on.,2
28964,2331143,"01:56:21,808","01:56:22,650",Yes.,yes.,1
28965,2331143,"01:56:34,221","01:56:36,531","- Let's go inside.\n- Yes, let's go inside.","let's go inside. yes, let's go inside.",7
28966,2331143,"01:56:40,494","01:56:43,498",Did you know Spiderman\nreally is a spider?,did you know spiderman really is a spider?,8


## Sentimental classification

In [8]:
classifier = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", tokenizer="bert-base-uncased")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
data['sentiment'] = data['normalizedText'].apply(lambda str: classifier(str)[0])
data

Unnamed: 0,id,startAt,endAt,text,normalizedText,wordCount,sentiment
0,8399288,"00:07:51,875","00:07:53,750","- Hello, there.\n- Hi!","hello, there. hi!",3,"{'label': 'neutral', 'score': 0.8354941010475159}"
1,8399288,"00:07:53,792","00:07:54,917",Hm.,hm.,1,"{'label': 'neutral', 'score': 0.7507055401802063}"
2,8399288,"00:07:56,458","00:07:59,125",- It's so nice to see you here.\n- How are you...,it's so nice to see you here. how are you doing?,11,"{'label': 'negative', 'score': 0.5471012592315..."
3,8399288,"00:07:59,167","00:08:00,292",Hmm...,hmm...,1,"{'label': 'neutral', 'score': 0.7913289666175842}"
4,8399288,"00:08:01,375","00:08:03,458",- Sleepy.\n- Mm.,sleepy. mm.,2,"{'label': 'neutral', 'score': 0.8589414954185486}"
...,...,...,...,...,...,...,...
28963,2331143,"01:56:19,539","01:56:20,381",Come on.,come on.,2,"{'label': 'neutral', 'score': 0.8275664448738098}"
28964,2331143,"01:56:21,808","01:56:22,650",Yes.,yes.,1,"{'label': 'neutral', 'score': 0.8629628419876099}"
28965,2331143,"01:56:34,221","01:56:36,531","- Let's go inside.\n- Yes, let's go inside.","let's go inside. yes, let's go inside.",7,"{'label': 'neutral', 'score': 0.7763033509254456}"
28966,2331143,"01:56:40,494","01:56:43,498",Did you know Spiderman\nreally is a spider?,did you know spiderman really is a spider?,8,"{'label': 'neutral', 'score': 0.8166266679763794}"


In [10]:
data = data.join(pd.json_normalize(data['sentiment'])).drop('sentiment', axis='columns')
data

Unnamed: 0,id,startAt,endAt,text,normalizedText,wordCount,label,score
0,8399288,"00:07:51,875","00:07:53,750","- Hello, there.\n- Hi!","hello, there. hi!",3,neutral,0.835494
1,8399288,"00:07:53,792","00:07:54,917",Hm.,hm.,1,neutral,0.750706
2,8399288,"00:07:56,458","00:07:59,125",- It's so nice to see you here.\n- How are you...,it's so nice to see you here. how are you doing?,11,negative,0.547101
3,8399288,"00:07:59,167","00:08:00,292",Hmm...,hmm...,1,neutral,0.791329
4,8399288,"00:08:01,375","00:08:03,458",- Sleepy.\n- Mm.,sleepy. mm.,2,neutral,0.858941
...,...,...,...,...,...,...,...,...
28963,2331143,"01:56:19,539","01:56:20,381",Come on.,come on.,2,neutral,0.827566
28964,2331143,"01:56:21,808","01:56:22,650",Yes.,yes.,1,neutral,0.862963
28965,2331143,"01:56:34,221","01:56:36,531","- Let's go inside.\n- Yes, let's go inside.","let's go inside. yes, let's go inside.",7,neutral,0.776303
28966,2331143,"01:56:40,494","01:56:43,498",Did you know Spiderman\nreally is a spider?,did you know spiderman really is a spider?,8,neutral,0.816627


In [11]:
data.to_csv('scripts/subtitles/data/labelled_data.csv', index=False)