In [1]:
import pandas as pd

### Data

#### Import Data

In [37]:
char_cols = ["charID", "charName", "movieID", "mTitle", "sex", "credPos"]
characters = pd.read_csv(
    "./data/initial_data/movie_characters_metadata.tsv", 
    sep="\t",
    header=None,
    on_bad_lines = 'warn'
)
characters.columns=char_cols

Skipping line 6565: expected 6 fields, saw 13



In [38]:
characters.head()

Unnamed: 0,charID,charName,movieID,mTitle,sex,credPos
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,?
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,?
4,u4,JOEY,m0,10 things i hate about you,m,6


In [39]:
convers_cols = ["firstCharID", "secondCharID", "movieID", "utterances"]
conversations = pd.read_csv(
    "data/initial_data/movie_conversations.tsv", 
    sep="\t",
    header=None,
    on_bad_lines = 'warn'
)
conversations.columns = convers_cols

In [40]:
conversations["conv_list"] = conversations["utterances"].str.findall(r"[L]\d{1,}")

In [41]:
conversations.head()

Unnamed: 0,firstCharID,secondCharID,movieID,utterances,conv_list
0,u0,u2,m0,['L194' 'L195' 'L196' 'L197'],"[L194, L195, L196, L197]"
1,u0,u2,m0,['L198' 'L199'],"[L198, L199]"
2,u0,u2,m0,['L200' 'L201' 'L202' 'L203'],"[L200, L201, L202, L203]"
3,u0,u2,m0,['L204' 'L205' 'L206'],"[L204, L205, L206]"
4,u0,u2,m0,['L207' 'L208'],"[L207, L208]"


In [42]:
lines_cols = ["lineID", "characterID", "movieID", "charName", "aText"]
lines = pd.read_csv(
    "data/initial_data/movie_lines.tsv", 
    sep="\t",
    header=None,
    on_bad_lines = 'warn'
)
lines.columns=lines_cols

Skipping line 32288: expected 5 fields, saw 7
Skipping line 32351: expected 5 fields, saw 6
Skipping line 32390: expected 5 fields, saw 6
Skipping line 32583: expected 5 fields, saw 6
Skipping line 32585: expected 5 fields, saw 6
Skipping line 35684: expected 5 fields, saw 6
Skipping line 62132: expected 5 fields, saw 6
Skipping line 86637: expected 5 fields, saw 6
Skipping line 86722: expected 5 fields, saw 6
Skipping line 86914: expected 5 fields, saw 6
Skipping line 86960: expected 5 fields, saw 6
Skipping line 87010: expected 5 fields, saw 6
Skipping line 87011: expected 5 fields, saw 6
Skipping line 87086: expected 5 fields, saw 6
Skipping line 120607: expected 5 fields, saw 6
Skipping line 120719: expected 5 fields, saw 7
Skipping line 120739: expected 5 fields, saw 6
Skipping line 120783: expected 5 fields, saw 6
Skipping line 130284: expected 5 fields, saw 7
Skipping line 131048: expected 5 fields, saw 6

Skipping line 150955: expected 5 fields, saw 8
Skipping line 162777: expe

In [43]:
lines.sample(5)

Unnamed: 0,lineID,characterID,movieID,charName,aText
256785,L549915,u7883,m533,JACKIE,You think <u>I</u> didn't have some dirty word...
236515,L473112,u7311,m494,DEIRDRE,I'm a patriate.
46671,L295842,u1413,m94,BERT,Level with me Eddie. You ever play billiards b...
126403,L53511,u3801,m251,CHRIS,That makes you an expert I guess.
147939,L127405,u4440,m294,MISS DUNLOP,Hi Jody. What can I DO for you?


In [44]:
title_cols = ["movieID", "title", "year", "rating", "no.votes", "genres"]
titles = pd.read_csv(
    "data/initial_data/movie_titles_metadata.tsv", 
    sep="\t",
    header=None,
    on_bad_lines = 'warn'
)
titles.columns=title_cols

In [45]:
titles.head()

Unnamed: 0,movieID,title,year,rating,no.votes,genres
0,m0,10 things i hate about you,1999,6.9,62847.0,['comedy' 'romance']
1,m1,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
2,m2,15 minutes,2001,6.1,25854.0,['action' 'crime' 'drama' 'thriller']
3,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
4,m4,48 hrs.,1982,6.9,22289.0,['action' 'comedy' 'crime' 'drama' 'thriller']


#### PrepareData

In [46]:
MAX_DIALOG_LEN = 10
def get_line(line_list: list[str]):
    df = lines.loc[lines['lineID'].isin(line_list)]
    chars = df["charName"].unique()
    dialog_line = df["aText"][::-1].values # reversed because pandas sort that way
    dialog_line = dialog_line[:MAX_DIALOG_LEN]
    return dialog_line, chars

In [47]:
dialog_line_chars = conversations["conv_list"].apply(lambda x: get_line(x))

In [48]:
df = conversations.copy()
df_merged = df.merge(titles, left_on="movieID", right_on="movieID", how="left")

In [87]:
df_final = df_merged[["title", "genres"]].copy()
df_final["lines_chars"] = dialog_line_chars

In [94]:
def try_eval(string):
    try:
        return pd.eval(string)
    except:
        pass
df_final['genres'] = df_final['genres'].str.replace(" ", ",").apply(try_eval);

In [95]:
df_final.head()

Unnamed: 0,title,genres,lines_chars
0,10 things i hate about you,"[comedy, romance]",([Can we make this quick? Roxanne Korrine and...
1,10 things i hate about you,"[comedy, romance]",([You're asking me out. That's so cute. What'...
2,10 things i hate about you,"[comedy, romance]",([No no it's my fault -- we didn't have a prop...
3,10 things i hate about you,"[comedy, romance]","([Why?, Unsolved mystery. She used to be real..."
4,10 things i hate about you,"[comedy, romance]",([Gosh if only we could find Kat a boyfriend.....


In [96]:
df_final.to_parquet("data/prepared_data/prepared.parquet")

### Ray Serve Realisation

In [1]:
import pandas as pd
import numpy as np
from transformers import pipeline 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_parquet("data/prepared_data/prepared.parquet")

In [3]:
arr = list(data["lines_chars"][0][0])
string = data["lines_chars"][0][0][0]
# frame = pd.DataFrame(data["lines_chars"][0][0])
# batch = [arr,arr]

In [4]:
import ray 
from ray import serve
from starlette.requests import Request

ray.init()
serve.start()

2023-06-16 23:12:35,903	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


<ray.serve._private.client.ServeControllerClient at 0x7fa781e41480>

In [11]:
import json
@serve.deployment(route_prefix="/")
class sentiment_pipeline:

    def __init__(self):
        self.pipeline = pipeline(
            model="finiteautomata/bertweet-base-sentiment-analysis"
        )

    def _unswer(self, text):
        return self.pipeline(text)[0]

    def __call__(self, request: Request) -> str:
        data = request.query_params["data"]
        text = json.loads(data)
        unswers = []
        if isinstance(text, list):
            for i, value in enumerate(text):
                unswer = self._unswer(text)
                unswer["sentence"] = i
                unswers.append(unswer)
        elif isinstance(text, str):
            unswer = self._unswer(text)
            unswer["sentence"] = 0
            unswers.append(unswer)
        else:
            raise TypeError(f"Unexpected type")
        
        return json.dumps(unswers)

sentiment_pipeline.deploy()

[2m[36m(ServeController pid=28708)[0m INFO 2023-06-16 23:15:42,245 controller 28708 deployment_state.py:1298 - Deploying new version of deployment sentiment_pipeline.
[2m[36m(ServeReplica:sentiment_pipeline pid=29039)[0m INFO 2023-06-16 23:15:42,245 controller 28708 deployment_state.py:1298 - Deploying new version of deployment sentiment_pipeline.
[2m[36m(ServeReplica:sentiment_pipeline pid=29039)[0m INFO 2023-06-16 23:15:42,245 controller 28708 deployment_state.py:1298 - Deploying new version of deployment sentiment_pipeline.
[2m[36m(ServeReplica:sentiment_pipeline pid=29039)[0m INFO 2023-06-16 23:15:42,245 controller 28708 deployment_state.py:1298 - Deploying new version of deployment sentiment_pipeline.
[2m[36m(ServeReplica:sentiment_pipeline pid=29039)[0m INFO 2023-06-16 23:15:42,245 controller 28708 deployment_state.py:1298 - Deploying new version of deployment sentiment_pipeline.
[2m[36m(ServeReplica:sentiment_pipeline pid=29039)[0m INFO 2023-06-16 23:15:42,245 

In [None]:
import requests
import json

In [17]:
# for string
request = requests.get("http://127.0.0.1:8000/", params={"data": json.dumps(string)})
json.loads(request.text)

[{'label': 'NEG', 'score': 0.9783918261528015, 'sentence': 0}]

[2m[36m(ServeReplica:sentiment_pipeline pid=29110)[0m INFO 2023-06-16 23:17:13,580 sentiment_pipeline sentiment_pipeline#xTlmBN CGUcfFvequ / replica.py:654 - __CALL__ OK 139.5ms


In [18]:
# for array
request = requests.get("http://127.0.0.1:8000/", params={"data": json.dumps(arr)})
json.loads(request.text)

[{'label': 'NEG', 'score': 0.9783918261528015, 'sentence': 0},
 {'label': 'NEG', 'score': 0.9783918261528015, 'sentence': 1},
 {'label': 'NEG', 'score': 0.9783918261528015, 'sentence': 2},
 {'label': 'NEG', 'score': 0.9783918261528015, 'sentence': 3}]

[2m[36m(ServeReplica:sentiment_pipeline pid=29110)[0m INFO 2023-06-16 23:17:19,939 sentiment_pipeline sentiment_pipeline#xTlmBN jNbaRVOziT / replica.py:654 - __CALL__ OK 1980.2ms


In [23]:
arr = list(data["lines_chars"][100][0])
print(arr)

# for array
request = requests.get("http://127.0.0.1:8000/", params={"data": json.dumps(arr)})
print(json.loads(request.text))

['Yeah', 'What do you think?']
[{'label': 'POS', 'score': 0.7530627846717834, 'sentence': 0}, {'label': 'POS', 'score': 0.7530627846717834, 'sentence': 1}]


[2m[36m(ServeReplica:sentiment_pipeline pid=29110)[0m INFO 2023-06-16 23:20:05,614 sentiment_pipeline sentiment_pipeline#xTlmBN dBmlOGZylM / replica.py:654 - __CALL__ OK 303.7ms
