In [53]:
import pandas as pd

# Path prefixes (not directories): dataset files are addressed by appending a
# dot-separated name, e.g. f'{META_PATH}._raw.csv' or f'{IMDB_PATH}.test.csv'.
META_PATH: str = '../data/metacritic'
IMDB_PATH: str = '../data/imdb'

In [54]:
# Read the full Metacritic dataset from disk and preview its first rows.
raw_csv_path: str = f'{META_PATH}._raw.csv'
raw: pd.DataFrame = pd.read_csv(raw_csv_path)
raw.head()

Unnamed: 0,title,metascore,userscore,summary
0,The Godfather,100,9.2,\n Francis Ford Coppola...
1,Citizen Kane,100,8.4,\n Following the death ...
2,Rear Window,100,8.7,\n A wheelchair-bound p...
3,Casablanca,100,8.8,"\n A Casablanca, Morocc..."
4,Boyhood,100,7.4,\n Filmed over 12 years...


In [55]:
# format: scale metascore by 0.01 and userscore by 0.1 (100 -> 1.0, 9.2 -> 0.92),
# store 'tbd' user scores as 0.0, strip newline characters from the summaries,
# drop fully duplicated rows and index the result by title.
# NOTE(review): only rows identical in every column are dropped, so the same
# title can still appear more than once in the index.
formatted: pd.DataFrame = (
    pd.DataFrame(
        {
            'title': raw['title'],
            'metascore': raw['metascore'] * 0.01,
            'userscore': (
                raw['userscore'].replace('tbd', 0.0).astype('float32') * 0.1
            ),
            'summary': raw['summary'].str.replace(r'\n', '', regex=True),
        }
    )
    .drop_duplicates()
    .set_index('title')
)
formatted.head()

Unnamed: 0_level_0,metascore,userscore,summary
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Godfather,1.0,0.92,Francis Ford Coppola's...
Citizen Kane,1.0,0.84,Following the death of...
Rear Window,1.0,0.87,A wheelchair-bound pho...
Casablanca,1.0,0.88,"A Casablanca, Morocco ..."
Boyhood,1.0,0.74,Filmed over 12 years w...


In [56]:
# Read the IMDB test file (review text plus sentiment label) and preview it.
test_csv_path: str = f'{IMDB_PATH}.test.csv'
test: pd.DataFrame = pd.read_csv(test_csv_path)
test.head()

Unnamed: 0,review,sentiment
0,This is not great cinema. The film is cliche r...,negative
1,"Personnaly I really loved this movie, and it p...",positive
2,"First, the current IMDb plot description seems...",negative
3,"If you want a serious laugh pain, watch this m...",positive
4,"Carlito Way, the original is a brilliant story...",negative


In [57]:
def maps(row, catalog=None, min_title_len: int = 8):
    """Average the scores of every catalog title mentioned in a review.

    A title counts as mentioned when it occurs verbatim (case-sensitive
    substring match) inside ``row['review']``. Titles of ``min_title_len``
    characters or fewer are ignored. A title that appears several times in
    the catalog index contributes once per occurrence.

    Parameters
    ----------
    row : mapping or pd.Series
        Must provide a ``'review'`` entry holding the review text.
    catalog : pd.DataFrame, optional
        Frame indexed by title with ``'metascore'`` and ``'userscore'``
        columns. Defaults to the notebook-level ``formatted`` frame, looked
        up at call time.
    min_title_len : int, default 8
        Only titles strictly longer than this are considered.

    Returns
    -------
    tuple
        ``(mean metascore, mean userscore, matched titles)``;
        ``(0.0, 0.0, [])`` when nothing matches or the review is not a string.
        Note that "no match" is therefore indistinguishable from a genuine
        average of 0.0 unless the matches list is inspected.
    """
    if catalog is None:
        catalog = formatted

    review = row['review']
    # A missing review (e.g. NaN) cannot contain any title; treat as no match
    # instead of letting the `in` operator raise TypeError.
    if not isinstance(review, str):
        return 0.0, 0.0, []

    m_scores: list = []
    u_scores: list = []
    matches: list = []

    # Walk the index and the two score columns in lockstep rather than using
    # iterrows(), which builds a full Series for every catalog row.
    for title, m_score, u_score in zip(
        catalog.index, catalog['metascore'], catalog['userscore']
    ):
        # Cheap filters first: skip non-string titles (NaN) and short titles.
        if not isinstance(title, str) or len(title) <= min_title_len:
            continue
        if title in review:
            m_scores.append(m_score)
            u_scores.append(u_score)
            matches.append(title)

    if not matches:
        return 0.0, 0.0, []

    return (
        sum(m_scores) / len(m_scores),
        sum(u_scores) / len(u_scores),
        matches
    )

# Score only the first N_PREVIEW reviews (presumably to keep the run short,
# since maps() walks the whole title catalog for every review).
# Rows beyond N_PREVIEW end up with NaN in the three new columns because the
# assignment aligns on the index.
N_PREVIEW: int = 64
test[['metascore', 'userscore', 'matches']] = (
    test.iloc[:N_PREVIEW].apply(maps, axis=1, result_type='expand')
)
test.head(N_PREVIEW)

Unnamed: 0,review,sentiment,metascore,userscore,matches
0,This is not great cinema. The film is cliche r...,negative,0.00,0.000,[]
1,"Personnaly I really loved this movie, and it p...",positive,0.00,0.000,[]
2,"First, the current IMDb plot description seems...",negative,0.30,0.655,"[The Hitcher, The Hitcher]"
3,"If you want a serious laugh pain, watch this m...",positive,0.00,0.000,[]
4,"Carlito Way, the original is a brilliant story...",negative,0.59,0.845,"[Carlito's Way, Lucky Number Slevin]"
...,...,...,...,...,...
59,The dazzling seventeen-minute dance sequence o...,positive,0.77,0.715,"[An American in Paris, On the Town]"
60,This show is amazing! I love each and every ep...,positive,0.00,0.000,[]
61,This Columbo episode is probably noted more fo...,positive,0.00,0.000,[]
62,Jonathan Rivers (Michael Keaton) suddenly beco...,negative,0.30,0.540,[White Noise]


In [58]:
# test.to_csv('test.csv')