In [68]:
import pandas as pd
import os
def avg(l): return sum(l)/len(l)

'''
counts the length of rationales
'''
def count_rationale_length(df, batch=True):
    if batch:
        rationale_dict = {}
        for doc_evidences in df['evidences']:
            for evidence in doc_evidences:
                evidence = evidence[0]
                rationale_length = evidence['end_token'] - evidence['start_token']
                assert (rationale_length == len(evidence['text'].split()))
                doc_id = evidence['docid']
                if doc_id not in rationale_dict.keys(): rationale_dict[doc_id] = rationale_length
                else: rationale_dict[doc_id] += rationale_length
        return list(rationale_dict.values())
    else:
        rationale_len = []
        for doc_evidences in df['evidences']:
            for evidence in doc_evidences:
                rationale_length = evidence[0]['end_token'] - evidence[0]['start_token']
                assert (rationale_length == len(evidence[0]['text'].split()))
                rationale_len.append(rationale_length)
        return rationale_len

'''
counts the average length of the text files
'''
def count_text_length(doc_path):
    def count_file_length(file):
        f = open(file, 'rb')   
        text_length = 0
        for line in f.readlines():
            text_length += len(line.rstrip().split())
        return text_length

    text_dict = {}
    data_dir = directory + 'docs'
    for filename in os.listdir(data_dir):
        f = os.path.join(data_dir, filename)
        if os.path.isfile(f):
            text_dict[filename] = count_file_length(f)
    text_lengths = list(text_dict.values())
    return avg(text_length)
#     return text_length

In [142]:
'''
converts rationales to binary masks over text
1 to include a token, 0 to exclude
'''
def evidence_to_mask(tokens, evidence_list):
    mask = [1]*len(tokens)
    for evidence in evidence_list:
        if type(evidence) is list:
            evidence = evidence[0]
        if type(evidence) is not dict:
            print("???")
            return
        else:
            start_token = evidence['start_token']
            end_token = evidence['end_token']
            for i in range(start_token, end_token):
                mask[i] = 0
    return mask  

'''
converts dataframe to csv with only the texts, labels and rationale masks
'''
def to_data_df(df):
    data_df = []
    columns = ['text', 'classification', 'rationale']
    for i in range(len(df)):
        df_row = df.loc[i]
        text_id = df_row['annotation_id']
        evidence_list = df_row['evidences']
        classification = df_row['classification']
        
        file = directory + 'docs/' + text_id
        if os.path.isfile(file):
            f = open(file, 'r') 
            text = ''
            for line in f.readlines():
                text += line.rstrip() + ' '
        else:
            print("???")
        
        tokens = text.split()
        rationale_mask = evidence_to_mask(tokens, evidence_list)
        
        data_df.append([text, classification, rationale_mask])
    data_df = pd.DataFrame(data_df, columns=columns)
    return data_df

'''
truncates a data_df so that each segment contains at least 1 rationale
'''
def truncate(data_df, lim=512):
    for i in range(len(data_df)):
        row = data_df.iloc[i]
        text = row['text']
        tokens = text.split()
        if len(text) > lim:
            rationale = row['rationale']
            start = rationale.index(0)
            # exceeds limit
            if len(text)-1-start > lim:
                end = start + lim
            # not enough
            else:
                start = len(text) - 1 - lim
                end = -1
                
            trunc_rationale = rationale[start:end]
            trunc_tokens = tokens[start:end]
            trunc_text = ' '.join([str(token) for token in trunc_tokens])
            data_df.iloc[i]['rationale'] = trunc_rationale
            data_df.iloc[i]['text'] = trunc_text      
            print(len(data_df.iloc[i]['rationale']))
    return data_df

In [145]:
data_df = to_data_df(val)
data_df.to_csv('val.csv')
text_len = []
for i in range(len(data_df)):
    row = data_df.iloc[i]
    text = row['rationale']
    text_len.append(len(text))
text_len

[701,
 340,
 475,
 1086,
 497,
 767,
 454,
 1785,
 512,
 259,
 699,
 1013,
 615,
 1014,
 441,
 1435,
 753,
 300,
 1011,
 745,
 1030,
 581,
 667,
 608,
 555,
 541,
 1058,
 371,
 368,
 881,
 270,
 1009,
 1248,
 816,
 1357,
 874,
 530,
 764,
 884,
 675,
 400,
 807,
 894,
 348,
 916,
 707,
 862,
 970,
 677,
 1088,
 588,
 621,
 843,
 652,
 284,
 440,
 421,
 128,
 906,
 556,
 627,
 351,
 830,
 566,
 777,
 975,
 681,
 823,
 284,
 822,
 456,
 1570,
 786,
 604,
 1201,
 179,
 835,
 731,
 805,
 480,
 415,
 321,
 860,
 947,
 571,
 540,
 661,
 276,
 930,
 945,
 823,
 956,
 867,
 1880,
 663,
 398,
 644,
 485,
 572,
 440,
 812,
 1199,
 1116,
 980,
 926,
 923,
 787,
 1488,
 845,
 1020,
 1072,
 867,
 1318,
 525,
 303,
 828,
 765,
 1607,
 1070,
 1506,
 554,
 658,
 786,
 504,
 465,
 181,
 673,
 647,
 748,
 647,
 785,
 643,
 739,
 864,
 802,
 278,
 1311,
 1030,
 524,
 966,
 640,
 251,
 663,
 406,
 1076,
 1221,
 1195,
 285,
 598,
 725,
 478,
 632,
 631,
 701,
 787,
 655,
 897,
 520,
 1045,
 943,
 1029,
 44

In [144]:
data_df = truncate(data_df)

83
80
295
512
450
512
191
512
404
253
512
512
512
512
215
512
512
179
512
512
512
445
512
448
512
464
425
349
284
512
63
512
512
512
512
512
486
512
512
512
395
512
512
331
512
512
512
512
512
512
512
512
512
512
256
274
376
98
512
361
121
200
512
512
443
512
512
377
170
512
344
512
512
473
512
168
512
512
512
321
345
264
512
512
455
512
512
275
512
512
512
444
512
512
512
389
512
366
473
384
512
512
512
512
512
512
340
512
512
512
512
416
512
458
247
512
512
512
512
512
512
512
512
344
284
110
311
512
512
512
512
469
357
494
336
138
512
512
303
481
512
190
512
390
512
512
512
173
512
512
278
225
512
249
396
459
512
163
512
512
512
390
512
431
335
512
194
485
512
512
512
370
512
512
273
512
303
344
512
461
512
126
512
46
512
274
317
512
512
512
512
512
512
512
475
512
512
512
254
420


In [139]:
data_df

Unnamed: 0,text,classification,rationale
0,nothing really worth watching . only die - har...,NEG,"[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"overacts his psycho routine . unfortunately , ...",NEG,"[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,borders on caricature but goes beyond it . it ...,NEG,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"the plodding , tedious opening sequence finall...",NEG,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,the audience is left in the dark . the center ...,NEG,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
195,"i liked these scenes best , because the politi...",POS,"[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
196,a fascinating look into the last days in the l...,POS,"[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
197,a poignant and clever drama . a conflict gradu...,POS,"[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
198,"turn in outstanding supporting performances , ...",POS,"[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [140]:
text_len = []
for i in range(len(data_df)):
    row = data_df.iloc[i]
    text = row['rationale']
    text_len.append(len(text))
avg(text_len)

431.15

In [141]:
text_len

[83,
 80,
 295,
 512,
 450,
 512,
 191,
 512,
 404,
 253,
 512,
 512,
 512,
 512,
 215,
 512,
 512,
 179,
 512,
 512,
 512,
 445,
 512,
 448,
 512,
 464,
 425,
 349,
 284,
 512,
 63,
 512,
 512,
 512,
 512,
 512,
 486,
 512,
 512,
 512,
 395,
 512,
 512,
 331,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 256,
 274,
 376,
 98,
 512,
 361,
 121,
 200,
 512,
 512,
 443,
 512,
 512,
 377,
 170,
 512,
 344,
 512,
 512,
 473,
 512,
 168,
 512,
 512,
 512,
 321,
 345,
 264,
 512,
 512,
 455,
 512,
 512,
 275,
 512,
 512,
 512,
 444,
 512,
 512,
 512,
 389,
 512,
 366,
 473,
 384,
 512,
 512,
 512,
 512,
 512,
 512,
 340,
 512,
 512,
 512,
 512,
 416,
 512,
 458,
 247,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 344,
 284,
 110,
 311,
 512,
 512,
 512,
 512,
 469,
 357,
 494,
 336,
 138,
 512,
 512,
 303,
 481,
 512,
 190,
 512,
 390,
 512,
 512,
 512,
 173,
 512,
 512,
 278,
 225,
 512,
 249,
 396,
 459,
 512,
 163,
 512,
 512,
 512,
 390,
 512,
 431,
 335,
 512,
 194,
 4

In [None]:
directory = 'data/movies/'

In [12]:
train_f_suffix = 'train'
test_f_suffix = 'test'
val_f_suffix = 'val'

In [14]:
train = pd.read_json(directory + train_f_suffix + '.jsonl', lines=True)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   annotation_id   1600 non-null   object 
 1   classification  1600 non-null   object 
 2   evidences       1600 non-null   object 
 3   query           1600 non-null   object 
 4   query_type      0 non-null      float64
dtypes: float64(1), object(4)
memory usage: 62.6+ KB


In [15]:
test = pd.read_json(directory + test_f_suffix + '.jsonl', lines=True)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   annotation_id   199 non-null    object 
 1   classification  199 non-null    object 
 2   docids          0 non-null      float64
 3   evidences       199 non-null    object 
 4   query           199 non-null    object 
 5   query_type      0 non-null      float64
dtypes: float64(2), object(4)
memory usage: 9.5+ KB


In [16]:
val = pd.read_json(directory + val_f_suffix + '.jsonl', lines=True)
val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   annotation_id   200 non-null    object 
 1   classification  200 non-null    object 
 2   evidences       200 non-null    object 
 3   query           200 non-null    object 
 4   query_type      0 non-null      float64
dtypes: float64(1), object(4)
memory usage: 7.9+ KB


In [48]:
train.iloc[0]['evidences']

[[{'docid': 'negR_000.txt',
   'end_sentence': 6,
   'end_token': 67,
   'start_sentence': 5,
   'start_token': 63,
   'text': 'mind - fuck movie'}],
 [{'docid': 'negR_000.txt',
   'end_sentence': 21,
   'end_token': 375,
   'start_sentence': 20,
   'start_token': 371,
   'text': 'the sad part is'}],
 [{'docid': 'negR_000.txt',
   'end_sentence': 13,
   'end_token': 203,
   'start_sentence': 12,
   'start_token': 196,
   'text': 'downshifts into this " fantasy " world'}],
 [{'docid': 'negR_000.txt',
   'end_sentence': 17,
   'end_token': 318,
   'start_sentence': 16,
   'start_token': 309,
   'text': 'i get kind of fed up after a while'}],
 [{'docid': 'negR_000.txt',
   'end_sentence': 35,
   'end_token': 698,
   'start_sentence': 34,
   'start_token': 696,
   'text': 'pretty redundant'}],
 [{'docid': 'negR_000.txt',
   'end_sentence': 11,
   'end_token': 187,
   'start_sentence': 10,
   'start_token': 182,
   'text': "it 's simply too jumbled"}],
 [{'docid': 'negR_000.txt',
   'end_se

In [61]:
data_df = to_data_df(val)
data_df.to_csv('val.csv')

In [63]:
val.iloc[0]['evidences']

[[{'docid': 'negR_800.txt',
   'end_sentence': 32,
   'end_token': 700,
   'start_sentence': 31,
   'start_token': 692,
   'text': 'definitely the cinematic equivalent of a sleeper car'}],
 [{'docid': 'negR_800.txt',
   'end_sentence': 28,
   'end_token': 622,
   'start_sentence': 27,
   'start_token': 618,
   'text': 'nothing really worth watching'}]]

In [27]:
train_dict = count_rationale_length(train)
test_dict = count_rationale_length(test)
val_dict = count_rationale_length(val)

# rationale_dict = {**train_dict, **test_dict, **val_dict}
# rationale_len = list(rationale_dict.values())
rationale_len = train_dict + test_dict + val_dict
sum(rationale_len)/len(rationale_len)

60.749249249249246

In [148]:
def build_rationale_dict(df):
    rationale_dicts = []
    for i in range(len(df)):
        row = df.loc[i]
        classification = row['classification']
        for rationale in row['evidences']:
            rationale = rationale[0]
            text = rationale['text']
            doc_id = rationale['docid']
            rationale_length = rationale['end_token'] - rationale['start_token']
            assert (rationale_length == len(text.split()))
            rationale_dicts.append({'text':text, 'doc_id':doc_id, 'rationale_length':rationale_length, 'class':classification})
            
    return rationale_dicts

rationale_dicts = build_rationale_dict(train) + build_rationale_dict(test) + build_rationale_dict(val)

neg_lengths = []
pos_lengths = []
neg_ratios = []
pos_ratios = []
for rationale in rationale_dicts:
    text_len = text_dict[rationale['doc_id']]
    rationale_len = rationale['rationale_length']
    ratio = rationale_len/text_len
    if rationale['class'] == 'REFUTES': 
        neg_lengths.append(rationale_len)
        neg_ratios.append(ratio)
    else: 
        pos_lengths.append(rationale_len)
        pos_ratios.append(ratio)

In [146]:
avg(pos_ratios)

0.1428665268469391

In [54]:
avg(neg_ratio)

0.09976097922381488

In [55]:
avg(pos_ratio)

0.07092435780061514

In [155]:
text_lengths = list(text_dict.values())
rationale_lengths = list(rationale_dict.values())

mean_rationale_length = avg(rationale_lenth)
mean_text_length = avg(text_lengths)
mean_rationale_percent = sum(rationale_lengths)/sum(text_lengths)
mean_rationale_length_class = [avg(neg_lengths), avg(pos_lengths)]
mean_rationale_percent_class = [avg(neg_ratios), avg(pos_ratios)]
mean_text_length_class = 0
mean_text_length_z = mean_text_length
mean_text_length_all = mean_text_length
mean_text_length_train = 0
mean_text_length_dev = 0
mean_text_length_test = 0
mean_rationale_length_z = 0

In [161]:
stats_path = directory + 'stats.csv'
stats = pd.read_csv(stats_path)
scifact = ['SciFact', 
          ['REFUTES', 'SUPPORTS'],
          len(train),
          len(test),
          len(val),
          'RC,',
          mean_rationale_length,
          mean_text_length,
          mean_rationale_percent,
          mean_rationale_length_class,
          mean_rationale_percent_class,
          mean_text_length_class,
          mean_text_length_z,
          mean_text_length_all,
          mean_text_length_train,
          mean_text_length_dev,
          mean_text_length_test,
          mean_rationale_length_z]
stats.loc[0] = scifact
stats.to_csv(stats_path)

ValueError: Must have equal len keys and value when setting with an iterable

In [157]:
scifact

['SciFact',
 ['REFUTES', 'SUPPORTS'],
 405,
 188,
 100,
 'RC,',
 74.84439359267735,
 265.5903890160183,
 0.2818038479101867,
 [38.20302375809935, 34.74519230769231],
 [0.13399179869568198, 0.1428665268469391],
 0,
 265.5903890160183,
 265.5903890160183,
 0,
 0,
 0,
 0]

In [25]:
a = lambda s: s.strip("[").strip("]").split()

In [28]:
a('[definitely the cinematic equivalent of a sleeper car", "nothing really worth watching]')

['definitely',
 'the',
 'cinematic',
 'equivalent',
 'of',
 'a',
 'sleeper',
 'car",',
 '"nothing',
 'really',
 'worth',
 'watching']

In [2]:
len([    0, 40776,  4832,    80,  6066,  8571,   213,     7,    10,  2352,
          537,  2156,  4076,     8,   172,  1305,   479, 10010,   120,    88,
           41,  3213,   479,  1264,     9,     5,  1669,  8524,  2156,    53,
           39,  6096,  1388,     7,   192,   123,    11,    69,   301,  2156,
            8,    34, 31634,   479, 12196,   128,    29,     5,   432, 17487,
        11018,     5,  1569,     8,    22,  2345,   102,    22,   465,    66,
          479,   479,   479, 35031,  5150,  4832,    10,  1508,   111, 26536,
         1569,    13,     5,  6066,  2706,    14, 12325,    15,    10,   182,
         3035,  1114,  2156,    53,  6822,    24,    11,    10,   182,  1099,
         3737,   479,  5488,    16,    99,   817,    42,  1551,    41,   190,
         4851,    65,     7,  3116,  2156,   187,   939,  3489, 19477,  3541,
           61,  2120,     7,  1108,     5, 16140,  2156,  7319,    19,   110,
          471,     8,   215,    36,   685,  6418,   359,   475,  6285,   139,
         4839,  2156,    53,    89,    32,   205,     8,  1099,  1319,     9,
          442,    70,  3505,     9,  3541,  2156,     8,   209,  5450,    95,
          222,   295,    75, 29650,    42,    65, 12461,     4, 10010,  2045,
            7,    33,   551,    42,  1256, 19427,  4286,  2156,    53,  9390,
           24, 23213,   479,  2527,    99,    32,     5,  1272,    19,     5,
         1569, 17487,  3056,  2156,    63,  1049,   936,    16,    14,    24,
          128,    29,  1622,   350,  1236, 10434,     4,   405,  2012,   160,
           22,  2340,    22,    53,   172,   159,  1193, 22833,    88,    42,
           22,  8235,    22,   232,    11,    61,    47,  2156,    25,    41,
         2437,   919,  2156, 11990,   117,  1114,    99,   128,    29,   164,
           15,     4,  8585,    32,  7416,  2156,    89,    32,  3768,   567,
          124,    31,     5,  1462,  2156,    89,    32,   643,    54,   356,
          101,     5,  1462,  2156,    89,    32,  7782, 41736,  8237,  2156,
           89,    32, 14529,  5332,  2156,    89,    32,    10,   784, 41242,
         9210,     9,  7859,  5422,  2156,    89,    32,  7741,     9,  7735,
          383,    14,  1369,  2156,     8,   144,     9,    24,    16,  1622,
           45,  2002,   479,  8310,   939,  5636,   109,   295,    75,  1508,
          667,     7, 21697,    10,   822,   358,   122,     8,   172,  2156,
           53,    77,    70,    24,   473,    16,   492,   162,     5,   276,
        18664,    81,     8,    81,   456,  2156,   939,   120,   761,     9,
         9789,    62,    71,    10,   150,  2156,    61,    16,    42,   822,
          128,    29,   934,   936,   479,   405,   128,    29,  3334,   300,
           42,   380,  3556,     7,  7433,  2156,    53,    24,  1302,     7,
          236,     7,  7433,    24,  2198,   454,    63,   507,   292,   728,
          479,   463,   109,    51,   146,   383, 11110,  2156, 16208,    50,
          190,  7580,  2156,    11,     5, 10299, 17487,  3654,   269,   479,
          627,  5074,   233,    16,    14,     5, 27899,     8,   939,   258,
         8512,    15,   856, 36562,   101,    42,  2156,    98,    52,   888,
        11464,   144,     9,    24,    66,    30,     5,   457,   111,   169,
          477,  2156,    98,    70,     9,     5, 31083, 14186,    71,    14,
          222,   386,     7,   146,    10,   410,   828,     9,  1472,  2156,
           53,    24,   202,   222,   295,    75,     5,   146,     5,   822,
           70,    14,    55, 11110,   479,   118,  4443,     5,  2576,   516,
           19,  4133,   101,    42,    16,    14,    47,   197,   460,   146,
          686,    14,     5,  2437,    16,    22,    88,    24,    22,   190,
          137,    51,    32,   576,     5,  3556, 14844,     7,  2914,   110,
          232,     9,  2969,   479,   118,  1266,  2156,  2018, 15352, 12109,
        17929,   991,  8690,   878,   409,    31, 28420,    13,    59,   291,
          728,     2])

512

In [39]:
s="""plot : two teen couples go to a church party , drink and then drive .
they get into an accident .
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares .
what 's the deal ?
watch the movie and " sorta " find out . . .
critique : a mind - fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package .
which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just did n't snag this one correctly
.
they seem to have taken this pretty neat concept , but executed it terribly .
so what are the problems with the movie ?
well , its main problem is that it 's simply too jumbled
.
it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member ,
have no idea what 's going on
.
there are dreams , there are characters coming back from the dead , there are others who look like the dead , there are strange apparitions , there are disappearances , there are a looooot of chase scenes , there are tons of weird things that happen , and most of it is simply not explained .
now i personally do n't mind trying to unravel a film every now and then , but when all it does is give me the same clue over and over again , i get kind of fed up after a while , which is this film 's biggest problem .
it 's obviously got this big secret to hide , but it seems to want to hide it completely until its final five minutes .
and do they make things entertaining , thrilling or even engaging , in the meantime ?
not really .
the sad part is that the arrow and i both dig on flicks like this , so we actually figured most of it out by the half - way point , so all of the strangeness after that did start to make a little bit of sense , but it still did n't the make the film all that more entertaining .
i guess the bottom line with movies like this is that you should always make sure that the audience is " into it " even before they are given the secret password to enter your world of understanding .
i mean , showing melissa sagemiller running away from visions for about 20 minutes throughout the movie is just plain lazy ! !
okay , we get it .
. .
there are people chasing her and we do n't know who they are .
do we really need to see it over and over again ?
how about giving us different scenes offering further insight into all of the strangeness going down in the movie ?
apparently , the studio took this film away from its director and chopped it up themselves , and it shows .
there might 've been a pretty decent teen mind - fuck movie in here somewhere , but i guess " the suits " decided that turning it into a music video with little edge , would make more sense .
the actors are pretty good for the most part , although wes bentley just seemed to be playing the exact same character that he did in american beauty , only in a new neighborhood .
but my biggest kudos go out to sagemiller , who holds her own throughout the entire film , and actually has you feeling her character 's unraveling .
overall , the film does n't stick
because it does n't entertain , it 's confusing , it rarely excites and
it feels pretty redundant for most of its runtime , despite a pretty cool ending and explanation to all of the craziness that came before it .
oh ,
and by the way , this is not a horror or teen slasher flick . . .
it 's just packaged to look that way because someone is apparently assuming that the genre is still hot with the kids .
it also wrapped production two years ago and has been sitting on the shelves ever since .
whatever . .
. skip it !
where 's joblo coming from ?
a nightmare of elm street 3 ( 7/10 ) - blair witch 2 ( 7/10 ) - the crow ( 9/10 ) - the crow : salvation ( 4/10 )
- lost highway ( 10/10 ) - memento ( 10/10 ) - the others ( 9/10 ) - stir of echoes ( 8/10 )"""

In [7]:
s="""the happy bastard 's quick movie review damn
that y2k bug .
it 's got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on .
little do they know the power within . . .
going for the gore and bringing on a few action sequences here and there , virus
still feels very empty , like a movie going for all flash and no substance .
we do n't know why the crew was really out in the middle of nowhere , we do n't know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we do n't know why donald sutherland is stumbling around drunkenly throughout .
here , it 's just
" hey , let 's chase these people around with some robots " .
the acting is below average , even from the likes of curtis .
you 're more likely to get a kick out of her work in halloween h20 .
sutherland is wasted and baldwin , well , he 's acting like a baldwin , of course .
the real star here are stan winston 's robot design , some schnazzy cgi , and the occasional good gore shot , like picking into someone 's brain .
so , if robots and body parts really turn you on , here 's your movie .
otherwise , it 's pretty much a sunken ship of a movie ."""

In [8]:
len(s.rstrip().split())

291

In [10]:
len([    0,   627,  1372, 43143,   128,    29,  2119,  1569,  1551, 16490,
         6025,  1423,   176,   330, 13673,   479,   405,   128,    29,   300,
           10,   471,   386,    11,    42,  1569,  8996, 11914,   324,  2084,
          242,  5350, 42334,     8,   277, 24876,  5640,  2138,    36,    40,
         6009,    42,    86,  4839,    11,    10,   527,  2624,    10,  3419,
            9,    10, 24447, 14859,    14,   606,   420,    10, 31712,   910,
        42472,  2903,  3627,    14,    34,    10, 31083, 14186,     7,    24,
           77,    51,  3151,     5,   476,   124,    15,   479, 27635,   109,
           51,   216,     5,   476,   624,   479,   479,   479, 12891,    13,
            5, 43739,     8,  2406,    15,    10,   367,   814, 26929,   259,
            8,    89,  2156,  6793, 17830,  2653,   182,  5802,  2156,   101,
           10,  1569,   164,    13,    70,  7462,     8,   117,  6572,   479,
         1694,   109,   295,    75,   216,   596,     5,  3419,    21,   269,
           66,    11,     5,  1692,     9,  9261,  2156,    52,   109,   295,
           75,   216,     5,  9813,     9,    99,   362,    81,     5,  3627,
           36,    95,    14,    10,   380,  6907, 31005,   631,   478,     5,
        13235,  4839,  2156,     8,  2156,     9,   768,  2156,    52,   109,
          295,    75,   216,   596,   218,  5618,   579, 48588,    16, 28019,
          198, 19835,   352,  1328,   479, 10859,  2156,    24,   128,    29,
           95,   113, 17232,  2156,   905,   128,    29,  7859,   209,    82,
          198,    19,   103, 12129,    22,   479,   627,  3501,    16,   874,
          674,  2156,   190,    31,     5,  3829,     9,  5350, 42334,   479,
         6968,   128,   241,    55,   533,     7,   120,    10,  3151,    66,
            9,    69,   173,    11,  5179,  1722, 14102,  1368,   844,   479,
           29, 48588,    16, 14260,     8, 24876,  5640,  2156,   157,  2156,
           37,   128,    29,  3501,   101,    10, 24876,  5640,  2156,     9,
          768,   479,   627,   588,   999,   259,    32,  1690,   260,   339,
         6712,   128,    29,  9916,  1521,  2156,   103,   579, 13212,  1222,
         5144,   740, 15696,  2156,     8,     5, 12577,   205, 43739,   738,
         2156,   101,  6201,    88,   951,   128,    29,  2900,   479,  2527,
         2156,   114, 12129,     8,   809,  1667,   269,  1004,    47,    15,
         2156,   259,   128,    29,   110,  1569,   479,  7443, 10715,  2156,
           24,   128,    29,  1256,   203,    10, 21168,   225,  3627,     9,
           10,  1569,   479,     2])

334