## Imports

In [1]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt

## Tokenizer

In [2]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize

def split_movie_script(movie_file, sentence=True, pad=500):
    ''' Gets a movie script name as an input. Cleans \n chars.
    If sentence == True, returns a dataFrame of the sentences of the script
    If sentence = False, returns a dataFrame of chunks of n=pad words from the script (with punctuation, no tokenization, no lemmatization)
    '''

    path = '/home/alemedeiros_/code/sebhochart/movie-sentiment-analysis/raw_data/screenplay_data/data/raw_texts/raw_texts/'

    # reading the movie script
    movie_script = open(path + movie_file,'r').read()

    # cleaning \n chars
    movie_script_cleaned = ' '.join(movie_script.split('\n'))

    # returning the DataFrame of sentences
    if sentence == True:
        return pd.DataFrame(sent_tokenize(movie_script_cleaned))

    # returning a DataFrame of n=pad words
    else:

        # split by word (keeping punctuation)
        movie_script_words = movie_script_cleaned.split(' ')

        # removing empty words
        while '' in movie_script_words:
            movie_script_words.remove('')

        # creating the chunks
        chunks = []
        for i in range(0, len(movie_script_words), pad):
            chunks.append(' '.join(movie_script_words[i:i+pad]))

        return pd.DataFrame(chunks)


In [4]:
data = split_movie_script('Up_1049413.txt')
data = data.rename(columns={0:'script'})
data

Unnamed: 0,script
0,...
1,A 1930's NEWSREEL.
2,NEWSREEL ANNOUNCER (V.O.)
3,"""Movietown News"" presents..."
4,Spotlight on Adventure!
...,...
2760,CARL Maybe I need new glas...
2761,"Overhead, MUNTZ'S DIRIGIBLE is parked, its lad..."
2762,DISSOLVE TO: EXT.
2763,PARADISE FALLS - AFTERNOON ...


In [6]:
from vader import model

In [7]:
model(data,n=5)

Unnamed: 0,script,neu,neg,pos,compound
0,...,0.874,0.000,0.126,0.3802
1,The mysterious SOUTH AMERICAN JUNGLE.A massive...,0.871,0.044,0.085,0.4926
2,Who would dare set foot on...,1.000,0.000,0.000,0.0000
3,NEWSREEL ANNOUNCER (V.O.)The beloved explorer ...,0.792,0.077,0.131,0.4753
4,This lighter-than-air craft was ...,0.898,0.000,0.102,0.6114
...,...,...,...,...,...
548,Ready everybody?The Camp M...,0.940,0.000,0.060,0.3612
549,Russell has chocolate and Carl has butter-bric...,0.921,0.000,0.079,0.0516
550,CARL Red one.100.Russell G...,1.000,0.000,0.000,0.0000
551,"RUSSELL Mr. Fredricksen, y...",0.641,0.128,0.231,0.4466


In [12]:
k = 100
ran = np.arange(0, len(data), k)
ran

aa = data.iloc[:5,0].values
b = ''
for a in aa:
    b += a
b

'                                                     UP                                                Written by                               Pete Docter, Bob Peterson &amp; Thomas McCarthy                                                                                            1.A 1930\'s NEWSREEL.NEWSREEL ANNOUNCER (V.O.)"Movietown News" presents...Spotlight on Adventure!'

In [13]:
result = []
for r in ran:
    aux = ''
    sentences = data.iloc[r:r+k,0]
    for sen in sentences:
        aux += sen
    result.append(aux)
result = pd.DataFrame(result, columns=['script'])

## Model

In [20]:
analyzer = SentimentIntensityAnalyzer()
result['result'] = result.script.apply(lambda x: analyzer.polarity_scores(x))
result['neg'] = result.result.apply(lambda x: x['neg'])
result['neu'] = result.result.apply(lambda x: x['neu'])
result['pos'] = result.result.apply(lambda x: x['pos'])
result['compound'] = result.result.apply(lambda x: x['compound'])

In [23]:
result = result.drop(columns='result')
result

Unnamed: 0,script,neg,neu,pos,compound
0,...,0.025,0.876,0.099,0.994
1,Mount Rushmore!Hard to sta...,0.038,0.93,0.032,-0.6663
2,"Ellie whispers, as if to protect a N...",0.027,0.88,0.093,0.9913
3,Carl gives it a try and points out a ...,0.054,0.827,0.119,0.9942
4,"He grabs his cane, with four tennis balls stuc...",0.02,0.893,0.088,0.993
5,Carl closes the door in Russell's face.INT.CAR...,0.095,0.857,0.048,-0.9723
6,INT.COURTROOM - DAY A G...,0.012,0.894,0.094,0.9945
7,He CHUCKLES and settles into his chair.He clos...,0.042,0.872,0.085,0.9763
8,A huge lightning bolt flash lights up the room...,0.023,0.913,0.064,0.9775
9,"MOUNTAIN, ROCKY LANDSCAPE ...",0.039,0.894,0.067,0.9514


In [None]:
# plt.plot(result.index, result.neg)
# plt.plot(result.index, result.pos)
# plt.plot(data.index, data.neu)
plt.plot(result.index, result.compound)
plt.show()