In [1]:
import os, re
import csv
import numpy as np
import pandas as pd
from collections import defaultdict
from bs4 import BeautifulSoup
import spacy
spacy.load('en_core_web_sm')
nlp = spacy.load('en')

## Sentence length

In [2]:
cornell_texts = []
with open('cornell_all_reviews.txt', 'r+') as fo:
    cornell_texts = fo.readlines()

In [3]:
cornell_sent_lengths = []
cornell_sent_count = []

for text in cornell_texts:
    strip = text.strip()
    if strip:
        doc = nlp(strip)
        num_words = 0
        num_sents = 0
        for s in doc.sents:
            num_words += len(s)
            num_sents += 1
        cornell_sent_lengths.append(num_words)
        cornell_sent_count.append(num_sents)

In [4]:
movies_texts = []
with open('movies_all_reviews.txt', 'r+') as fo:
    movies_texts = fo.readlines()

In [5]:
movies_sent_lengths = []
movies_sent_count = []

for text in movies_texts:
    strip = text.strip()
    if strip:
        doc = nlp(strip)
        num_words = 0
        num_sents = 0
        for s in doc.sents:
            num_words += len(s)
            num_sents += 1
        movies_sent_lengths.append(num_words)
        movies_sent_count.append(num_sents)

In [6]:
cornell_texts[:5]

['\n',
 '\n',
 "both films consist of a predominately male cast . both films follow young men as they illicitly fight the traditional system for their own desires . and both films are seen through the eyes of one narrator , who eventually realizes that these men have to be stopped . while boiler room writer/director ben younger does not get his point across as well as david fincher does for fight club , he does contribute another impressive work to a series of films aiming to represent the new generation . a generation which has seen the internet prosper and where everyone wants to be a millionaire . paying homage to oliver stone's 1987 classic wall street , younger is almost modernizing the tale by using younger , hipper actors to play the greedy villains as opposed to the older , more experienced types . as is true in real life , younger minds are becoming richer and richer from their knowledge of more standard technology . boiler room dismisses the notion of ingenuity and shows that

In [7]:
movies_texts[:5]

['Aug. 2, 2007 Watchers who warned that a six-year gap between installments would cost the \'Rush Hour\' franchise some energy \' especially from Jackie Chan, now in his early 50s \' aren\'t entirely off the mark. But the latest picture to feature one of the movies\' oddest crime-fighting tandems nevertheless stays true to the franchise formula of East-West fusion action, broad cultural comedy and international intrigue, this time largely in Paris. August rollout is like money in the bank for New Line, which will milk this likely final installment for maximum revenues down the ancillary stream. Though late summer timing is just right for the franchise, "Rush Hour 3" opens just a week after "The Bourne Ultimatum," and while auds may take some relief in the bouncy comic rapport between Chan and Chris Tucker, they\'re bound to find the action mild if not downright tame by comparison. The action bar has been raised to exceptional heights -- higher than even the great Chan can leap across. 

In [8]:
sum(cornell_sent_lengths) / sum(cornell_sent_count)

22.674497203740472

In [9]:
sum(movies_sent_lengths) / sum(movies_sent_count)

26.91007615532716

## Word length

In [10]:
cornell_df = pd.read_csv('pos/Cornell_pos.csv', index_col=0).dropna()

  mask |= (ar1 == a)


In [11]:
cornell_df.head()

Unnamed: 0,text_num,token,pos
0,1,magoo,NOUN
1,1,',PUNCT
2,1,was,VERB
3,1,by,ADP
4,1,far,ADV


In [12]:
movies_df = pd.read_csv('pos/Movies_pos.csv', index_col=0).dropna()

  mask |= (ar1 == a)


In [13]:
movies_df.head()

Unnamed: 0,text_num,token,pos
0,0,Tuesday,PROPN
1,0,",",PUNCT
2,0,Sep,PROPN
3,0,18,NUM
4,0,2007,NUM


In [14]:
cornell_df['token'].str.len().mean()

4.006302449392916

In [15]:
movies_df['token'].str.len().mean()

4.170183282992839

## Relative frequency of 'film' and 'pic' and 'I'

In [16]:
len(cornell_df[cornell_df['token'].str.lower() == 'film']) / len(cornell_df)

0.006090563036191126

In [17]:
len(movies_df[movies_df['token'].str.lower() == 'film']) / len(movies_df)

0.0025482685976582595

In [18]:
len(cornell_df[cornell_df['token'].str.lower() == 'pic']) / len(cornell_df)

4.678878539655167e-06

In [19]:
len(movies_df[movies_df['token'].str.lower() == 'pic']) / len(movies_df)

0.0006290527247239376

In [20]:
len(cornell_df[cornell_df['token'].str.lower() == 'i']) / len(cornell_df)

0.005574549574389156

In [21]:
len(movies_df[movies_df['token'].str.lower() == 'i']) / len(movies_df)

0.0012362382774443092

## Lexical diversity

In [22]:
lexdiv = []

for text in cornell_texts:
    strip = text.strip()
    if strip:
        doc = nlp(strip)

        numtokens = 0
        types = set()
        
        for s in doc.sents:
            for w in s:
                if not re.match('[^a-zA-Z]+$', w.text):
                    numtokens += 1
                    types.add(w.text)
                    
        lexdiv.append(len(types)/numtokens)

In [23]:
sum(lexdiv)/len(lexdiv)

0.5335532169077012

In [24]:
lexdiv = []

for text in movies_texts:
    strip = text.strip()
    if strip:
        doc = nlp(strip)

        numtokens = 0
        types = set()
        
        for s in doc.sents:
            for w in s:
                if not re.match('[^a-zA-Z]+$', w.text):
                    numtokens += 1
                    types.add(w.text)
                    
        lexdiv.append(len(types)/numtokens)

In [25]:
sum(lexdiv)/len(lexdiv)

0.6367346200111095

## Word and POS unigrams and bigrams

In [26]:
stopwords = pd.read_csv('smart.csv', header=None, names=['stopword']).drop_duplicates()

In [27]:
cornell_denominator = len(cornell_df['token'].str.isalpha().index)
movies_denominator = len(movies_df['token'].str.isalpha().index)

In [28]:
cornell_pos_counts = cornell_df['pos'].value_counts().reset_index().rename({'pos':'counts','index':'unigram'}, axis=1)
cornell_pos_counts['freq'] = cornell_pos_counts['counts'] / cornell_denominator * 1000000
cornell_pos_counts

Unnamed: 0,unigram,counts,freq
0,NOUN,337960,225896.255895
1,VERB,231400,154670.356297
2,PUNCT,207112,138435.984586
3,ADJ,155540,103964.681151
4,ADP,153692,102729.457217
5,DET,143253,95751.912492
6,ADV,98739,65998.255447
7,PRON,58717,39247.101602
8,CCONJ,47373,31664.644723
9,PART,40859,27310.614036


In [29]:
cornell_tok_counts = cornell_df['token'].value_counts().reset_index().rename({'token':'counts','index':'unigram'}, axis=1)

_alpha = cornell_tok_counts[cornell_tok_counts['unigram'].str.isalpha()]
_merged = _alpha.merge(stopwords, left_on='unigram', right_on='stopword', how='left', indicator=True)
cornell_tok_counts = _merged[_merged['_merge'] == 'left_only'][['unigram', 'counts']]
cornell_tok_counts['freq'] = cornell_tok_counts['counts'] / cornell_denominator * 1000000
cornell_tok_counts.head(10)

Unnamed: 0,unigram,counts,freq
14,film,9112,6090.563036
26,movie,5435,3632.81498
61,time,2319,1550.045619
62,good,2316,1548.040385
68,story,2090,1396.97945
70,character,1988,1328.801505
78,characters,1824,1219.182065
85,make,1596,1066.784307
90,life,1518,1014.648232
97,plot,1439,961.843746


In [30]:
cornell_df['next_pos'] = cornell_df['pos'].shift(-1)
cornell_df['next_token'] = cornell_df['token'].shift(-1)

_alpha = cornell_df[cornell_df['token'].str.isalpha() & cornell_df['next_token'].str.isalpha()]
_merged = _alpha.merge(stopwords, left_on='token', right_on='stopword', how='left', indicator='indicator1')
_merged = _merged.merge(stopwords, left_on='next_token', right_on='stopword', how='left', indicator='indicator2')
cornell_tok_bi_counts = _merged[(_merged['indicator1'] == 'left_only') & (_merged['indicator2'] == 'left_only')][['token', 'next_token']]
cornell_tok_bi_counts['bigram'] = cornell_tok_bi_counts['token'].str.cat(cornell_tok_bi_counts['next_token'], sep=' ')
cornell_tok_bi_counts = cornell_tok_bi_counts['bigram'].value_counts().reset_index().rename({'bigram':'counts','index':'bigram'}, axis=1)
cornell_tok_bi_counts['freq'] = cornell_tok_bi_counts['counts'] / cornell_denominator * 1000000

cornell_tok_bi_counts.head(10)

Unnamed: 0,bigram,counts,freq
0,special effects,372,248.648974
1,high school,163,108.951029
2,star wars,157,104.940562
3,takes place,115,76.86729
4,star trek,115,76.86729
5,supporting cast,111,74.193645
6,science fiction,109,72.856823
7,running time,105,70.183178
8,years ago,100,66.841122
9,action sequences,94,62.830655


In [31]:
cornell_df['bigram'] = cornell_df['pos'].str.cat(cornell_df['next_pos'], sep=' ')
cornell_pos_bi_counts = cornell_df['bigram'].value_counts().reset_index().rename({'bigram':'counts','index':'bigram'}, axis=1)

cornell_pos_bi_counts['freq'] = cornell_pos_bi_counts['counts'] / cornell_denominator * 1000000

cornell_pos_bi_counts.head(10)

Unnamed: 0,bigram,counts,freq
0,NOUN PUNCT,109997,73523.228961
1,ADJ NOUN,86144,57579.616131
2,DET NOUN,84451,56447.995936
3,NOUN ADP,63355,42347.19284
4,ADP DET,59983,40093.310206
5,NOUN VERB,55045,36792.695602
6,NOUN NOUN,47858,31988.824164
7,PRON VERB,40849,27303.929924
8,DET ADJ,39131,26155.599448
9,VERB DET,38103,25468.472714


In [32]:
_lowered = movies_df
_lowered['token'] = movies_df['token'].str.lower()
movies_df = _lowered

In [33]:
movies_pos_counts = movies_df['pos'].value_counts().reset_index().rename({'pos':'counts','index':'unigram'}, axis=1)
movies_pos_counts['freq'] = movies_pos_counts['counts'] / movies_denominator * 1000000
movies_pos_counts

Unnamed: 0,unigram,counts,freq
0,NOUN,887201,188355.11523
1,PUNCT,711281,151006.834659
2,VERB,606224,128702.956123
3,ADP,496493,105406.775044
4,ADJ,474065,100645.251416
5,PROPN,432579,91837.66406
6,DET,407317,86474.474747
7,ADV,245750,52173.373979
8,CCONJ,146270,31053.507271
9,PART,126144,26780.704323


In [34]:
movies_tok_counts = movies_df['token'].value_counts().reset_index().rename({'token':'counts','index':'token'}, axis=1)

_alpha = movies_tok_counts[movies_tok_counts['token'].str.isalpha()]
_merged = _alpha.merge(stopwords, left_on='token', right_on='stopword', how='left', indicator=True)
movies_tok_counts = _merged[_merged['_merge'] == 'left_only'][['token', 'counts']]

movies_tok_counts['freq'] = movies_tok_counts['counts'] / movies_denominator * 1000000

movies_tok_counts.head(10)

Unnamed: 0,token,counts,freq
26,movie,12853,2728.725842
28,film,12003,2548.268598
52,time,6225,1321.583939
71,director,4767,1012.046689
72,story,4743,1006.951425
73,man,4715,1001.006951
76,life,4632,983.385832
89,good,3745,795.07339
90,world,3691,783.609047
99,make,3294,699.324899


In [39]:
movies_df['next_pos'] = movies_df['pos'].shift(-1)
movies_df['next_token'] = movies_df['token'].shift(-1)

_alpha = movies_df[movies_df['token'].str.isalpha() & movies_df['next_token'].str.isalpha()]
_merged = _alpha.merge(stopwords, left_on='token', right_on='stopword', how='left', indicator='indicator1')
_merged = _merged.merge(stopwords, left_on='next_token', right_on='stopword', how='left', indicator='indicator2')
movies_tok_bi_counts = _merged[(_merged['indicator1'] == 'left_only') & (_merged['indicator2'] == 'left_only')][['token', 'next_token']]
movies_tok_bi_counts['bigram'] = movies_tok_bi_counts['token'].str.cat(movies_tok_bi_counts['next_token'], sep=' ')
movies_tok_bi_counts = movies_tok_bi_counts['bigram'].value_counts().reset_index().rename({'bigram':'counts','index':'bigram'}, axis=1)

movies_tok_bi_counts['freq'] = movies_tok_bi_counts['counts'] / movies_denominator * 1000000
movies_tok_bi_counts.head(10)

Unnamed: 0,bigram,counts,freq
0,running time,1074,228.013036
1,high school,618,131.203032
2,mpaa rating,582,123.560137
3,film festival,578,122.710926
4,los angeles,544,115.492637
5,production designer,529,112.308097
6,adult guardian,498,105.726715
7,accompanying parent,496,105.30211
8,requires accompanying,495,105.089807
9,years ago,428,90.86553


In [53]:
movies_df['bigram'] = movies_df['pos'].str.cat(movies_df['next_pos'], sep=' ')
movies_pos_bi_counts = movies_df['bigram'].value_counts().reset_index().rename({'bigram':'counts','index':'bigram'}, axis=1)

movies_pos_bi_counts['freq'] = movies_pos_bi_counts['counts'] / movies_denominator * 1000000

movies_pos_bi_counts.head(10)

Unnamed: 0,bigram,counts,freq
0,NOUN PUNCT,284686,60439.589602
1,ADJ NOUN,255576,54259.459728
2,DET NOUN,211206,44839.591555
3,NOUN ADP,208092,44178.481132
4,ADP DET,177829,37753.566313
5,PROPN PUNCT,139840,29688.401291
6,PROPN PROPN,135472,28761.063356
7,NOUN VERB,118256,25106.061092
8,DET ADJ,114339,24274.471648
9,NOUN NOUN,107177,22753.960134


In [37]:
cornell_pos_counts.round(2).to_csv('Cornell_POS.csv', index=None)
cornell_pos_bi_counts.round(2).to_csv('Cornell_POS_bigrams.csv', index=None)
cornell_tok_counts.round(2).to_csv('Cornell_tokens.csv', index=None)
cornell_tok_bi_counts.round(2).to_csv('Cornell_token_bigrams.csv', index=None)

In [54]:
movies_pos_counts.round(2).to_csv('Movies_POS.csv', index=None)
movies_pos_bi_counts.round(2).to_csv('Movies_POS_bigrams.csv', index=None)
movies_tok_counts.round(2).to_csv('Movies_tokens.csv', index=None)
movies_tok_bi_counts.round(2).to_csv('Movies_token_bigrams.csv', index=None)

In [48]:
print(len(movies_tok_bi_counts[movies_tok_bi_counts['counts'] == 1]))
print(len(movies_tok_bi_counts[movies_tok_bi_counts['counts'] == 1]) / movies_denominator * 1000000)

378457
80347.42053352928


In [50]:
print(len(cornell_tok_bi_counts[cornell_tok_bi_counts['counts'] == 1]))
print(len(cornell_tok_bi_counts[cornell_tok_bi_counts['counts'] == 1]) / cornell_denominator * 1000000)

95138
63591.30664367332
