# Text Processing 

In [None]:
%%bash
pip install TextBlob
pip install python-rake
conda install gensim

In [70]:
import pandas as pd
import numpy as np
from gensim import corpora, models, similarities
import textblob as tb
import nltk
from nltk.corpus import stopwords as sw
from nltk.stem import *
import urllib2
from gensim import corpora, models, similarities
import slugify as sl
import pickle
import re
import RAKE
from ast import literal_eval

## Load, clean and filter our data

In [71]:
dropbox = "/Users/mr/Dropbox/moviemeta/"

In [75]:
movieplots = pd.read_csv(dropbox + 'movieplots.csv')

In [81]:
def clean(row):
    '''apply on rows of a dataframe to clean up'''
    #this is because CSV conversion has converted list into string
    row['plots'] = literal_eval(row['plots'])
    try:
        row['year'] = int(row['year'])
        return row
    except:
        row['year'] = np.nan
        return row

In [77]:
movieplots = movieplots.apply(clean, axis = 1)

In [6]:
dftouse = movieplots[movieplots.year > 2014]

In [7]:
print movieplots.shape
print dftouse.shape
dftouse.head()

(259028, 5)
(16943, 5)


Unnamed: 0,title,source,year,plots,type
2,#1 at the Apocalypse Box Office (2015),imdb,2015,"[Jules is, self declared, the most useless per...",user plot
9,#50Fathers (2015),imdb,2015,[#50Fathers is an American Dramatic Comedy. Do...,user plot
15,#BeRobin the Movie (2015),imdb,2015,[A documentary about Margaret Cho's homeless o...,user plot
16,#Beings (2015),imdb,2015,[Beings is the second feature film of Stefanes...,user plot
20,#Disneyland60 (2015),imdb,2015,"[Kate, a 15 year old college student documents...",user plot


## Process the movie plots

First we define some functions that we are going to use to process the plots

In [8]:
def slugify (text):
    """replace special characters with ascii, see https://github.com/un33k/python-slugify"""
    return sl.slugify(text)

In [10]:
def stem(word):
    """stem a word with Porter Stemmer"""
    return PorterStemmer().stem(word)

In [11]:
num = re.compile('\d')
def contains_number(word):
    """check if a word contains a number"""
    return bool(num.search(word))

In [12]:
Rake = RAKE.Rake('/Users/mr/Devel/Harvard/CS109/Project/moviemeta/data/stoplists/FoxStoplist.txt')
def keywords(text):
    """extract keywords from text using RAKE algorithm"""
    keywords = Rake.run(text)
    return ' '.join([tup[0] for tup in keywords])

In [None]:
def sentences(text):
    """ tokenize text into sentences using nltk's punkt tokenizer"""
    blob = tb.TextBlob(text.decode('unicode-escape', 'ignore'))
    return blob.sentences

In [59]:
def process(sents, movie_id=None, stop=False, postags=None ):
    """process a list of sentences
    
    Apply stemming, removal of numbers, slugifying to text
    Optionally remove stop words, only include words with a certain POS tag
    
    Args:
        sents (list): sentences
        movie_id (int): movie id of the document that is processed
        stop (bool): remove stopwords
        postags(list): list of POS tags
    Returns:
        tuple containing two represantions of a daocument:
        A list of tagged sentences (e.g. for doc2vec) and a list of words (e.g. for topic detection)
    """
    if stop:
        stopwords = set(sw.words('english'))
    doc_sents  = []
    doc_words  = []
    for sent in sents:
        if postags:
            words = sent.tags
        else:
            words = sent.words
        words_processed = []
        for word in words:
            if postags:
                if word[1] not in postags:
                    continue
                else:
                    word = word[0]
            if stop and word in stopwords:
                    continue
            if contains_number(word):
                    continue
            word = stem(slugify(word))
            if len(word) < 2:
                continue
            words_processed.append(word)
        doc_sents.append(models.doc2vec.LabeledSentence(words_processed,[movie_id]))
        doc_words += words_processed 
    return (doc_sents, doc_words)

Now we do the actual processing. For topic detection we apply keyword extraction, as we want to use only use those words that carry meaning. We then apply normalization and transform the plots into lists of words, that will later be transformed into bags of words for topic detection.
For doc2vec we return lists of tagged sentences, as this is the input for a doc2vec training corpus.

In [96]:
def process_td(row):
    """process a row of dataframe for topic detection"""
    return process(sentences(keywords(' '.join(row))))[1]
documents_tm = dftouse['plots'].apply(process_td)

In [60]:
def process_d2v(row):
    movie = row[0]
    plots = ' '.join(row[1])
    return  process(sentences(plots), movie_id=movie, stop=True)[0]
documents_d2v = dftouse[['title','plots']].apply(process_d2v, axis=1)

### Example

In [107]:
print dftouse['plots'][2]
print
print documents_tm[2]
print
print documents_d2v[2]

['Jules is, self declared, the most useless person in the post apocalyptic world, until he finds an old film camera and determines to make the greatest movie in the new world... the only movie in the new world. But his first day filming is proving to be much more difficult than he imagined. Who knew that making a movie after the end of the world would be so hard?']

[u'post', u'apocalypt', u'world', u'day', u'film', u'film', u'camera', u'useless', u'person', u'self', u'declar', u'world', u'determin', u'movi', u'prove', u'jule', u'imagin', u'hard', u'difficult']

[TaggedDocument(words=[u'jule', u'self', u'declar', u'useless', u'person', u'post', u'apocalypt', u'world', u'find', u'old', u'film', u'camera', u'determin', u'make', u'greatest', u'movi', u'new', u'world', u'movi', u'new', u'world'], tags=['#1 at the Apocalypse Box Office (2015)']), TaggedDocument(words=[u'but', u'first', u'day', u'film', u'prove', u'much', u'difficult', u'imagin'], tags=['#1 at the Apocalypse Box Office (2015

### Save to disc

In [None]:
def save(docs, name):
    with open(dropbox + name + '.list', 'wb') as f:
        for plot in docs:
            f.write("%s\n" % plot)

In [105]:
#save the documents
save(documents_tm, 'imdb_plots_since_2014.list')
save(documents_d2v, 'imdb_plots_since_2014_d2v.list')

In [114]:
#save the corresponding movies
movies = dftouse['title'].values
with open(dropbox +'imdb_movies_since_2014.pickle', 'wb') as f:
    np.save(f,movies)

### Testing

In [108]:
doc = """St. Johns, Newfoundland is a city with a deep musical spirit and few bands embody that spirit better that The Once. Named for a unique Newfoundland phrase that means "imminently," now is indeed their time. Live at the Stagehouse is an hour of interviews and performances by this amazing trio, featuring songs recorded live off the floor in late 2013 at the picturesque Stagehouse Recording Studio in St. Philip's. The Once embraces a different vision of Newfoundland music. Their sounds do not come from the noisy pubs and dockside taverns that fuel so much of the Island's energy. Instead, their music comes from a quieter and more thoughtful place. Hope and tragedy are intertwined, whether they are singing an old lament from World War I, original songs that speak of love defeated, or tasteful songs from the artists whose music inspires them. The Once: Live at the Stagehouse features some of their best-loved tunes and some never heard before. It is a unique opportunity to see one of Canada's fastest rising bands in the place that inspires them and feeds their soul."""

In [111]:
test = process(sentences(doc))
testpostags = process(sentences(doc),postags=['FW', 'JJ','JJR','JJS','NN','NNS','NNP', 'NNPS','VB','VBD','VBG','VBN','VBP','VBZ'])
testkeywords = process(sentences(keywords(doc)))

In [113]:
print test
print
print testkeywords
print
print testpostags

([TaggedDocument(words=[u'st', u'john', u'newfoundland', u'is', u'citi', u'with', u'deep', u'music', u'spirit', u'and', u'few', u'band', u'embodi', u'that', u'spirit', u'better', u'that', u'the', u'onc'], tags=[None]), TaggedDocument(words=[u'name', u'for', u'uniqu', u'newfoundland', u'phrase', u'that', u'mean', u'immin', u'now', u'is', u'inde', u'their', u'time'], tags=[None]), TaggedDocument(words=[u'live', u'at', u'the', u'stagehous', u'is', u'an', u'hour', u'of', u'interview', u'and', u'perform', u'by', u'thi', u'amaz', u'trio', u'featur', u'song', u'record', u'live', u'off', u'the', u'floor', u'in', u'late', u'at', u'the', u'picturesqu', u'stagehous', u'record', u'studio', u'in', u'st', u'philip'], tags=[None]), TaggedDocument(words=[u'the', u'onc', u'embrac', u'differ', u'vision', u'of', u'newfoundland', u'music'], tags=[None]), TaggedDocument(words=[u'their', u'sound', u'do', u'not', u'come', u'from', u'the', u'noisi', u'pub', u'and', u'docksid', u'tavern', u'that', u'fuel', u's

In [None]:
#process with spark
'''import findspark
findspark.init()
import pyspark
conf = (pyspark.SparkConf()
    .setMaster('local')
    .setAppName('pyspark')
    .set("spark.executor.memory", "2g"))
sc = pyspark.SparkContext(conf=conf)
from pyspark.sql import SQLContext
sqlsc=SQLContext(sc)
plots_sdf = sqlsc.createDataFrame(dftouse)
plots = (plots_sdf[['title','plots']]
    .map(lambda x : (' '.join(x[1])))
)
docs = (plots
    .map(sentences)
    .map(process)          
).cache()
docs_sents = docs.map(lambda x : x[0]).collect()
docs_words = docs.map(lambda x : x[1]).collect()
'''