# Text Processing 

In [None]:
%%bash
pip install TextBlob
pip install python-rake
conda install gensim

In [136]:
import pandas as pd
import numpy as np
import textblob as tb
import nltk
from nltk.corpus import stopwords as sw
from nltk.stem import *
import urllib2
from gensim import models
import slugify as sl
import pickle
import re
import RAKE
from ast import literal_eval
import os

## Load, clean and filter our data

In [138]:
dropbox_dir = "/Users/mr/Dropbox/moviemeta/"
#dropbox = urllib2.urlopen('https://www.dropbox.com/sh/bhrp12eqlj3zw0f/AAAlf4VJED2JXHLL6yUzuWoea?dl=0')

In [144]:
imdb_df = pd.read_csv(dropbox + 'movieplots.csv')
wiki_df = pd.DataFrame()
for year in range(2000, 2015):
    df = pd.read_table(os.path.join(dropbox_dir, "wikipedia", "wikipedia_plots_%d.csv" % year))
    wiki_df = pd.concat([wiki_df, df], ignore_index=True)
print imdb_df.shape
print wiki_df.shape

(259028, 5)
(26180, 12)


In [145]:
def clean_imdb_df(row):
    '''apply on rows of imdb dataframe to clean up'''
    #concatanate multiple plots into one
    row['plot'] = '\n'.join(literal_eval(row['plots']))
    try:
        row['year'] = int(row['year'])
        return row
    except:
        row['year'] = np.nan
        return row

In [221]:
imdb_df = imdb_df.apply(clean_imdb_df, axis = 1)
wiki_df.rename(columns = {'wiki_plot':'plot'}, inplace=True)
imdb_df[['title','year']].to_csv(dropbox + 'imdb_meta_df.csv')
wiki_df[['title','year', 'countries']].to_csv(dropbox + 'wiki_meta_df.csv')

In [220]:
imdb_df.head()

Unnamed: 0,title,source,year,plots,type,plot
0,#1 Cheerleader Camp (2010) (V),imdb,2010,"[""When they're hired to work at a cheerleading...",user plot,When they're hired to work at a cheerleading c...
1,#1 Serial Killer (2013),imdb,2013,"[""Years of seething rage against the racism he...",user plot,Years of seething rage against the racism he's...
2,#1 at the Apocalypse Box Office (2015),imdb,2015,"['Jules is, self declared, the most useless pe...",user plot,"Jules is, self declared, the most useless pers..."
3,#137 (2011),imdb,2011,"['#137 is a SCI/FI thriller about a girl, Marl...",user plot,"#137 is a SCI/FI thriller about a girl, Marla,..."
4,#29 (2012),imdb,2012,"[""In #29, the constant zooming into certain la...",user plot,"In #29, the constant zooming into certain land..."


In [214]:
wiki_df.head()

Unnamed: 0.1,Unnamed: 0,year,title,languages,countries,released,gross,comment,abstract,dbpediaLink,wikipediaLink,plot
0,0,2000,Well-Founded Fear,English,United States,As CNN Presents: Asylum in America,,Well-Founded Fear is a 2000 documentary film f...,Well-Founded Fear is a 2000 documentary film f...,http://dbpedia.org/resource/Well-Founded_Fear,http://en.wikipedia.org/wiki/Well-Founded_Fear,"On average, only one in two hundred asylum app..."
2,2,2000,Thenali,Tamil,India,2000-10-26,,Thenali is a 2000 Indian Tamil comedy-drama fi...,Thenali is a 2000 Indian Tamil comedy-drama fi...,http://dbpedia.org/resource/Thenali,http://en.wikipedia.org/wiki/Thenali,Thenali Soman (Kamal Hassan) is a man who fear...
6,6,2000,The Season of Men,"Arabic, French","France, Tunisia",,,The Season of Men (Arabic: موسم الرجال (فيلم)‎...,The Season of Men (Arabic: موسم الرجال (فيلم)‎...,http://dbpedia.org/resource/The_Season_of_Men,http://en.wikipedia.org/wiki/The_Season_of_Men,"An 18-year-old on the island Djerba, Aicha, is..."
7,7,2000,Beautiful Mistake,English,Wales,,,Beautiful Mistake (Welsh: Camgymeriad Gwych) i...,Beautiful Mistake (Welsh: Camgymeriad Gwych) i...,http://dbpedia.org/resource/Beautiful_Mistake_...,http://en.wikipedia.org/wiki/Beautiful_Mistake...,This documentary film follows a group of Welsh...
8,8,2000,Ready to Rumble,English,United States,Australia,12452362.0,Ready to Rumble is a 2000 American comedy film...,Ready to Rumble is a 2000 American comedy film...,http://dbpedia.org/resource/Ready_to_Rumble,http://en.wikipedia.org/wiki/Ready_to_Rumble,"For most of their lives, Sewage workers Gordie..."


## Process the movie plots

First we define some functions that we are going to use to process the plots

In [8]:
def slugify (text):
    """replace special characters with ascii, see https://github.com/un33k/python-slugify"""
    return sl.slugify(text)

In [10]:
def stem(word):
    """stem a word with Porter Stemmer"""
    return PorterStemmer().stem(word)

In [11]:
num = re.compile('\d')
def contains_number(word):
    """check if a word contains a number"""
    return bool(num.search(word))

In [12]:
Rake = RAKE.Rake('/Users/mr/Devel/Harvard/CS109/Project/moviemeta/data/stoplists/FoxStoplist.txt')
def keywords(text):
    """extract keywords from text using RAKE algorithm"""
    keywords = Rake.run(text)
    return ' '.join([tup[0] for tup in keywords])

In [None]:
def sentences(text):
    """ tokenize text into sentences using nltk's punkt tokenizer"""
    blob = tb.TextBlob(text.decode('unicode-escape', 'ignore'))
    return blob.sentences

In [59]:
def process(sents, movie_id=None, stop=False, postags=None ):
    """process a list of sentences
    
    Apply stemming, removal of numbers, slugifying to text
    Optionally remove stop words, only include words with a certain POS tag
    
    Args:
        sents (list): sentences
        movie_id (int): movie id of the document that is processed
        stop (bool): remove stopwords
        postags(list): list of POS tags
    Returns:
        tuple containing two represantions of a daocument:
        A list of tagged sentences (e.g. for doc2vec) and a list of words (e.g. for topic detection)
    """
    if stop:
        stopwords = set(sw.words('english'))
    doc_sents  = []
    doc_words  = []
    for sent in sents:
        if postags:
            words = sent.tags
        else:
            words = sent.words
        words_processed = []
        for word in words:
            if postags:
                if word[1] not in postags:
                    continue
                else:
                    word = word[0]
            if stop and word in stopwords:
                    continue
            if contains_number(word):
                    continue
            word = stem(slugify(word))
            if len(word) < 2:
                continue
            words_processed.append(word)
        doc_sents.append(models.doc2vec.LabeledSentence(words_processed,[movie_id]))
        doc_words += words_processed 
    return (doc_sents, doc_words)

Now we do the actual processing. For topic detection we apply keyword extraction, as we want to use only use those words that carry meaning. We then apply normalization and transform the plots into lists of words, that will later be transformed into bags of words for topic detection.
For doc2vec we return lists of tagged sentences, as this is the input for a doc2vec training corpus.

In [202]:
def process_tm(row):
    """process a row of a dataframe for topic modeling"""
    return process(sentences(keywords((row))))[1]

In [None]:
imdb_plots_tm = imdb_df['plot'].apply(process_tm)

In [203]:
wiki_df = wiki_df[~ wiki_df['plot'].isnull()]
wiki_plots_tm = wiki_df['plot'].apply(process_tm)


In [209]:
def process_d2v(row):
    title = row[0]
    plot = row[1]
    return  process(sentences(plot), movie_id=title, stop=True)[0]

In [211]:
imdb_plots_d2v = imdb_df[['title','plot']].apply(process_d2v, axis=1)
#TODO

ValueError: could not broadcast input array from shape (3) into shape (2)

In [None]:
wiki_plots_d2v = wiki_df[['title','plot']].apply(process_d2v, axis=1)

### Example

In [107]:
print dftouse['plots'][2]
print
print documents_tm[2]
print
print documents_d2v[2]

['Jules is, self declared, the most useless person in the post apocalyptic world, until he finds an old film camera and determines to make the greatest movie in the new world... the only movie in the new world. But his first day filming is proving to be much more difficult than he imagined. Who knew that making a movie after the end of the world would be so hard?']

[u'post', u'apocalypt', u'world', u'day', u'film', u'film', u'camera', u'useless', u'person', u'self', u'declar', u'world', u'determin', u'movi', u'prove', u'jule', u'imagin', u'hard', u'difficult']

[TaggedDocument(words=[u'jule', u'self', u'declar', u'useless', u'person', u'post', u'apocalypt', u'world', u'find', u'old', u'film', u'camera', u'determin', u'make', u'greatest', u'movi', u'new', u'world', u'movi', u'new', u'world'], tags=['#1 at the Apocalypse Box Office (2015)']), TaggedDocument(words=[u'but', u'first', u'day', u'film', u'prove', u'much', u'difficult', u'imagin'], tags=['#1 at the Apocalypse Box Office (2015

### Save to disc

In [None]:
def save(docs, name):
    with open(dropbox + name + '.list', 'wb') as f:
        for plot in docs:
            f.write("%s\n" % plot)

In [206]:
#save the documents
#save(imdb_plots_tm , 'imdb_plots')
#save(wiki_plots_tm , 'wiki_plots_2000-2015')
with open(dropbox +'imdb_plots_d2v.pickle', 'wb') as f:
    np.save(f, documents_d2v.values)
with open(dropbox +'wiki_plots_d2v.pickle', 'wb') as f:
    np.save(f, documents_d2v.values)

In [114]:
#save the corresponding movies
movies = dftouse['title'].values
with open(dropbox +'imdb_movies_since_2014.pickle', 'wb') as f:
    np.save(f,movies)

### Testing

In [108]:
doc = """St. Johns, Newfoundland is a city with a deep musical spirit and few bands embody that spirit better that The Once. Named for a unique Newfoundland phrase that means "imminently," now is indeed their time. Live at the Stagehouse is an hour of interviews and performances by this amazing trio, featuring songs recorded live off the floor in late 2013 at the picturesque Stagehouse Recording Studio in St. Philip's. The Once embraces a different vision of Newfoundland music. Their sounds do not come from the noisy pubs and dockside taverns that fuel so much of the Island's energy. Instead, their music comes from a quieter and more thoughtful place. Hope and tragedy are intertwined, whether they are singing an old lament from World War I, original songs that speak of love defeated, or tasteful songs from the artists whose music inspires them. The Once: Live at the Stagehouse features some of their best-loved tunes and some never heard before. It is a unique opportunity to see one of Canada's fastest rising bands in the place that inspires them and feeds their soul."""

In [111]:
test = process(sentences(doc))
testpostags = process(sentences(doc),postags=['FW', 'JJ','JJR','JJS','NN','NNS','NNP', 'NNPS','VB','VBD','VBG','VBN','VBP','VBZ'])
testkeywords = process(sentences(keywords(doc)))

In [126]:
print test[1]
print
print testkeywords[1]
print
print testpostags[1]

[u'st', u'john', u'newfoundland', u'is', u'citi', u'with', u'deep', u'music', u'spirit', u'and', u'few', u'band', u'embodi', u'that', u'spirit', u'better', u'that', u'the', u'onc', u'name', u'for', u'uniqu', u'newfoundland', u'phrase', u'that', u'mean', u'immin', u'now', u'is', u'inde', u'their', u'time', u'live', u'at', u'the', u'stagehous', u'is', u'an', u'hour', u'of', u'interview', u'and', u'perform', u'by', u'thi', u'amaz', u'trio', u'featur', u'song', u'record', u'live', u'off', u'the', u'floor', u'in', u'late', u'at', u'the', u'picturesqu', u'stagehous', u'record', u'studio', u'in', u'st', u'philip', u'the', u'onc', u'embrac', u'differ', u'vision', u'of', u'newfoundland', u'music', u'their', u'sound', u'do', u'not', u'come', u'from', u'the', u'noisi', u'pub', u'and', u'docksid', u'tavern', u'that', u'fuel', u'so', u'much', u'of', u'the', u'island', u'energi', u'instead', u'their', u'music', u'come', u'from', u'quieter', u'and', u'more', u'thought', u'place', u'hope', u'and', u't