Idea
----

Crear un **corpus** en base a los comentarios de una lista de películas con temáticas diversas.
Por ejemplo: películas en Cannes o las mejores películas de un año.

Crear un vocabulario por película y conteo de palabras
* Palabras que aparezcan en al menos el 5% de los comentarios.
* Palabras que aparezcan en menos del 50% de los comentarios (para evitar palabras muy específicas de la película).
* Asociar cada palabra a la proporción de **comentarios** en los que aparece (y no al número de veces que aparece).

Unir vocabularios en **vocabulario_global**
* Colocar palabras que aparezcan en al menos el 5% de los vocabularios.
* Colocar palabras que aparezcan en menos del 20% de los vocabularios (para evitar palabras comunes a todas las películas, como "best movie", "worst movie", "waste time", etc.).

Para calcular el peso de una palabra p:
* Identificar máxima proporción de p en los vocabularios en los que aparece (e.g., 0.25)
* *p_norm*: Vector de proporciones normalizadas para que la máxima proporción sea 1.0 (e.g., multiplicando por 4)
* El peso debería ser **inversamente proporcional** a la suma de las proporciones normalizadas (e.g., *nb_movies/sum(p_norm)*). De esta manera, si la palabra aparece en proporción similar en todas las películas, tendrá un peso de 1.0 (el menor); el peso mayor (*nb_movies*) se obtiene cuando la palabra aparece en una sola película.


In [1]:
import time
import requests
from bs4 import BeautifulSoup

def Extract_UserReview(imdbID, tag, reviews, pages_per_movie):
    """Scrape IMDb user reviews for one title into ``reviews``.

    Pages of the IMDb ``reviews/_ajax`` endpoint are requested, sorted by
    submission date, following the ``data-key`` pagination key carried by
    each page's ``load-more-data`` element.  For every ``review-container``
    element one entry is appended to each of ``reviews.id``,
    ``reviews.rating``, ``reviews.title``, ``reviews.data`` and
    ``reviews.target``.

    Args:
        imdbID: IMDb title identifier, e.g. ``"tt1392190"``.
        tag: Label stored in ``reviews.target`` for every review.
        reviews: Object exposing the five list attributes named above; it is
            mutated in place.
        pages_per_movie: Maximum number of pages to request.

    Returns:
        The number of fetched pages that advertised a further page.  The page
        on which pagination ends is processed but not counted.
    """
    key = ""
    n = 0
    while n < pages_per_movie and key is not None:
        if key == "":
            url = 'https://www.imdb.com/title/{}/reviews/_ajax?sort=submissionDate'.format(imdbID)
        else:
            url = 'https://www.imdb.com/title/{}/reviews/_ajax?sort=submissionDate&paginationKey={}'.format(imdbID, key)

        data = requests.get(url)
        soup = BeautifulSoup(data.text, 'html.parser')

        for res in soup.find_all('div', {'class': 'review-container'}):
            # Search *inside* this container only.  find_next() walks the rest
            # of the document, so a review lacking a rating/title/text would
            # silently borrow the value of the following review.
            rating = -1  # sentinel for "review carries no rating"
            scale = res.find('span', {'class': 'point-scale'})
            if scale is not None:
                # The numeric score is the <span> right before the scale.
                score = scale.find_previous_sibling('span')
                if score is not None:
                    rating = int(score.text)
            title = res.find('a', {'class': 'title'})
            text = res.find('div', {'class': 'text'})

            # Append to all columns together so the lists stay aligned.
            reviews.id.append(imdbID)
            reviews.rating.append(rating)
            reviews.title.append(title.text if title is not None else '')
            reviews.data.append(text.text if text is not None else '')
            reviews.target.append(tag)

        loadmore = soup.find('div', {'class': 'load-more-data'})
        if loadmore is None:
            break
        key = loadmore['data-key']
        n += 1
        time.sleep(1)  # be polite to the server between pages

    return n

In [2]:
import time

def Extract_Reviews_List(imdb_list, reviews, pages_per_movie=1, n_movies=5):
    """Scrape user reviews for the titles of an IMDb list.

    The list is walked page by page; for each listed title a fresh
    ``Reviews`` object is stored in ``reviews`` under the title id and filled
    by ``Extract_UserReview``.

    Args:
        imdb_list: IMDb list identifier inserted into the list URL.
        reviews: Mapping that receives ``{title_id: Reviews}``; mutated in
            place.
        pages_per_movie: Maximum number of review pages fetched per title.
        n_movies: Total number of titles to process before stopping.

    Returns:
        None.  The function stops after ``n_movies`` titles or when a list
        page contains no titles.
    """
    n = 0  # titles processed so far, counted across all list pages
    i = 1
    while True:
        # The page number must be a separate query parameter ("&page=");
        # a second "?" would make it part of the "mode" value.
        url = 'https://www.imdb.com/list/{}/?sort=moviemeter,asc&st_dt=&mode=detail&page={}'.format(imdb_list, i)
        data = requests.get(url)
        soup = BeautifulSoup(data.text, 'html.parser')

        # find_all() returns an empty result (never None) when nothing
        # matches, so test for emptiness to detect the end of the list.
        movie = soup.find_all('div', {'class': 'lister-item-content'})
        if not movie:
            return
        for res in movie:
            time.sleep(1)  # throttle requests
            # The title id is the second-to-last path segment of the link.
            id_movie = res.h3.a["href"].split('/')[-2]
            print(id_movie)
            reviews[id_movie] = Reviews()
            Extract_UserReview(id_movie, id_movie, reviews[id_movie], pages_per_movie)
            n += 1
            if n == n_movies:
                return
        i += 1

In [3]:
class Reviews:
    """Column-wise store of scraped reviews.

    Every attribute is a list: ``id``, ``title``, ``data`` (review text),
    ``rating`` and ``target``.
    """

    def __init__(self):
        # Give each column its own, independent empty list.
        for column in ("id", "title", "data", "rating", "target"):
            setattr(self, column, [])
        
import csv
import io
# Load previously scraped reviews from data.csv, grouped by movie id.
# Columns read from each row: movie id, review title, review text, rating.
reviews = dict()
# newline="" lets the csv module do its own line-ending handling, which it
# needs for quoted fields that contain line breaks.  The handle is not called
# "input" so the builtin of that name stays usable.
with io.open("data.csv", encoding="utf-8", newline="") as csv_file:
    for row in csv.reader(csv_file):
        if not row:
            continue  # skip blank lines
        movie_id = row[0]
        if movie_id not in reviews:
            reviews[movie_id] = Reviews()
        movie = reviews[movie_id]
        movie.id.append(movie_id)
        movie.target.append(movie_id)  # the movie id doubles as the label
        movie.title.append(row[1])
        movie.data.append(row[2])
        movie.rating.append(float(row[3]))

In [56]:
import re
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def pre_process(texts):
    """Normalise raw review texts before vectorisation.

    For every text: isolated single letters are removed, runs of whitespace
    are collapsed, the text is lower-cased and each whitespace-separated
    token is passed through the WordNet lemmatizer.

    Args:
        texts: Iterable of strings.

    Returns:
        A list with one cleaned string per input text, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    documents = []
    for document in texts:
        # Remove single letters surrounded by whitespace.
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove a single letter at the very start of the text.  The caret
        # must be unescaped to anchor at the start; r'\^' would only match a
        # literal "^" character.
        document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

        # Collapse runs of whitespace into one space.
        document = re.sub(r'\s+', ' ', document)

        # Remove a leading "b" token.
        document = re.sub(r'^b\s+', '', document)

        document = document.lower()

        # NOTE(review): lemmatize() is called without a part-of-speech
        # argument; tokens such as "wa" and "ha" in the keyword output
        # suggest verbs are being treated as nouns -- confirm.
        tokens = [lemmatizer.lemmatize(word) for word in document.split()]
        documents.append(' '.join(tokens))

    return documents

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [230]:
#Creating the vocabulary and vectors
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Build one bag-of-words model per movie and, for each retained term, the
# share of that movie's reviews in which the term occurs.
english_stop_words = stopwords.words('english')  # loop-invariant: load once
for key, movie in reviews.items():
    docs = pre_process(movie.data)
    # binary=True records presence/absence per review, so column sums divided
    # by the number of reviews are proportions of reviews, not term counts.
    # NOTE(review): integer min_df/max_df are absolute review counts, whereas
    # the notes at the top of the notebook speak of 5% / 50% -- confirm which
    # is intended.
    movie.cv = CountVectorizer(binary=True, min_df=3, max_df=20, ngram_range=(1, 2), stop_words=english_stop_words)
    movie.counter = movie.cv.fit_transform(docs).toarray().sum(axis=0) / len(movie.data)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [235]:
#print(reviews["tt6751668"].cv.get_feature_names()[47])
#print(reviews["tt6751668"].counter)
import numpy as np
  
# Union of the vocabularies of all per-movie vectorizers.
global_vocab = set()
for movie in reviews.values():
    global_vocab = global_vocab.union(movie.cv.vocabulary_)

print(len(global_vocab))

# Weight every term that occurs in at least 2 and fewer than 20 movie
# vocabularies.  The per-movie proportions are scaled to unit Euclidean
# length and the weight is the reciprocal of their sum.
global_vocab2 = dict()
for word in global_vocab:
    arr = np.zeros(len(reviews))
    count = 0
    for i, movie in enumerate(reviews.values()):
        column = movie.cv.vocabulary_.get(word)
        if column is not None:
            count += 1
            arr[i] = movie.counter[column]

    if 2 <= count < 20:
        norm1 = arr / np.linalg.norm(arr)
        global_vocab2[word] = 1.0 / norm1.sum()

print (len(global_vocab2))
print (global_vocab2)

    
    

10761
2845


In [76]:
#PreProcessing

# One document per movie: all of its review texts joined into one string.
# y holds the matching movie ids, in the same order.
x = [' '.join(reviews[key].data) for key in reviews]
y = list(reviews)
docs = pre_process(x)  # TODO: optimise

In [77]:
#Creating the vocabulary and vectors
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


# Uni- and bi-gram counts over the per-movie documents.  Integer min_df /
# max_df keep terms found in at least 2 and at most 10 documents.
vectorizer = CountVectorizer(min_df=2, max_df=10, ngram_range=(1, 2), stop_words=stopwords.words('english'))
# Keep the counts sparse: the TF-IDF transformer accepts the sparse matrix
# directly, so an intermediate dense copy is unnecessary.
counts = vectorizer.fit_transform(docs)

# norm='l1' scales every row so that its weights sum to 1.
tfidfconverter = TfidfTransformer(norm='l1')
X = tfidfconverter.fit_transform(counts).toarray()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [78]:
# Shape of the dense TF-IDF matrix: (documents, vocabulary terms).
X.shape

(99, 59634)

In [None]:
def sort_coo(coo_matrix):
    """Return ``(column, value)`` pairs of a COO matrix, largest value first.

    Pairs are ordered by value and then by column index, both descending.
    Only the ``col`` and ``data`` attributes of ``coo_matrix`` are read.
    """
    def by_value_then_column(entry):
        column, value = entry
        return (value, column)

    entries = list(zip(coo_matrix.col, coo_matrix.data))
    entries.sort(key=by_value_then_column, reverse=True)
    return entries

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` scored features to their rounded scores.

    Args:
        feature_names: Sequence indexed by feature column.
        sorted_items: Sequence of ``(column, score)`` pairs, already ordered.
        topn: Number of leading pairs to keep.

    Returns:
        A dict ``{feature name: score rounded to 3 decimals}`` in the order
        of ``sorted_items``.  If a name occurs more than once among the kept
        pairs, the score of its last occurrence is stored.
    """
    return {
        feature_names[column]: round(weight, 3)
        for column, weight in sorted_items[:topn]
    }

In [80]:
# You only need to do this once.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out(); fall back to the old name when the
# installed release predates the new one.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# get the document that we want to extract keywords from
ms_slavic_7='Art is often a gateway for processing the complexities of life. It offers insight into lives and perspectives both familiar and foreign. However, how much do we truly learn about the artist from the things they leave behind? This is one of several questions explored in Sofia Bohdanowicz’s sparse, but effective, MS Slavic 7. \nIn her latest film Bohdanowicz uses her alter ego Audrey Benac (co-director Deragh Campbell who also gives a sensational turn this year in Anne at 13,000 ft) to explore the significance of correspondence of two famed Polish poets. One of which is Zofia Bohdanowiczowa — Bohdanowicz’s great-grandmother — who was displaced after World War II and ended up in Toronto. Zofia wrote several letters to fellow poet Józef Wittlin, who landed in New York, which touched on everything from the sense of isolation their situation caused to the nature of creativity.\nThe bulk of the film follows Audrey as she travels to Harvard, the title refers to the library catalogue the letters are in, to conduct research on the correspondence and gain greater insight into Zofia’s legacy. As Zofia’s literary executor, Audrey toys with the idea of curating the letters into some sort of exhibit. A concept that does not sit well with her aunt (Elizabeth Rucker) who seems determined to stifle every idea Audrey has.\nNavigating family and language barriers, Audrey’s quest to find meaning in the letters proves to be a far more challenging endeavour than she initially anticipates.Similar to its protagonist, one does not find a lot of answers in MS Slavic 7. The film’s sparse and repetitious nature leaves little room for deep fact finding. However, in its pondering of family and the importance of the letters, the film proves to be quite captivating.\nMS Slavic 7 is a work of art that one absorbs and meditates on. Bohdanowicz plays with the cinematic medium in ways that subvert typical conventions. 
Rather than having Audrey and others read countless letters aloud, Bohdanowicz displays select sentences on screen and lets the sounds of Audrey turning pages and scribbling in a book fill the room. She examines envelopes with the same linger patience of an archeologist at an excavation site. When the audience finally gets to hear one of the letters in full it comes after a rather humorous and unexpected moment.\nHow much Audrey, and by extension Bohdanowicz, learns about Zofia is questionable. However, that is the point. The film challenges one to ponder the nature of connection and correspondence. Can we ever truly know someone through mere fragments of their life? MS Slavic 7 is content with creatively posing the questions, wisely leaving it up to viewers to decipher what it all means.\nSofia Bohdanowicz is a Torontonian whose slender films walk the lines between amateur and professional, fiction and documentary, and which share a common theme of youngsters calling back to previous generations in a quest for nourishment. Her most well-known previous movie, 2017’s Maison du bonheur, was a 62-minute tour of pleasures shot on warm 16mm, in which Bohdanowicz herself visited a friend’s septuagenarian aunt in Paris and basked in the contact high of her hortensia garden, challah dough and pearls of wisdom.\nMS Slavic 7, which clocks in at 64 minutes, is shot digitally and is less a compendium of comforts, but traverses a similar path of searching. The title refers to the library call number of an archived series of letters between Bohdanowicz’s great-grandmother, the poet Zofia Bohdanowiczowa, and the Nobel Prize-nominated author Józef Wittlin. 
Both had been displaced from Poland during WWII and shared a correspondence between 1957 and 1994, when Bohdanowiczowa lived first in Penrhos, Wales, and later in Toronto, while Wittlin settled in New York City.\nSearching through the letters is Audrey, a character who made appearances in Bohdanowicz’s feature Never Eat Alone (2016) and short Veslemøy’s Song (2018). Originally intended as an analogue for the filmmaker, Audrey has subsequently absorbed characteristics of the actor playing her, Deragh Campbell, as well as generating something of a life of her own. The complication between what constitutes imagination and reality is a curiosity that the movie takes in its stride, opening with title cards explaining the factual context of the letters, continuing with documentary-like scenes and then clearly moving into staged dramatic moments.\nAudrey discovers through the letters that Zofia didn’t want Józef to stay in New York; she believed “you’re smashed by the city,” that urban living was “the utmost evil of our times” and that he could do his best work in the countryside. As might be expected from two wordsmiths cast adrift by the turbulence of war, the letters are articulately suffused with emotion: “We met on the day of atonement when the sky swelled with the souls of Israel,” writes Wittlin. “Being here is like being in a hearth in God’s house,” observes Bohdanowiczowa.\nThe young protagonist, meanwhile, unpacks her belongings in an anodyne hotel room whose main lifelines are a pod-powered coffee-maker and shop-bought bottles of wine. 
The letters provide a template of perseverance and authenticity, and in a protracted stream of consciousness, Audrey grapples with their power: “There’s something heartbreakingly desperate about how there’s intention in the very meaning of a letter… a letter is so completely tied to its objecthood, that reinforces the content.” In another lovely scene, Audrey and a translator reflect on how Bohdanowiczowa gave Wittlin the nickname ‘mint’, in the sentence, “I feel for you, mint.” It’s a herb that cuts through other flavours, reflects the translator – there’s an indomitable quality to its aromatics.\nThere’s also a villain to this piece, Audrey’s ridiculous aunt, an uptight fusspot in the crossover demographic between Boomer and Gen Xer, who lectures our hero about meddling with family archives. “You can’t just be a curator, you need a Master’s degree, and training,” chides the aunt, who also tells a self-pitying story revealing her own incompetence, about how she laid down to rest in Berlin’s Holocaust Memorial – not only not a restful place, but a notorious spot for pickpockets – only to find herself robbed by teenagers.\nThe film indeed has strange asides of humour – an anal jobsworth warns in monotone than no pens must be taken into the library; the emotional scene with the translator cuts to a punchline where he and Audrey are in a decidedly more lighthearted entanglement. Between these, the soulful gravity of the letters and the dismissiveness of the aunt, MS Slavic 7 becomes a story of pilgrimage to inspiration in a contemporary milieu whose surface would otherwise offer only demotivation.\nIn this sense, Bohdanowicz is a quintessentially millennial filmmaker, depicting a generation whose struggles with ‘adulting’ have launched infinite thinkpieces, and affirming there are solid role models to hang onto. MS Slavic 7 is an odd duck that embodies both disarming sweetness and lurching awkwardness, which perhaps befits a tale of personal discovery. 
Along with the likes of Dan Sallitt’s Fourteen, another Berlinale highlight, it represents the ranks of small-scale North American filmmakers with distinct points of view and growing bodies of work, who may well be on the verge of a breakthrough in finding new audiences."MS Slavic 7, the new film from directors Sofia Bohdanowicz and Deragh Campbell, is deceptively small and inanimate. Its story is based on the existence of real correspondence between Bohdanowicz’s great-grandmother Zofia and the poet and Nobel Prize nominee Józef Wittlin. The inherent limitations of the film’s focus are due in part to budget constraints, but that’s a reductive complaint; what’s done within those limits is what makes the film feel as substantial as it does.Toronto-based Audrey (Campbell)—as a stand-in for Bohdanowicz—visits Houghton Library at Harvard for a chance to study the aforementioned letters. At first, her examination appears to focus entirely on their tactile existence, their touch and appearance, even the sounds they make while handled. Through clever use of subtitles and Audrey’s halting explanations, however, it becomes clear there’s more at work here, philosophically and emotionally; the letters begin to transcend their humble objecthood. MS Slavic 7 juxtaposes this inner journey with Audrey’s outer struggle to gain control of the material from her family, namely her aunt Ania (Elizabeth Rucker). This appears as a provincial problem, one that frustrates Audrey as she travels, literally and figuratively, through time and space on her search. The modest tension of the film, such as it is, comes easily through its present-day back and forths, animated capably by Campbell, yet it’s Zofia’s heartrending missives, unchanged after decades, that linger.In this realm of pure language, MS Slavic 7 is able to explore longing, sadness, dislocation, and the very means by which we grasp to articulate these and other feelings to each other. 
As it arrives at these points, suddenly Bohdanowicz and Campbell’s film does not seem so small after all. And while a glance at any single frame doesn’t necessarily reveal much emotive power, the film’s very minimalism—subtitles overlaid on silence, still long takes, and other bits of fine-grained minutia—achieves new levels of meaning.There is a line found in the correspondence between Polish poet Zofia Bohdanowiczowa and Polish author Jozef Wittlin, where one says to the other, “sit in a quiet corner and enjoy your soul.” The filmmaker tandem of Sofia Bohdanowicz (great-granddaughter of the aforementioned poet) and Deragh Campbell take that direction to heart by centering the majority of MS Slavic 7 around that exact piece of advice.The film’s title doubles as the call code at the Houghton Library at Harvard University for a collection of letters written from Bohdanowiczowa to Wittlin between the years of 1957 to 1964. Both writers were displaced during WWII, with Zofia relocating to Toronto by way of Wales and Jozef living in New York City. Literary executor, Audrey Benac (played by Campbell as a surrogate for her co-director), gains access these documents in part because it is her responsibility (one that she takes very seriously) but more so because the sheer act of rifling through these papers brings her great pleasure.A small room at the Houghton Library is her quiet corner, and her solitary pursuits of genealogical academia seem to evoke an immense joy in her life. You can tell in the way she delicately folds and unfolds each letter with the lightest of touches, exclusive fingertip holds reverse-engineering each step in the preparation for its temporary home. 
She is just as enamored with the tangible aspects of the letters as she is with their contents, treating them like ancient artifacts from the beginning of time.Audrey’s obsession with the letters extends into quasi-interviews that she partakes in at a local coffee shop, where she not only breaks down the writing and the repetitive symbolism used throughout but she also delves deep into the inherent qualities of a letter as an object. Verbalizing their essence to an unknown individual, she wanders out into the weeds thinking her way through their importance. Every facet of these letters has become her life — so much so, that it has become the only way in which she can really connect with other people.Bohdanowicz and Campbell accentuate the peacefulness of Audrey’s endeavor as well as her infatuation with the articles themselves by focusing the camera in close, most of the frame devoted to Campbell’s hands gingerly manipulating the pages. They are drawing our focus in tandem with her focus, putting the viewer in Audrey’s headspace. Alongside this narrow focus is the lack of any accompanying audible translations of the text, with no voiceover recitals of their contents. Instead, Bohdanowicz and Campbell go the route of having on-screen subtitles appear at the bottom of the frame, translating particular excerpts as if presenting Audrey’s internal dialogue telepathically to the surface.It is easy to see why Audrey prefers the peaceful confines of an archive surrounded by the intermittent rustling of paper. Much of her interactions, which mostly take place at a relative’s 60th wedding anniversary, are mired in frustration and an inability to connect or communicate the importance of what she is doing. 
Her goal of someday public exhibiting her great-grandmother’s correspondence is met with skepticism and concern.Bohdanowicz and Campbell have collaborated on a small-scale docufiction that showcases the ability for individuals to partially find themselves through a connection with the past, especially when that individual comes from a family that is countryless, so to speak. Audrey is able to find a kindred spirit in her great-grandmother through her words as speaks about being without a homeland. So too does Audrey begin to understand her inability to connect with her Polish family. The of being uprooted  and the lack of connection that comes with it have been passed down through each generation, and Zofia Bohdanowiczowa’s words on the matter seem to be a great solace.'

winter_woman = []
winter_woman.append('Based on a serial novel by Cho Hae-il, “Winter Woman” deals with the sexual awakening of Yi-hwa, the daughter of a prosperous Christian preacher who has been raised to be morally and sexually conservative. The book and film earned the condemnation of conservative critics, however the author’s leftist subtext went unchallenged overshadowed by the sexual themes. The film was the best selling Korean film of the 1970s and made a star of its female lead, Chang Mi-hee.')
winter_woman.append('In 1970s South Korea, a new type of film emerged as a result of the relaxation of anti-obscenity censorship laws and the emergence of directors that were part of the post-liberation generation. The films often focused on hostesses (a type of bargirl or prostitute) and dealt with taboo topics like sexuality. Winter Woman (1977) doesn’t actually feature a hostess in the main role, but is still considered part of this wave of so-called hostess melodramas and is the most successful work in the genre; until 1990, it was the most highly sold domestic film in South Korea. Despite this, it appears to be little known outside of the country. The film centers on a young woman, Ie-hwa, who was raised in a middle-class, conservatively Christian family, and how her relationships with three men lead to her sexual awakening. It is worth noting that the film is based on the eponymous novel by Cho Hae-il, however, judging from the descriptions and analyses of the novel that I have read, they differ quite a bit in plot. The book appears to be a muddled Marxist allegory story in which the main character starts offering her body for others to use for free after being raped because she acquires class consciousness, while the film is a far more unequivocal coming-of-age tale. Although the criticism of the family unit is present in both works, it seems less pronounced in the adaptation. ”Meekness” is a fitting word to describe Ie-hwa. She could have been so much more, but for the most part, her personality is largely defined by the three men in her lives. Out of the three men involved with her, only one of them is really amiable; the stalker is a lowlife goon who deserves no sympathy, and the university student is a thoroughly mediocre individual. On the other hand, the teacher (played wonderfully by Shin Seong-il) is quite likable; his arc is the best part of the story. 
The film also features some nudity (rather tame by modern standards, but when Winter Woman first came out, it made conservative critics shriek in horror) alongside a few beautiful landscape shots and strange dream sequences which I desperately wish there had been more of. Despite not being a masterpiece, Winter Woman is far from bad. It suffers from having a meek protagonist and a first half that is dragged down by two unlikable men, but the second half makes up for it. Ultimately, is very interesting to see how South Korean cinema has evolved overtime (and what South Korea looked like during the 1970s!) and compare this film to works which are considered risqué nowadays.')
winter_woman.append('Solid on the technical level, but the rather tame story is a definite product of and specific to its times. Some nerve-wracking balcony climbing though, so theres that.')
winter_woman.append('Presents us with three major episodes in the coming of age of a Korean woman: the initial, halting awakening of romantic feelings; the doomed first true love; and her, um, "liberated" phase, where she throws herself at her (much older) school teacher. None of these sequences work: the first fails to trigger the desired emotional response because it’s shot like horror; the second is a maudlin affair dominated by pretty landscape shots and a surge in the soundtrack, and the final stretch is downright creepy, the actors doing their best with softcore lines like ‘Teacher, I’ll be your lover.’ As each scenario is defined almost exclusively by our protagonist’s relationship to a man, and none meaningfully informs any other, she remains a blank slate throughout, with no discernible character arc. Directed by Kim Ho-sun, who made Yeongja’s Heydays, it’s almost a thematic cousin to that work, but (it saddens me to say) it tarnishes the earlier film by comparison, and has none of its sharp eye for dissecting power structures of 70s Seoul.')

olla = []
olla.append('Different meanings and interpretations are possible. The main actress plays very well and excels at alternating between timorous and wanton behaviour. Some scenes made me laugh out loud. All in all, the overall atmosphere of the movie is a bit strange but I didnt regret watching it.')
olla.append('A gentle slight movie, highlight Olla a internet "bride" from the internet who is brought over from Eastern Europe to the still back waters of a French suburb to look after Pierre and his elderly mother. Things dont go as planned, or do they. Depending on whose perspective you watch thx film. Pierre or Olla.')
olla.append('No storyline, no acting, no directorial sense. Waste of 27 minutes of time. Theres no plot. Just random scenes clubbed together.')
olla.append('It follows Olla, a woman from eastern Europe, who responds to an advertisement on a dating site. She moves in with Pierre, who lives with his old mother in a suburb in France. But nothing goes as planned.')
olla.append('This half-hour short film by Ariane Labed is a sparkily absurdist declaration of female independence that bucks against male fantasy. Romanna Lobach, blood-orange tresses blazing through the mist as she walks across a field in the opening shot, is the Russian mail-order bride shipped in by Pierre (Grégoire Tachnakian), a French bachelor living in a time-warp suburban house with his infirm mother. Fawning over his bride across the kitchen table, he christens her Lola and sets her to work as nursemaid. Labed – who also acted in her husband Yorgos Lanthimos’s projects Alps, Attenberg and The Lobster – favours the medium shot, all the better for deadpan dispatch of character. Lola and Pierre shuffle around the living room in fabric shoe covers to walk and polish at the same time; she bonds with his mum by playing with the remote-control footrest on the pensioner’s La-Z-Boy recliner. The film builds up a nice head of quirk, underpinned by Lola’s outsider status. Even the many-headed hydra of estate ne’er-do-wells who greet her as one – “Dirty slut!” – as she trots past in ankle boots seems strangely endearing. The fantasy turns sour after Lola gives a Nancy Reagan-style makeover to the old lady. Pierre’s displeased reaction pushes his bride to reclaim her real name and begin venting. Labed’s comic inflections degenerate into hysteria and violence – especially in the aftermath of one key dance scene – in a controlled manner that promises well for any full-length work she might have en route. The vermillion shade of Lola’s hair continues to bloom on the colour palette – in bathroom tiles, sunset glints on windows, her supermarket carrier bag – as if in solidarity.')


#kansas_reviews = Reviews()
#Extract_UserReview("tt0044789", "kansas", kansas_reviews, 2)

#fear_desire = Reviews()
#Extract_UserReview("tt0045758", "kubrick", fear_desire, 4)

#etre_avoir = Reviews()
#Extract_UserReview("tt0318202", "etre_avoir", etre_avoir, 4)

#monos = Reviews()
#Extract_UserReview("tt6062774", "monos", monos, 10)

#gabriel = Reviews()
#Extract_UserReview("tt6805354", "gabriel", gabriel, 10)

#parasite = Reviews()
#Extract_UserReview("tt6751668", "parasite", parasite, 10)
 
#into_the_wild = Reviews()
#Extract_UserReview("tt0758758", "into_the_wild", into_the_wild, 10)

#marx_brothers = Reviews()
#Extract_UserReview("tt0020640", "marx_brothers", marx_brothers, 10)

rev = Reviews()
Extract_UserReview("tt1392190", "mad_max", rev, 10)

# Clean the fetched reviews, project them onto the fitted vocabulary and
# weight them with the fitted TF-IDF model (one row per review).
review2 = pre_process(rev.data)
review_counts = vectorizer.transform(review2)
tf_idf_vector = tfidfconverter.transform(review_counts)

# Rank the stored (column, score) entries, highest score first.
# NOTE(review): entries of all reviews are ranked together and the result is
# keyed by term, so a term's score comes from a single review -- confirm
# this is the intended aggregation.
sorted_items = sort_coo(tf_idf_vector.tocoo())

# Keep the 100 best-scoring entries.
keywords = extract_topn_from_vector(feature_names, sorted_items, 100)

#print(keywords["letter"])
#print(keywords["library"])
#print(keywords["documentary"])
#print(keywords["strange"])
#print(keywords["research"])
#print(keywords["work art"])
#print(keywords["coming age"])

#for rev in review2:
#    print ([rev[m.start()-40:m.end()+40] for m in re.finditer('unpredictable', rev)])

# Print each keyword with its score.
print("\n===Keywords===")
for term, score in keywords.items():
    print(term, score)
    




===Keywords===
say watch 1.0
old one 1.0
ok great 1.0
movie clearly 1.0
movie action 1.0
good interesting 1.0
giving 10 1.0
film perfect 1.0
dick 1.0
could hardly 1.0
better action 1.0
action movies 0.5
jumping around 0.584
like costume 0.577
quality get 0.561
made hour 0.561
movie art 0.555
made perfect 0.553
amazing action 0.553
perfect perfect 0.552
costume music 0.552
character evolution 0.53
action action 0.53
lil 0.524
people going 0.518
plot well 0.514
everything need 0.509
hollywood ha 0.5
hollyweird 0.5
best action 0.5
part well 0.491
time despite 0.486
milk 0.482
good also 0.476
without plot 0.47
action plot 0.47
keep hooked 0.448
describe movie 0.448
wa beautifully 0.447
perfect every 0.447
feast 0.445
bearable 0.439
40 year 0.439
lot film 0.423
insanity 0.416
chick 0.414
hoarse 0.388
movie decade 0.381
editing cinematography 0.38
utter crap 0.378
actor actresses 0.378
contemplate 0.377
film happy 0.361
every action 0.359
favourite actor 0.354
whole set 0.351
hardy 0.273
co

In [41]:
import numpy as np
def get_similarity(data1, data2, vectorizer, tfidfconverter):
    """Score how similar two collections of texts are.

    Each collection is pre-processed, vectorised and TF-IDF weighted; the
    weights are then averaged over the texts of the collection, giving one
    mean weight per vocabulary term.  The score is the sum over all terms of
    ``sqrt(mean1 * mean2)``.

    Args:
        data1: Iterable of raw texts (first collection).
        data2: Iterable of raw texts (second collection).
        vectorizer: Fitted object whose ``transform`` maps texts to term
            counts.
        tfidfconverter: Fitted object whose ``transform`` maps term counts
            to TF-IDF weights.

    Returns:
        The similarity score as a NumPy float.
    """
    def mean_tfidf(texts):
        # Column sums work on the sparse result directly, so the full
        # (texts x terms) matrix never has to be densified.
        weights = tfidfconverter.transform(vectorizer.transform(pre_process(texts)))
        return np.asarray(weights.sum(axis=0)).ravel() / weights.shape[0]

    # Vectorised replacement for a per-term Python loop.
    return np.sqrt(mean_tfidf(data1) * mean_tfidf(data2)).sum()

In [83]:
revs = Reviews()
Extract_UserReview("tt0364569", "-", revs, 5)
# Compare the freshly scraped title with every movie of the corpus.  The loop
# uses its own names so the module-level `rev` object is not overwritten by
# a movie id string.
for movie_id, movie_reviews in reviews.items():
    print(movie_id, ":", get_similarity(revs.data, movie_reviews.data, vectorizer, tfidfconverter))

tt6751668 : 0.06645980166445761
tt9368628 : 0.01748071408315074
tt7670212 : 0.03798650612255568
tt8649186 : 0.023578637347627513
tt8228288 : 0.03789343155235988
tt7286456 : 0.028946205077731085
tt0437086 : 0.039623401799538674
tt7366338 : 0.02435856448029495
tt1190634 : 0.024541731096271602
tt8772296 : 0.023133454862043594
tt6156138 : 0.02493636313876226
tt6611916 : 0.05094887733498615
tt1950186 : 0.02562913193576093
tt5593384 : 0.011669106943187374
tt10199590 : 0.021073960017736374
tt8991268 : 0.019733101638935456
tt9074574 : 0.008809655141395418
tt7131622 : 0.04252670618730704
tt8579674 : 0.04101074348686164
tt5727208 : 0.020914711223168368
tt6292852 : 0.032176186103997126
tt8739752 : 0.015940836234573635
tt6821044 : 0.01967417849346166
tt7139936 : 0.030146055147983547
tt9185066 : 0.021764313118711204
tt8535968 : 0.01904205354254929
tt4648786 : 0.03993015320085286
tt8526872 : 0.025901884871303994
tt3513548 : 0.0317769316045726
tt5606664 : 0.036545878320969456
tt9561862 : 0.0394723043