# Content-Based Movie Recommendation

Dataset Source:

https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots/

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_csv("../data/wiki_movie_plots_deduped.csv")
df.shape

(34886, 8)

In [4]:
df.sample(5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
5307,1951,The Big Night,American,Joseph Losey,"John Drew Barrymore, Dorothy Comingore",film noir,https://en.wikipedia.org/wiki/The_Big_Night_(1...,"On his teenaged son Georgie's birthday, Andy L..."
3302,1942,Flying Tigers,American,David Miller,John Wayne,war,https://en.wikipedia.org/wiki/Flying_Tigers_(f...,Jim Gordon (John Wayne in his first war film) ...
16548,2013,Her,American,Spike Jonze,"Joaquin Phoenix, Amy Adams, Rooney Mara, Scarl...",romance,https://en.wikipedia.org/wiki/Her_(film),"In a near future Los Angeles, Theodore Twombly..."
4283,1946,The Mask of Diijon,American,Lew Landers,"Erich von Stroheim, Jeanne Bates",drama,https://en.wikipedia.org/wiki/The_Mask_of_Diijon,"Diijon, a tired magician, gives up his act to ..."
29973,1992,Onna Irukka Kathukanum,Tamil,V. Shekar,"Sivakumar, Jeeva, Goundamani",unknown,https://en.wikipedia.org/wiki/Onna_Irukka_Kath...,The story begins with a group of government of...


## Preprocessing

### Converting to String

In [5]:
df.columns

Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'],
      dtype='object')

In [6]:
df = df.astype(str)
df.dtypes

Release Year        object
Title               object
Origin/Ethnicity    object
Director            object
Cast                object
Genre               object
Wiki Page           object
Plot                object
dtype: object

### Data Cleaning

Action Items:

* Lower-Case the whole data frame
* Director: Removing 'Director:' and 'Cast:'
* Director, Cast: Removing '\r\n', '\n' and '\r'

* Genre: Replacing '/' with Space
* Director, Cast, Genre: Removing 'Uknonwn' and 'Nan'

* Director: Separating Directors and Actors names
* Director, Cast: Checking if the names are separated with ' and ', ' & '

* Director, Cast: Merging the first names and last names together
* Director, Cast: Adding the words of 'Director' and 'Actor' as prefix

* Plot: Removing English Stopwords
* Doc: Removing special characters

In [7]:
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [8]:
df["Director"] = df["Director"].str.replace("director:", "", regex=False)
df["Director"] = df["Director"].str.replace("cast:", "", regex=False)

df["Director"] = df["Director"].str.replace("\r\n", " ", regex=False)
df["Cast"] = df["Cast"].str.replace("\r\n", " ", regex=False)

df["Director"] = df["Director"].str.replace("\n", " ", regex=False)
df["Cast"] = df["Cast"].str.replace("\n", " ", regex=False)

df["Genre"] = df["Genre"].str.replace("/", " ", regex=False)

df["Director"] = df["Director"].str.replace("unknown", "", regex=False)
df["Cast"] = df["Cast"].str.replace("unknown", "", regex=False)
df["Genre"] = df["Genre"].str.replace("unknown", "", regex=False)

df["Director"] = df["Director"].str.replace("nan", "", regex=False)
df["Cast"] = df["Cast"].str.replace("nan", "", regex=False)
df["Genre"] = df["Genre"].str.replace("nan", "", regex=False)


df["Director"] = df["Director"].str.replace(" and ", ",", regex=False)
df["Cast"] = df["Cast"].str.replace(" and ", ",", regex=False)
df["Director"] = df["Director"].str.replace(" & ", ",", regex=False)
df["Cast"] = df["Cast"].str.replace(" & ", ",", regex=False)


df["Director"] = df["Director"].str.replace(" ", "", regex=False)
df["Cast"] = df["Cast"].str.replace(" ", "", regex=False)

df["Director"] = df["Director"].str.replace(",", " ", regex=False)
df["Cast"] = df["Cast"].str.replace(",", " ", regex=False)


In [9]:
df["Director"] = np.where(df["Director"].str.len() > 0,
                          'director' + df["Director"],
                          df["Director"])

df["Cast"] = np.where(df["Cast"].str.len() > 0,
                      'actor' + df["Cast"],
                      df["Cast"])

df["Director"] = df["Director"].str.replace(" ", " director", regex=False)
df["Cast"] = df["Cast"].str.replace(" ", " actor", regex=False)


In [10]:
df.sample(5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
26026,2000,baaghi,bollywood,directorrajeshkumarsingh,actorsunildutt actormanishakoirala,action,https://en.wikipedia.org/wiki/baaghi_(2000_film),raja (sanjay dutt) is in love with a night clu...
27449,2012,rambo,kannada,directorm.s.sreenath,actorsharan actormadhuri actordeepika actorshr...,comedy,https://en.wikipedia.org/wiki/rambo_(2012_film),‘rambo’ is a film that revolves around a trick...
30429,2001,kutty,tamil,directorjanakivishwanathan,actorp.shwetha actorramesharavind actoreaswarirao,,https://en.wikipedia.org/wiki/kutty_(2001_film),the film revolves around a young girl who is f...
15457,2008,the house bunny,american,directorfredwolf,actorannafaris actoremmastone actorcolinhanks ...,comedy,https://en.wikipedia.org/wiki/the_house_bunny,shelley darlington (anna faris) is an aspiring...
13681,2000,lost souls,american,directorjanuszkamiński,actorwinonaryder actorbenchaplin,horror,https://en.wikipedia.org/wiki/lost_souls_(film),a small group of fervent roman catholics belie...


### Merging the document

In [11]:
column_weights = {"Release Year": 10,
                  "Title": 1,
                  "Origin/Ethnicity": 5,
                  "Director": 5,
                  "Cast": 1,
                  "Genre": 10,
                  "Plot": 1}

df["doc"] = ""

for col in column_weights.keys():
    df["doc"] += column_weights[col] * (df[col] + ' ')

df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,doc
0,1901,kansas saloon smashers,american,,,,https://en.wikipedia.org/wiki/kansas_saloon_sm...,"a bartender is working at a saloon, serving dr...",1901 1901 1901 1901 1901 1901 1901 1901 1901 1...
1,1901,love by the light of the moon,american,,,,https://en.wikipedia.org/wiki/love_by_the_ligh...,"the moon, painted with a smiling face hangs ov...",1901 1901 1901 1901 1901 1901 1901 1901 1901 1...
2,1901,the martyred presidents,american,,,,https://en.wikipedia.org/wiki/the_martyred_pre...,"the film, just over a minute long, is composed...",1901 1901 1901 1901 1901 1901 1901 1901 1901 1...
3,1901,"terrible teddy, the grizzly king",american,,,,"https://en.wikipedia.org/wiki/terrible_teddy,_...",lasting just 61 seconds and consisting of two ...,1901 1901 1901 1901 1901 1901 1901 1901 1901 1...
4,1902,jack and the beanstalk,american,directorgeorges.fleming directoredwins.porter,,,https://en.wikipedia.org/wiki/jack_and_the_bea...,the earliest known adaptation of the classic f...,1902 1902 1902 1902 1902 1902 1902 1902 1902 1...


### Removing Special Characters

In [31]:
df["doc"] = df["doc"].str.replace("[^a-z 0-9]+", "", regex=True)

In [32]:
pd.set_option('display.max_colwidth', None)
df[["doc"]].sample(10)

Unnamed: 0,doc
9436,1977 1977 1977 1977 1977 1977 1977 1977 1977 1977 viva knievel american american american american american directorgordondouglas directorgordondouglas directorgordondouglas directorgordondouglas directorgordondouglas actorevelknievel actorlaurenhutton actorgenekelly action biography action biography action biography action biography action biography action biography action biography action biography action biography action biography daredevil motorcycle rider evel knievel stars as himself in this fictional story the film opens with knievel sneaking into an orphanage late at night to deliver presents evel knievel action figures one of the boys casts away his crutches telling knievel that hell walk after his accident just as knievel hadknievel then prepares for another of his stunt jumps we are introduced to his alcoholic mechanic will atkins gene kelly who was a former stunt rider himself before his wife died driving him to drink while signing autographs knievel is ambushed by feminist photojournalist kate morgan lauren hutton who has been sent to photograph the jump if knievel is killed it will be a great storyas it happens evel does crash while attempting the stunt and though badly injured survives he berates morgan announces his retirement and is taken to the hospitalwhile rehabilitating knievel resists all attempts to get back on the horse including those from jessie marjoe gortner a former protg with mysterious backers who want evel to do a jump in mexico eventually though knievel relents and agreesa subplot develops when wills estranged son tommy shows up from boarding school and asks to join the tour will who is reminded of his dead wife is cold to tommy leaving knievel to show the boy kindness likewise kate reappears apologetic for her previous motives and now wishes that he will never stop jumpingmeanwhile jessies benefactor is revealed drug lord stanley millard leslie nielsen millard without jessies knowledge plans to cause a fatal accident during the jump he will then have knievels body transported back to america in an exact duplicate of the tour trailer but one that has a massive supply of drugs hidden in the wallswill however stumbles onto the plot is drugged and sent to a psychiatric ward under the control of the corrupt ralph thompson dabney coleman to prevent him from spilling the beans evel sneaks into the ward late at night when will has dried out but all will can remember is that someone knocked him out knievel leaves him there to keep whoever is behind the plot in the darkas knievel prepares for the jump down a massive ramp and over a fire pit jessiehopped up on drugsconfronts evel claiming that he will prove who the best jumper is jessie knocks evel out and dresses in knievels signature red white and blue outfit jessie then successfully makes the jump however the bike has been sabotaged and he is killed as he lands footage from a real knievel crash was used while the body is taken away for the drug smuggling plot evel wakes up gets on another bike and goes to free willafter breaking out of the psych ward the two find the mockup trailer in which by an amazing coincidence both tommy and kate have been taken hostage pursuing the truck will and evel decide to split up will will disable the semi evel will lead off the guntoting drug lords riding guard in another carat the end of several extended chase scenes the drug lords are defeated will and his son are reunited and kate has fallen head over heels for knievel the film ends with knievel performing a daredevil jump over a pit of fire this time successfullythe end jump is stopped in a freezeframe shot and a color matte similar to that of the one that appears in the opening credits appears over evel in midair the song that plays over the opening credits also plays over the films end credits
1085,1931 1931 1931 1931 1931 1931 1931 1931 1931 1931 charlie chan carries on american american american american american directorhamiltonmacfadden directorhamiltonmacfadden directorhamiltonmacfadden directorhamiltonmacfadden directorhamiltonmacfadden actorwarneroland actorjohngarrick mystery mystery mystery mystery mystery mystery mystery mystery mystery mystery charlie chan tries to solve the murder of a wealthy american found dead in a london hotel room settings include london nice france san remo honolulu and hong kong
1858,1935 1935 1935 1935 1935 1935 1935 1935 1935 1935 china seas american american american american american directortaygarnett directortaygarnett directortaygarnett directortaygarnett directortaygarnett actorclarkgable actorjeanharlow actorwallacebeery adventure adventure adventure adventure adventure adventure adventure adventure adventure adventure alan gaskell clark gable is an abrasive gambling captain of a tramp steamer the kin lu chugging between singapore and hong kong tensions are high before the kin lu sails from hong kong because pirates are discovered disguised as women passengers while others try to smuggle weapons aboarddolly portland jean harlow is alans former girlfriend who alan later describes at the captains table as a professional entertainer and travels with her maid meanwhile another of alans former loves aristocratic sybil barclay rosalind russell from sussex england boards the kin lu i am in your hands again barclay taunts gaskell and eventually they plan to marry when the steamer docks in singapore however dolly tries to win back alan meanwhile jamesy mcardle wallace beery is a corrupt passenger in league with a gang of pirates who plan to steal the gold shipment of gbp250000 gold bullion being carried on the steamer3 portland discovers the plot and attempts to warn capt gaskell against mcardle but he deflects her warningsin calm seas following a typhoon in which the ship suffered damage to its cargo and the deaths of some crew the kin lu is boarded by malay pirates as mcardle expected and with whom he is in alliance the pirates steal personal possessions from passengers unable to find gold in the ships strongbox which capt gaskell has replaced with sand they torture capt gaskell using a malay boot but the captain will not reveal the golds location instead with bravado gaskell instructs the pirates as they prepare to torture him my size is 9c before fainting from pain while leaving the ship minus the gold they intended to steal the pirates ship is bombed by a passenger who commits suicide using a mills bomb as a grenade and later strafed by capt gaskell their ship sinks in the china seasfrustrated by the failed robbery mcardle commits suicide when the kin lu docks in singapore captain gaskell still limping due to his torture settles that his love for sybil is superficial instead he recognises that dolly gave him good warning and he loves her more they decide to marry he says farewell to sybil as the film closes capt gaskell reveals the gold was safe all along hidden inside the ships cargo
28974,1948 1948 1948 1948 1948 1948 1948 1948 1948 1948 gokuladasi tamil tamil tamil tamil tamil directorksubramaniam directorksubramaniam directorksubramaniam directorksubramaniam directorksubramaniam actorchonnappabhagavathar actormvrajamma actortrramachandran actornkrishnamurthi actorsowdhamini actorrmsomasundaram actornthiruvengadam actorksangamuthu actorssarojini actorbabysathyavathi actordancebylalitha actorpadmini kamaroopan honnappa bhagavathar is a lustful king who faces the curse of goddess parvati and is reborn as a common jeweller in the town of gokulam the jeweller happens to set his eyes on a devadasi named anuradha rajamma and tries to woo her anuradha on the other hand was a singerdancer in her previous birth and a devotee of parvati she was cursed by sage narada honnappa bhagavathar for displeasing him and is reborn as anuradha the jeweller keeps trying to seduce anuradha not realising the actions he had committed in his previous birth to settle the matter krishna padmini rids both the jeweller and the devadasi of their curses2
20562,1992 1992 1992 1992 1992 1992 1992 1992 1992 1992 the crying game british british british british british directorneiljordan directorneiljordan directorneiljordan directorneiljordan directorneiljordan actorstephenrea actormirandarichardson actorjayedavidson drama drama drama drama drama drama drama drama drama drama at a fairground in rural northern ireland provisional ira volunteer fergus stephen rea and a unit of other ira members including a woman named jude miranda richardson and led by maguire adrian dunbar kidnap jody forest whitaker a black british soldier after jude lures him to a secluded area with the promise of sex the ira demands the release of imprisoned ira members threatening to execute jody in three days if their demands are not met fergus is tasked with guarding jody and develops a bond with the prisoner much to the chagrin of the other ira men during this time jody tells fergus the story of the scorpion and the frogjody persuades fergus to promise to seek out his girlfriend dil jaye davidson in london should jody be killed the deadline set by jodys captors passes and with none of the iras demands being met jody is to be executed when fergus takes him into the woods to carry out the sentence jody makes a break for it fergus cannot bring himself to shoot the fleeing jody in the back but jody is accidentally run over and killed by a british armoured personnel carrier as they move in to assault the ira safehouse with his ira companions seemingly dead after the attack fergus flees to london where he takes a job as a day labourer using the alias jimmy a few months later fergus finds dil at a hair salon later they talk in a bar where he sees her singing the crying gamefergus suffers from guilt about jodys death and sees him in his dreams bowling a cricket ball to him he pursues dil protecting her from an obsessive suitor and falling in love with her later when he is about to make love to her in her apartment he discovers that she is transgender his initial reaction is of revulsion rushing to the bathroom to vomit he accidentally hits dil in the face a few days later he leaves her a note and the two make up despite everything fergus is still attracted to dil around the same time jude unexpectedly reappears in fergus apartment she tells him that the ira tried and convicted him in absentia and she forces him to agree to help with a new mission to aid in assassinating a judge she also mentions that she knows about fergus and dil warning him that the ira will kill her if fergus does not cooperatefergus unable to overcome his feelings for dil continues to woo her to shield her from possible retribution he gives her a haircut and menswear as a disguise the night before the ira mission is to be carried out dil gets heavily drunk and fergus escorts her to her apartment where she asks him to stay with her fergus complies then admits he had an indirect hand in jodys death dil drunk appears not to understand but in the morning before fergus wakes up dil ties him to the bed she unwittingly prevents fergus from joining the other ira members and completing the planned assassination holding fergus at gunpoint dil forces him to tell her that he loves her and will never leave her she unties him saying that even if he is lying it is nice to hear his words dil then breaks down in tearsmeanwhile jude and maguire gun the judge down but maguire is shot dead by one of the bodyguards a vengeful jude enters dils flat with a gun seeking to kill fergus for missing the assassination dil takes several shots at jude hitting her whilst stating that she is aware that jude was complicit in jodys death and that jude used her sexuality to trick him dil finally kills jude with a shot in the neck she then points the gun at fergus but lowers her hand saying that she cannot kill him because jody will not allow her to fergus prevents dil from shooting herself and tells her to hide out in the club for a while when she is gone he wipes her fingerprints off the gun replacing them with his own and allows himself to be arrested in her placea few months later dil visits fergus in prison where he is serving six years after discussing his postrelease plans she asks why he took the fall for her and he responds as a man once said its in my nature he then tells her the story of the scorpion and the frog
10635,1986 1986 1986 1986 1986 1986 1986 1986 1986 1986 under the cherry moon american american american american american directorprince directorprince directorprince directorprince directorprince actorprince actorkristinscottthomas actorjeromebenton actorfrancescaannis drama drama drama drama drama drama drama drama drama drama gigolos christopher tracy and his brother tricky swindle wealthy french women the situation gets complicated when christopher falls in love with heiress mary sharon after planning to swindle her when he finds out that she receives a 50 million trust fund on her 21st birthday marys father isaac disapproves of the romance and provides an excellent adversary for christopher christopher rivals his brother tricky for the affection of mary
18710,1945 1945 1945 1945 1945 1945 1945 1945 1945 1945 journey together british british british british british directorjohnboulting directorjohnboulting directorjohnboulting directorjohnboulting directorjohnboulting actorrichardattenborough actorjackwatling war war war war war war war war war war two raf aircrew cadets jack wilton richard attenborough and john aynesworth jack watling become friends a friendly rivalry develops between the two while they are training and it ends in a bet they both pass their initial training and are sent to the united states for more advanced instruction however once there it becomes clear that corporal wilton while he is otherwise a great pilot cannot land a plane because of his inability to judge height wilton is devastated and the feeling worsens when he sees that aynsworth is a natural pilot aynsworth proceeds with his pilots training and wilton is sent up to canada to be trained as a navigator insteadhe turns out to be a good navigator but he shows no interest in his training and falls behind his peers then on a practice flight through the connivance of his trainer and aynsworth he realizes how important his job is after graduating he serves as navigator to aynesworth on a bombing operation during the raid the plane is hit and begins to lose fuel wilton must demonstrate everything he learned when they have to land in the sea and he communicates their position it turns out that he perfectly calculated their position and the rescue plane quickly finds them
21761,1981 1981 1981 1981 1981 1981 1981 1981 1981 1981 gas canadian canadian canadian canadian canadian directorlesrose directorlesrose directorlesrose directorlesrose directorlesrose actorphilipakin actorsusananspach comedy comedy comedy comedy comedy comedy comedy comedy comedy comedy a small midwestern town is thrown into chaos when the local oil tycoon duke stuyvesant orchestrates a phony oil shortage in order to increase profits news reporter jane beardsley tries to uncover the plot radio dj nick the noz observing from his stations news helicopter reports on the craziness caused by the gasoline shortage
33706,2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 himitsu no akkochan japanese japanese japanese japanese japanese directoryasuhirokawamura directoryasuhirokawamura directoryasuhirokawamura directoryasuhirokawamura directoryasuhirokawamura actorharukaayase actormasakiokada romance romance romance romance romance romance romance romance romance romance atsuko kagami is a childlike arrogant elementary school girl who has an affinity for mirrors one day her favorite mirror which was given to akko by her mother or in some versions by her father as a present from india is broken and she prefers to bury it in her yard rather than throw it to the trashcanin her dreams she is contacted by a spirit or in some cases the queen of the mirror kingdom who is touched that the girl would treat the mirror so respectfully and not simply throw it away akkochan is then given the gift of a magical mirror and taught enchantments tekumaku mayakon tekumaku mayakon and lamipus lamipus lu lu lu lu lu that will allow her to transform into anything she wishes5
6720,1957 1957 1957 1957 1957 1957 1957 1957 1957 1957 bayou american american american american american directorharolddaniels directorharolddaniels directorharolddaniels directorharolddaniels directorharolddaniels actorpetergraves actorlitamilan drama drama drama drama drama drama drama drama drama drama martin davis an insecure young architect comes to new orleans from the north to compete against a local man for the job of designing a new civic auditorium on a visit to a carnival in the cajun country of southern louisiana martin meets marie a sensual cajun girl of seventeen who works as a crabber in the bayou in order to support herself and her partly senile alcoholic father herbert marie has aroused the lustful instincts of the local storekeeper ulysses a sadistic illiterate bully who has attempted to rape her after helping marie to recover money stolen from her martin asks her to be his guide for the carnival activities in order to impress the local building commissioner martins contractor friend jim tallant enters him in a race using pirogues primitive canoes hollowed out of tree trunks martin and ulysses compete against each other and ulysses who greatly resents martins interest in marie wins when he deliberately cuts in front of martins canoemartin and marie find themselves falling in love and martin arranges to stay longer in the area in order to court her ulysses then threatens to harm marie unless herbert gets rid of martin later at a shivaree celebrating the marriage of an old man and a young girl marie performs a local folk dance ulysses then performs a strange gyrating dance at the conclusion of which he challenges martin to fight for marie but martin walks away suddenly the wind rises and a hurricane sweeps through the area causing much devastation while martin and marie seek shelter in an unoccupied house martin asks marie to marry him and she accepts agitated by the hurricane herbert goes berserk and is killed by a falling tree at herberts funeral ulysses makes a final effort to win marie and taunts martin into a brutal fight however martin is victorious and he and marie leave the bayou to begin a new life together in the north


### Removing Stopwords

In [15]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords

In [16]:
stops = stopwords.words('english')
print(stops)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [17]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(lemma_words)

df['doc_clean'] = df['doc'].map(lambda s:preprocess(s)) 

In [18]:
df[["doc", 'doc_clean']].sample(10)

Unnamed: 0,doc,doc_clean
6122,1954 1954 1954 1954 1954 1954 1954 1954 1954 1954 johnny dark american american american american american directorgeorgesherman directorgeorgesherman directorgeorgesherman directorgeorgesherman directorgeorgesherman actortonycurtis actorpiperlaurie drama drama drama drama drama drama drama drama drama drama johnny dark and his pal duke benson work for fielding motors where owner james fielding manufactures familyfriendly automobiles chief engineer scotty overhears the guys complaining about the company and spots a sportscar design johnny and duke have donenew employee liz catches the eye of the guys her secret is that she is fieldings granddaughter when a major stockholder winston protests the companys unwillingness to create new products for more profits scotty blurts out that fielding motors is developing a new sports carliz is chosen as the cars designer while johnny and duke go to work building it duke invites her to go dancing but is jealous when he spots her kissing johnny who has discovered lizs true identityduke is fired after flipping the car during a practice run he blames it on brake failure but johnny feels its just an excuse liz is disappointed in johnny for not defending his friendthe car is entered in a canadatomexico race johnny must drive it himself duke having been hired to drive another vehicle fielding dislikes making a sports car but accepts scottys wager on the race the car has a breakdown and johnny must push it into las vegas but when a radio broadcast implies that fielding doesnt care scotty and a team of mechanics rush to vegas to help johnny get back into the race and win which liz and duke help him celebrate,1954 1954 1954 1954 1954 1954 1954 1954 1954 1954 johnni dark american american american american american directorgeorgesherman directorgeorgesherman directorgeorgesherman directorgeorgesherman directorgeorgesherman actortonycurti actorpiperlauri drama drama drama drama drama drama drama drama drama drama johnni dark pal duke benson work field motor owner jame field manufactur familyfriendli automobil chief engin scotti overhear guy complain compani spot sportscar design johnni duke donenew employe liz catch eye guy secret field granddaught major stockhold winston protest compani unwilling creat new product profit scotti blurt field motor develop new sport carliz chosen car design johnni duke work build duke invit danc jealou spot kiss johnni discov liz true identityduk fire flip car practic run blame brake failur johnni feel excus liz disappoint johnni defend friendth car enter canadatomexico race johnni must drive duke hire drive anoth vehicl field dislik make sport car accept scotti wager race car breakdown johnni must push la vega radio broadcast impli field doesnt care scotti team mechan rush vega help johnni get back race win liz duke help celebr
30437,2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 pandavar bhoomi tamil tamil tamil tamil tamil directorcheran directorcheran directorcheran directorcheran directorcheran actorarunvijay actorshamitha actorrajkiran dhanasekar rajkiran returns to his village with his family after 12 years all the villagers feel happy over welcoming him back to the village dhanasekar decides to build a new bungalow adjacent to his ancestral home in the village and plans to live there he approaches tamizharasan aka tamizh arun vijay an architect to design his new home tamizhs father vijayakumar is also an architect who is famous for traditional architectural designsdhanasekar and his family temporarily stay in his neighbors house until the construction of new house is completed tamizh also stays in a tent near the construction site managing the construction activities vinu chakravarthy lives in the same village and he hates dhanasekar and his family he wants them to be killed and sends his son mukesh tiwari to work with the construction workers so that he can get them killed somedayjeeva shamitha is the niece of dhanasekar and she befriends tamizh tamizh also likes jeeva and slowly friendship transforms into love for tamizh when tamizh proposes his love to jeeva she leaves the place without replying which makes tamizh understand that she does not have any feelings for himone day a family comes to dhanasekars house expressing interest to get jeeva married to their son but dhanasekar informs them that jeeva will be married to his brother ranjith who is in jail for 12 years tamizh gets shocked knowing about this and he gets angry that the family is planning to sacrifice jeeva by getting her married to someone who has been jailed for years dhanasekar comes to know about the love between jeeva and tamizh he calls tamizh and tells a flashback about his familydhanasekar was leading a joyful life in the same village along with his brothers chandrsekhar and ranjith and sisters kavitha and thamarai also shamitha manorama is their mother vinu chakravarthy does not like dhanasekars family and always picks up quarrels with them one day vinu chakravarthy poisons the local pond thereby killing dhanasekars cows and oxen that drank water from the pond dhanasekar gets furious and lodges a complaint with the police following which vinu chakravarthy is arrestedto everyones shock the family comes to know that thamarai is in love with akash the younger son of vinu chakravarthy dhanasekar and manorama warn thamarai to forget akash and they plan to get her married immediately with someone else but on the day of marriage thamarai elopes from home with akash and manorama dies on the same day as she could not tolerate thiswhile dhanasekar and his brothers perform the funeral of their mother thamarai comes along with akash following their wedding ranjith gets furious seeing her and he runs towards them and chops the heads of both thamarai and akash in anger all happened in a fraction of second and ranjith could not believe that he killed his own sister he then feels guilty and surrenders to the police and has been sentenced to 12 years of imprisonment dhanasekar and his family members leave the village and move to tirupur where they earn well and the family comes back to the same village after 12 years but they are worried that ranjiths life will be spoiled as no girl will be interested to marry him as he has been jailed so they decide to get jeeva married to him jeeva though not interested in the proposal agrees to marry her uncle ranjith for the wellbeing of her familytamizh understands the familys situation and decides to give up his love for jeeva but on the other hand jeeva decides to express her interest in marrying tamizh she meets tamizh and conveys her love but tamizh says that he came to know about her familys past and he also felt proud about jeevas decision to sacrifice her life for the happiness of her family he apologizes to jeeva for confusing her predetermined mind in the name of love and he also requests her to marry ranjith who is about to get released from jail soonthe new home is completed and mukesh tiwari plants a bomb in the home so that everyone can be killed as well as the home can be destroyed tamizh gets to know about mukesh tiwaris plans and he alerts everyone also tamizh rushes into the house in search of the bomb and finds it after much struggle he takes the bomb and throws it away before it could explode thereby saving all the family members and also the newly built home the entire family thank tamizh for his help tamizh leaves to his home on the next dayafter a few days dhanasekar comes along with his brothers chandrasekhar and ranjith after getting released from jail to meet tamizh and his father vijayakumar inviting them for the house warming ceremony as well as for jeevas wedding on seeing the invitation card tamizh gets surprised to see his name as groom for jeevaa flashback is shown where ranjith after getting released from jail comes to meet his family they share some good moments as he meets his family members after a period of 12 years as he never allowed anyone to meet him in jail except dhanasekar ranjith is surprised on seeing jeeva as she exactly resembles his sister thamarai in looks ranjith also believes that thamarai has reborn in the form of jeeva and feels happy that he got back his lovable sister whom he killed out of anger also ranjith says that he does not want to marry any one and instead he prefers to stay single along with all the family members the family members then decide to get jeeva married to tamizh the movie ends with both tamizh and jeeva getting united,2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 pandavar bhoomi tamil tamil tamil tamil tamil directorcheran directorcheran directorcheran directorcheran directorcheran actorarunvijay actorshamitha actorrajkiran dhanasekar rajkiran return villag famili year villag feel happi welcom back villag dhanasekar decid build new bungalow adjac ancestr home villag plan live approach tamizharasan aka tamizh arun vijay architect design new home tamizh father vijayakumar also architect famou tradit architectur designsdhanasekar famili temporarili stay neighbor hous construct new hous complet tamizh also stay tent near construct site manag construct activ vinu chakravarthi live villag hate dhanasekar famili want kill send son mukesh tiwari work construct worker get kill somedayjeeva shamitha niec dhanasekar befriend tamizh tamizh also like jeeva slowli friendship transform love tamizh tamizh propos love jeeva leav place without repli make tamizh understand feel himon day famili come dhanasekar hous express interest get jeeva marri son dhanasekar inform jeeva marri brother ranjith jail year tamizh get shock know get angri famili plan sacrific jeeva get marri someon jail year dhanasekar come know love jeeva tamizh call tamizh tell flashback familydhanasekar lead joy life villag along brother chandrsekhar ranjith sister kavitha thamarai also shamitha manorama mother vinu chakravarthi like dhanasekar famili alway pick quarrel one day vinu chakravarthi poison local pond therebi kill dhanasekar cow ox drank water pond dhanasekar get furiou lodg complaint polic follow vinu chakravarthi arrestedto everyon shock famili come know thamarai love akash younger son vinu chakravarthi dhanasekar manorama warn thamarai forget akash plan get marri immedi someon el day marriag thamarai elop home akash manorama die day could toler thiswhil dhanasekar brother perform funer mother thamarai come along akash follow wed ranjith get furiou see run toward chop head thamarai akash anger happen fraction second ranjith could believ kill sister feel guilti surrend polic sentenc year imprison dhanasekar famili member leav villag move tirupur earn well famili come back villag year worri ranjith life spoil girl interest marri jail decid get jeeva marri jeeva though interest propos agre marri uncl ranjith wellb familytamizh understand famili situat decid give love jeeva hand jeeva decid express interest marri tamizh meet tamizh convey love tamizh say came know famili past also felt proud jeeva decis sacrific life happi famili apolog jeeva confus predetermin mind name love also request marri ranjith get releas jail soonth new home complet mukesh tiwari plant bomb home everyon kill well home destroy tamizh get know mukesh tiwari plan alert everyon also tamizh rush hous search bomb find much struggl take bomb throw away could explod therebi save famili member also newli built home entir famili thank tamizh help tamizh leav home next dayaft day dhanasekar come along brother chandrasekhar ranjith get releas jail meet tamizh father vijayakumar invit hous warm ceremoni well jeeva wed see invit card tamizh get surpris see name groom jeevaa flashback shown ranjith get releas jail come meet famili share good moment meet famili member period year never allow anyon meet jail except dhanasekar ranjith surpris see jeeva exactli resembl sister thamarai look ranjith also believ thamarai reborn form jeeva feel happi got back lovabl sister kill anger also ranjith say want marri one instead prefer stay singl along famili member famili member decid get jeeva marri tamizh movi end tamizh jeeva get unit
9057,1974 1974 1974 1974 1974 1974 1974 1974 1974 1974 lovin molly american american american american american directorsidneylumet directorsidneylumet directorsidneylumet directorsidneylumet directorsidneylumet actorblythedanner actoranthonyperkins actorbeaubridges over a span of nearly 40 years gid and johnny a pair of texas farm boys compete for the affections of molly taylor a free spirit who cares for both of them the story is told by three consecutive segments which is narrated by one of the three lead rolesthe first segment is set in 1925 and narrated by gid who introduces himself as well as his best friend johnny and johnnys girlfriend molly taylor with whom gid becomes smitten gid works parttime as a ranch hand at mollys farm and often competes against johnny for mollys affections despite their frequent feud and arguments gid and johnnys friendship never ends during their excursions and errands for mollys father to sell and buy cattle for the family farm molly eventually sleeps with gid as well as johnny but she eventually chooses neither one of them and instead marries school friend eddie after the death of her father gid eventually marries sarah a local widow with several children and johnny leaves town for places unknownthe second segment is set in 1945 and is narrated by molly it was revealed that molly had three sons from her three different suitors and each one of them died in combat during world war ii which is currently waging mollys husband eddie also died from an illness several years before gid had divorced sarah and began spending most of his free time with molly who withheld the news of their sons death in battle when he finally did learn the news gid took it badly and became more depressed johnny reentered their lives after living away and having had married and divorced his own wife took a more active part in helping molly run her late fathers farmthe third and final segment is set in 1964 and is narrated by johnny he reveals that gid is in a local hospital dying from cancer and johnny has been keeping a bedside vigil over him wanting out of the place johnny takes gid away from the hospital for a few days to visit molly who is still living at her fathers farm and is contemplating selling it after working with johnny around the farm to relive their good old days long gone by gid passes away as johnny is taking him back to the hospital after gids funeral johnny meets with molly where they agree and despite they never got married or had a life in operating her family farm they will always be soul mates before johnny leaves molly for the last time,1974 1974 1974 1974 1974 1974 1974 1974 1974 1974 lovin molli american american american american american directorsidneylumet directorsidneylumet directorsidneylumet directorsidneylumet directorsidneylumet actorblythedann actoranthonyperkin actorbeaubridg span nearli year gid johnni pair texa farm boy compet affect molli taylor free spirit care stori told three consecut segment narrat one three lead rolesth first segment set 1925 narrat gid introduc well best friend johnni johnni girlfriend molli taylor gid becom smitten gid work parttim ranch hand molli farm often compet johnni molli affect despit frequent feud argument gid johnni friendship never end excurs errand molli father sell buy cattl famili farm molli eventu sleep gid well johnni eventu choos neither one instead marri school friend eddi death father gid eventu marri sarah local widow sever child johnni leav town place unknownth second segment set 1945 narrat molli reveal molli three son three differ suitor one die combat world war current wage molli husband eddi also die ill sever year gid divorc sarah began spend free time molli withheld news son death battl final learn news gid took badli becam depress johnni reenter live live away marri divorc wife took activ part help molli run late father farmth third final segment set 1964 narrat johnni reveal gid local hospit die cancer johnni keep bedsid vigil want place johnni take gid away hospit day visit molli still live father farm contempl sell work johnni around farm reliv good old day long gone gid pas away johnni take back hospit gid funer johnni meet molli agre despit never got marri life oper famili farm alway soul mate johnni leav molli last time
14350,2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 runaway jury american american american american american directorgaryfleder directorgaryfleder directorgaryfleder directorgaryfleder directorgaryfleder actorjohncusack actorgenehackman actordustinhoffman actorrachelweisz crime drama crime drama crime drama crime drama crime drama crime drama crime drama crime drama crime drama crime drama in new orleans a failed day trader at a stock brokerage firm shows up at the office and opens fire on his former colleagues then kills himself among the dead is jacob wood two years later with attorney wendell rohr jacobs widow celeste takes vicksburg firearms to court on the grounds that the companys gross negligence led to her husbands death during jury selection jury consultant rankin fitch and his team communicate background information on each of the jurors to lead defense attorney durwood cable in the courtroom through electronic surveillancein the jury pool nick easter tries to get himself excused from jury duty judge frederick harkin decides to give nick a lesson in civic duty and fitch tells cable that the judge has now given them no choice and that he must select nick as a juror nicks congenial manner wins him acceptance from his fellow jurors but frank herrera a marine veteran takes an instant dislike to hima woman named marlee makes an offer to fitch and rohr she will deliver the verdict to the first bidder rohr dismisses the offer assuming it to be a tactic by fitch to obtain a mistrial fitch asks for proof that she can deliver though which nick provides fitch orders nicks apartment searched but finds nothing marlee retaliates by getting one of fitchs jurors bounced nick shows the judge surveillance footage of his apartment being searched and the judge orders the jury sequestered fitch then goes after three jurors with blackmail leading one rikki coleman to attempt suiciderohr loses a key witness due to harassment and after confronting fitch decides that he cannot win the case he asks his firms partners for 10million fitch sends an operative janovich to kidnap marlee but she fights him off and raises fitchs price to 15million on principle rohr changes his mind and refuses to pay fitch agrees to pay marlee to be certain of the verdictfitchs subordinate doyle travels to gardner indiana where he discovers that nick is really jeff kerr a law school dropout and that marlees real name is gabby brandt gabbys sister died in a school shooting the town sued the gun manufacturer and fitch helped the defense win the case doyle concludes that nick and marlees offer is a setup and he calls fitch but it is too latenick receives confirmation of receipt of payment and he steers the jury in favor of the plaintiff much to the chagrin of herrera who launches into a rant against the plaintiff which undermines his support the gun manufacturer is found liable with the jury awarding 110million in general damages to celeste woodafter the trial nick and marlee confront fitch with a receipt for the 15million bribe and demand that he retire they inform him that the 15million will benefit the shooting victims in gardner,2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 runaway juri american american american american american directorgaryfled directorgaryfled directorgaryfled directorgaryfled directorgaryfled actorjohncusack actorgenehackman actordustinhoffman actorrachelweisz crime drama crime drama crime drama crime drama crime drama crime drama crime drama crime drama crime drama crime drama new orlean fail day trader stock brokerag firm show offic open fire former colleagu kill among dead jacob wood two year later attorney wendel rohr jacob widow celest take vicksburg firearm court ground compani gross neglig led husband death juri select juri consult rankin fitch team commun background inform juror lead defens attorney durwood cabl courtroom electron surveillancein juri pool nick easter tri get excus juri duti judg frederick harkin decid give nick lesson civic duti fitch tell cabl judg given choic must select nick juror nick congeni manner win accept fellow juror frank herrera marin veteran take instant dislik hima woman name marle make offer fitch rohr deliv verdict first bidder rohr dismiss offer assum tactic fitch obtain mistrial fitch ask proof deliv though nick provid fitch order nick apart search find noth marle retali get one fitch juror bounc nick show judg surveil footag apart search judg order juri sequest fitch goe three juror blackmail lead one rikki coleman attempt suiciderohr lose key wit due harass confront fitch decid cannot win case ask firm partner 10million fitch send oper janovich kidnap marle fight rais fitch price 15million principl rohr chang mind refus pay fitch agre pay marle certain verdictfitch subordin doyl travel gardner indiana discov nick realli jeff kerr law school dropout marle real name gabbi brandt gabbi sister die school shoot town su gun manufactur fitch help defens win case doyl conclud nick marle offer setup call fitch latenick receiv confirm receipt payment steer juri favor plaintiff much chagrin herrera launch rant plaintiff undermin support gun manufactur found liabl juri award 110million gener damag celest woodaft trial nick marle confront fitch receipt 15million bribe demand retir inform 15million benefit shoot victim gardner
12418,1995 1995 1995 1995 1995 1995 1995 1995 1995 1995 the doom generation american american american american american directorgreggaraki directorgreggaraki directorgreggaraki directorgreggaraki directorgreggaraki actorjamesduval teenage lovers jordan white and amy blue pick up a handsome drifter named xavier red while driving home from a club jordan gives xavier the nickname x a latenight stop at a convenience store leaves the three on the run when x accidentally kills the stores owner forcing the trio to hide in a motel to avoid arrest while jordan and amy have sex in the bathtub x learns from the local television news program that the store owners wife disemboweled her children with a machete before committing suicide thus he concludes removing any possibility of the trio being caught by the policelater that evening amy has sex with x even though they do not get along eventually jordan finds out and things become tense as the two men develop a lingering sexual attraction for one another as the trio journeys around the city of los angeles they continue to get into violent situations due to people either claiming to be amys previous lovers or mistaking her for such the fbi has a meeting and declares it will find amy and kill her exactly the same sentiment is voiced by several other parties in the film she is mistakenly identified by a fast food window clerk as sunshine and later by a character played by parker posey as kittenjordan amy and x spend the night in an abandoned warehouse where they engage in a threesome while amy goes to urinate jordan and x are attacked by a trio of neonazis one of whom had previously mistaken amy for his exgirlfriend bambi the gang proceeds to beat up x and then hold jordan down as the aforementioned neonazi ties up and rapes amy on top of an american flag the group finally severs jordans penis with pruning shears and forces it into his mouth after amy breaks free she kills the neonazis with the shears and escapes with x leaving jordan for dead the film ends with amy and x driving aimlessly on the road with no communication as the film fades,1995 1995 1995 1995 1995 1995 1995 1995 1995 1995 doom gener american american american american american directorgreggaraki directorgreggaraki directorgreggaraki directorgreggaraki directorgreggaraki actorjamesduv teenag lover jordan white ami blue pick handsom drifter name xavier red drive home club jordan give xavier nicknam latenight stop conveni store leav three run accident kill store owner forc trio hide motel avoid arrest jordan ami sex bathtub learn local televis news program store owner wife disembowel child machet commit suicid thu conclud remov possibl trio caught policelat even ami sex even though get along eventu jordan find thing becom ten two men develop linger sexual attract one anoth trio journey around citi lo angel continu get violent situat due peopl either claim ami previou lover mistak fbi meet declar find ami kill exactli sentiment voic sever parti film mistakenli identifi fast food window clerk sunshin later charact play parker posey kittenjordan ami spend night abandon warehous engag threesom ami goe urin jordan attack trio neonazi one previous mistaken ami exgirlfriend bambi gang proce beat hold jordan aforement neonazi tie rape ami top american flag group final sever jordan peni prune shear forc mouth ami break free kill neonazi shear escap leav jordan dead film end ami drive aimlessli road commun film fade
5045,1950 1950 1950 1950 1950 1950 1950 1950 1950 1950 at war with the army american american american american american directorhalwalker directorhalwalker directorhalwalker directorhalwalker directorhalwalker actordeanmartin actorjerrylewis actorpollybergen musical comedy musical comedy musical comedy musical comedy musical comedy musical comedy musical comedy musical comedy musical comedy musical comedy the film is set at a united states army base in kentucky at the end of 1944 during world war ii4 the protagonists are first sergeant vic puccinelli dean martin and private first class alvin korwin jerry lewis who were partners in a nightclub songanddance act before joining the armypuccinelli wants to be transferred from his dull job to active duty overseas but is refused transfer and is to be commissioned a warrant officer korwin wants a pass to see his wife and new baby in addition they have to rehearse for the base talent show and avoid the wrath of alvins platoon sergeant sergeant mcvey mike kellinalong the way they both sing a few songs and they do an impression of bing crosby and barry fitzgerald by recreating a scene from going my way for the talent show further complications include a post exchange worker who is pregnant a company commander who gets all his information from his wife a scheming supply sergeant and a defective cocacola machine,1950 1950 1950 1950 1950 1950 1950 1950 1950 1950 war armi american american american american american directorhalwalk directorhalwalk directorhalwalk directorhalwalk directorhalwalk actordeanmartin actorjerrylewi actorpollybergen music comedi music comedi music comedi music comedi music comedi music comedi music comedi music comedi music comedi music comedi film set unit state armi base kentucki end 1944 world war ii4 protagonist first sergeant vic puccinelli dean martin privat first class alvin korwin jerri lewi partner nightclub songandd act join armypuccinelli want transfer dull job activ duti oversea refus transfer commiss warrant offic korwin want pas see wife new babi addit rehears base talent show avoid wrath alvin platoon sergeant sergeant mcvey mike kellinalong way sing song impress bing crosbi barri fitzgerald recreat scene go way talent show complic includ post exchang worker pregnant compani command get inform wife scheme suppli sergeant defect cocacola machin
351,1920 1920 1920 1920 1920 1920 1920 1920 1920 1920 sex american american american american american directorfredniblo directorfredniblo directorfredniblo directorfredniblo directorfredniblo actorlouiseglaum actorirvingcummings drama drama drama drama drama drama drama drama drama drama the film is a morality story on the evils of marital infidelity and the wild lifestyle of new york actors at the same time the film included scenes of seduction and debauchery that made it the subject of controversy over its prurient contentthe films plot centers on adrienne renault played by louise glaum the beautiful queen of the midnight follies at the frivolity theaterthe film opens with renaults current conquest a married millionaire philip overman played by william conklin overman is in his private box watching renault perform her seductive spider dance renault comes on stage dressed as a spider clad in a translucent cloak of webs wrapped cloaklike around a bodyhugging black sheath2in another scene of debauchery the film depicts a party at which stagedoor johnnies drink out of womens slippers and scantily clad chorines slide down banisters their undergarments visible to all and sundry23the film then shifts to mrs overman played by myrtle stedman home alone in her empty mansion her suspicions persuade her to hire a private detective to follow her husband eventually mrs overman uncovers her husbands infidelity she begs renault to release her husband but renault refuses and mrs overman obtains a divorceby this time renault has fallen in love with a new millionaire dick wallace played by irving cummings renault marries wallace but wallace then betrays renault falling in love with renaults young protege daisy played by viola barry it was renault who had coached daisy in the ways of seducing wealthy married men renault begs daisy to release wallace harkening back to the scene where mrs overmire had pleaded with renault as renault had done with mrs overman daisy refuses to release wallacerenault then sails for europe she ends up on the same ship with the reunited overmans who are on a second honeymoon the chastened renault does nothing to disrupt the relationship resigned to a life of solitude the films final intertitle reads the standards of morality eternally demand that the naked soul of sex be stripped of its falsehoods which can only be atoned for through bitter tears23,1920 1920 1920 1920 1920 1920 1920 1920 1920 1920 sex american american american american american directorfredniblo directorfredniblo directorfredniblo directorfredniblo directorfredniblo actorlouiseglaum actorirvingcum drama drama drama drama drama drama drama drama drama drama film moral stori evil marit infidel wild lifestyl new york actor time film includ scene seduct debaucheri made subject controversi prurient contentth film plot center adrienn renault play louis glaum beauti queen midnight folli frivol theaterth film open renault current conquest marri millionair philip overman play william conklin overman privat box watch renault perform seduct spider danc renault come stage dress spider clad transluc cloak web wrap cloaklik around bodyhug black sheath2in anoth scene debaucheri film depict parti stagedoor johnni drink woman slipper scantili clad chorin slide banist undergar visibl sundry23th film shift mr overman play myrtl stedman home alon empti mansion suspicion persuad hire privat detect follow husband eventu mr overman uncov husband infidel beg renault releas husband renault refus mr overman obtain divorcebi time renault fallen love new millionair dick wallac play irv cum renault marri wallac wallac betray renault fall love renault young proteg daisi play viola barri renault coach daisi way seduc wealthi marri men renault beg daisi releas wallac harken back scene mr overmir plead renault renault done mr overman daisi refus releas wallacerenault sail europ end ship reunit overman second honeymoon chasten renault noth disrupt relationship resign life solitud film final intertitl read standard moral etern demand nake soul sex strip falsehood aton bitter tears23
11530,1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 stanley iris american american american american american directormartinritt directormartinritt directormartinritt directormartinritt directormartinritt actorjanefonda actorrobertdeniro actormarthaplimpton actorswoosiekurtz romantic drama romantic drama romantic drama romantic drama romantic drama romantic drama romantic drama romantic drama romantic drama romantic drama iris king a widow still grieving 6 months after the loss of her husband works in a baking factory in connecticut and lives in a highcrime area she lives from paycheck to paycheck as she raises her two children kelly and richard also staying with her are her sister sharon and sharons abusive husband joe both unemployed with money already tight for the family kelly discovers she is pregnant which makes matters worseiris makes the acquaintance of stanley cox a cook in the bakerys lunchroom cafeteria when he comes to her aid after her purse is snatched on a bus but as their friendship develops she begins noticing peculiarities about stanley he doesnt own a car he instead bicycles wherever he needs to go he lives with and supports his elderly father becomes frustrated when asked to sign his name doesnt believe in opening chinese fortune cookies and cannot pick out a specific item from a shelf iris soon realizes that stanley is illiterate and when she innocently mentions this to stanleys boss stanley is fired the next day over food safety and legal concerns despite being a good cook and model employee afterwards stanley is unable to obtain any steady work forcing him to move into a garage and put his father in a shabby retirement home his father dies in the home only a few weeks later upsetting stanley over the fact that his illiteracy prevented him from caring for his father properly stanley seeks iris out and asks her to teach him to read explaining that his travelingsalesman father moved him all over the country when stanley was a boy bouncing him to nearly 50 different schools in total resulting in stanley developing no reading or writing skills from this lack of educational stability iris begins giving stanley basic reading lessons and he gradually grows close to her and her family it is during one of these reading exercises that he tells her that he has wanted to be intimate with her since they first met but iris is hesitantiris tests stanleys developing reading skills by making him a map and having him meet her at a certain street corner in 15 minutes but stanley gets hopelessly lost hours later he reaches the corner where a frantic iris is still waiting frustrated stanley marches off alone without saying a word his interest in learning to read gone iris visits him at his garage home to try to persuade him to continue learning to read looking around she sees a large mechanical project that stanley is working on as he invents things as a hobby he has designed a cakecooling machine that can outperform anything in the commercial marketplace iris is immensely impressed and stanley reveals that a local company has shown interest in his invention and even offered him a job stanley agrees to start reading again with iris and in time learns to write short sentences stanley surprises iris by cooking a big dinner for her and her family and the two of them begin to grow close againafter kelly has her baby iris is displeased when she drops out of school to work at the bakery as she doesnt want her daughter wasting her life in the kind of deadend job she herself is in stanley and iris finally decide to make love but iris is still clinging to her late husbands memory this threatens their budding relationship further and they dont see each other for some time not prepared to give up on iris the way she didnt give up on him stanley finally goes to see her iris hands him an unmailed letter she wrote to him and stanley surprises her by reading it aloud nearly perfectly iris now ready to start letting go of the past accompanies stanley to a fancy hotel where they order room service and spend the night togetherstanley soon moves to detroit for a new wellpaying job he has been offered his inventing ability finally having paid off several months later back in connecticut iris is walking home carrying groceries when an expensive car pulls up next to her and she is surprised to find stanley behind the wheel stanley tells her that hes been given a raise and is looking to buy a large sixbedroom house in detroit and that he wants her to move there with him as his wife iris accepts,1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 stanley iri american american american american american directormartinritt directormartinritt directormartinritt directormartinritt directormartinritt actorjanefonda actorrobertdeniro actormarthaplimpton actorswoosiekurtz romant drama romant drama romant drama romant drama romant drama romant drama romant drama romant drama romant drama romant drama iri king widow still griev month loss husband work bake factori connecticut live highcrim area live paycheck paycheck rais two child kelli richard also stay sister sharon sharon abus husband joe unemploy money alreadi tight famili kelli discov pregnant make matter worseiri make acquaint stanley cox cook bakeri lunchroom cafeteria come aid purs snatch bu friendship develop begin notic peculiar stanley doesnt car instead bicycl wherev need live support elderli father becom frustrat ask sign name doesnt believ open chine fortun cooki cannot pick specif item shelf iri soon realiz stanley illiter innoc mention stanley bos stanley fire next day food safeti legal concern despit good cook model employe afterward stanley unabl obtain steadi work forc move garag put father shabbi retir home father die home week later upset stanley fact illiteraci prevent care father properli stanley seek iri ask teach read explain travelingsalesman father move countri stanley boy bounc nearli differ school total result stanley develop read write skill lack educ stabil iri begin give stanley basic read lesson gradual grow close famili one read exercis tell want intim sinc first met iri hesitantiri test stanley develop read skill make map meet certain street corner minut stanley get hopelessli lost hour later reach corner frantic iri still wait frustrat stanley march alon without say word interest learn read gone iri visit garag home tri persuad continu learn read look around see larg mechan project stanley work invent thing hobbi design cakecool machin outperform anyth commerci marketplac iri immens impress stanley reveal local compani shown interest invent even offer job stanley agre start read iri time learn write short sentenc stanley surpris iri cook big dinner famili two begin grow close againaft kelli babi iri displeas drop school work bakeri doesnt want daughter wast life kind deadend job stanley iri final decid make love iri still cling late husband memori threaten bud relationship dont see time prepar give iri way didnt give stanley final goe see iri hand unmail letter wrote stanley surpris read aloud nearli perfectli iri readi start let past accompani stanley fanci hotel order room servic spend night togetherstanley soon move detroit new wellpay job offer invent abil final paid sever month later back connecticut iri walk home carri groceri expens car pull next surpris find stanley behind wheel stanley tell he given rais look buy larg sixbedroom hous detroit want move wife iri accept
15059,2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 the second chance american american american american american directorstevetaylor directorstevetaylor directorstevetaylor directorstevetaylor directorstevetaylor actormichaelwsmith actorjeffobafemicarr drama drama drama drama drama drama drama drama drama drama ethan jenkins michael w smith is a pastor who enjoys working with his welltodo congregation at the request of his father ethan takes an assignment at second chance church where he meets jake sanders jeff obafemi carr jake is a pastor who lives in a completely different world from that of ethan and spends much of his time dealing with poverty drugs and crime the two different lifestyles of these pastors cause an inevitable conflict as these two men try to bridge the divide,2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 second chanc american american american american american directorstevetaylor directorstevetaylor directorstevetaylor directorstevetaylor directorstevetaylor actormichaelwsmith actorjeffobafemicarr drama drama drama drama drama drama drama drama drama drama ethan jenkin michael smith pastor enjoy work welltodo congreg request father ethan take assign second chanc church meet jake sander jeff obafemi carr jake pastor live complet differ world ethan spend much time deal poverti drug crime two differ lifestyl pastor caus inevit conflict two men tri bridg divid
27892,2005 2005 2005 2005 2005 2005 2005 2005 2005 2005 immini nalloraal malayalam malayalam malayalam malayalam malayalam directorrajase directorrajase directorrajase directorrajase directorrajase actorjayasurya actornavyanair sneha navya nair is a popular actress jeevan is a junior artist jayasurya who is attracted to her one day while shooting jeevan kidnaps sneha and takes her to an unknown location snehas family and her colleagues try to negotiate with jeevan to get sneha back but jeevan does not want to give sneha backafter a few days of keeping her as a hostage jeevan decides to give her back and surrender he confesses that he took sneha as a hostage because he saw her suicide note in her diary sneha understands the true love he has for her and saves him from the charges that were awaiting him finally the two unites,2005 2005 2005 2005 2005 2005 2005 2005 2005 2005 immini nallora malayalam malayalam malayalam malayalam malayalam directorrajas directorrajas directorrajas directorrajas directorrajas actorjayasurya actornavyanair sneha navya nair popular actress jeevan junior artist jayasurya attract one day shoot jeevan kidnap sneha take unknown locat sneha famili colleagu tri negoti jeevan get sneha back jeevan want give sneha backaft day keep hostag jeevan decid give back surrend confess took sneha hostag saw suicid note diari sneha understand true love save charg await final two unit


## TF-IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["doc_clean"])
column_names = vectorizer.get_feature_names_out()

df_tf_idf = pd.DataFrame(X.toarray(), columns=column_names)
df_tf_idf.shape

(34886, 251188)

## Cosine Similarity

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

df_cos_sim = pd.DataFrame(cosine_similarity(df_tf_idf, dense_output=True))
df_cos_sim.shape

  ret = a @ b


(34886, 34886)

### Converting Cosine Similarity Dataframe to Top-K Items

In [34]:
import warnings

# hide pandas warning messages
warnings.filterwarnings('ignore')

In [35]:
from tqdm.notebook import tqdm

K = 10

df_top_k = pd.DataFrame()
movie_indices = df_cos_sim.columns

for col in tqdm(movie_indices):
    df_test = df_cos_sim[[col]].sort_values(by=[col], ascending=False).head(K+1).copy()

    record = []
    for index, row in df_test.iterrows():
        if index != col:
            item = [int(index), float(row[col])]
            record.append(item)
        if len(record) == K:
            break

    df_top_k[col] = record

df_top_k.shape

  0%|          | 0/34886 [00:00<?, ?it/s]

(10, 34886)

In [40]:
# Transpose
df_top_k = df_top_k.T

df_top_k.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
26025,"[26504, 0.3620489229730903]","[28718, 0.3252340008773924]","[28632, 0.3051069020176202]","[26299, 0.28905304730763376]","[23910, 0.27672501700917873]","[27536, 0.2139407924188466]","[13906, 0.20152185292688773]","[26049, 0.16824506757415866]","[26654, 0.1561447509224819]","[26896, 0.15468184455730213]"
9008,"[9004, 0.5122274405830288]","[33037, 0.40420383857958814]","[9011, 0.36707442778749066]","[17407, 0.35376842243076123]","[9018, 0.35182061629607597]","[9098, 0.3464159686356937]","[9045, 0.3461904367377711]","[33031, 0.3441738821188626]","[9005, 0.3420256605247796]","[9064, 0.34175097043130215]"
32847,"[32846, 0.8551423322970173]","[7035, 0.47972423586536644]","[7086, 0.46971844215945846]","[6980, 0.46373064326102514]","[7017, 0.46362807486332214]","[19390, 0.4607687751089281]","[19375, 0.460080621617711]","[6974, 0.45951762301714283]","[19382, 0.45810861837129235]","[6977, 0.45316264488310337]"
13856,"[21513, 0.3480367039522531]","[4134, 0.33309619583649946]","[6491, 0.31260049713129345]","[4851, 0.31247903461471976]","[9520, 0.29941393847889713]","[8813, 0.2958359293688111]","[10282, 0.2935366295258927]","[12376, 0.29226806485317436]","[11930, 0.2914800043784019]","[13201, 0.2897033465639696]"
18732,"[18722, 0.4554976874767943]","[34527, 0.42096436605275983]","[34295, 0.3973296482392984]","[34521, 0.39487003463827286]","[4478, 0.39245923280639516]","[34297, 0.38878224141035606]","[4346, 0.38758392085708215]","[24706, 0.3826056606863259]","[739, 0.3812160504011599]","[34314, 0.37311532180081447]"
7719,"[7771, 0.6928682794762017]","[19692, 0.6441729182501121]","[19669, 0.5575402790721555]","[32893, 0.5300184873899625]","[19694, 0.4737045688335866]","[7733, 0.44998808656901274]","[34319, 0.4456154623896174]","[7758, 0.44543937140590484]","[29303, 0.44100204212811844]","[29306, 0.43850820541229196]"
9797,"[25175, 0.33713114014245027]","[25169, 0.3291603321478832]","[25177, 0.3245161038619608]","[25174, 0.3220402493609471]","[9798, 0.3190776894344768]","[9853, 0.31744055351477807]","[25163, 0.311862764952367]","[17502, 0.3101844778511574]","[25168, 0.30423819912521893]","[9869, 0.3037115795580636]"
3866,"[20720, 0.4761706754356434]","[18096, 0.4597195837196465]","[18097, 0.44350283022139225]","[8705, 0.38917010967286564]","[7478, 0.3773605302515406]","[17819, 0.3592525187653775]","[10885, 0.3522126009760382]","[18676, 0.33897679720172597]","[717, 0.30676816437966153]","[8313, 0.28931677571683256]"
4948,"[4981, 0.5577362848276128]","[4944, 0.5479257338775796]","[18869, 0.5398801620871491]","[4847, 0.5381642563952086]","[5002, 0.5330969199825556]","[4893, 0.528909395976042]","[18857, 0.5235814248231784]","[5032, 0.522102456249883]","[5029, 0.5214020317873379]","[4864, 0.5160719100051497]"
31532,"[5949, 0.49527548524900766]","[5864, 0.4622274547380769]","[5915, 0.4449189407699913]","[5923, 0.442325995196076]","[31531, 0.431939880581105]","[19106, 0.4276563220369206]","[5854, 0.42712418685233755]","[19091, 0.42665424194272517]","[5976, 0.42579697539360695]","[19065, 0.42205398693849816]"


In [41]:
# saving similarity top-k dataframe

df_top_k.to_parquet("../data/movie_top_k_t.parquet")

## Testing

In [203]:
pd.set_option('display.max_colwidth', 50)

In [204]:
query = 'titanic'

df[df["Title"].str.contains("titanic")]

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,doc,doc_clean
6008,1953,titanic,american,directorjeannegulesco,actorbarbarastanwyck actorcliftonwebb actorrob...,biography,https://en.wikipedia.org/wiki/titanic_(1953_film),"at the last minute, a wealthy american expatri...",1953 titanic american directorjeannegulesco ac...,1953 titan american directorjeannegulesco acto...
9758,1980,raise the titanic,american,directorjerryjameson,actorjasonrobards actoralecguinness actorannea...,action,https://en.wikipedia.org/wiki/raise_the_titani...,the film opens on the fictional island of svar...,1980 raise the titanic american directorjerryj...,1980 rais titan american directorjerryjameson ...
12857,1996,titanic,american,directorrobertlieberman,actorgeorgec.scott actorevamariesaint actorpet...,biography,https://en.wikipedia.org/wiki/titanic_(1996_tv...,titanic follows three main story threads.\r\ni...,1996 titanic american directorrobertlieberman ...,1996 titan american directorrobertlieberman ac...
13153,1997,titanic,american,directorjamescameron,actorleonardodicaprio actorkatewinslet actorbi...,"historical epic, disaster",https://en.wikipedia.org/wiki/titanic_(1997_film),"in 1996, treasure hunter brock lovett and his ...",1997 titanic american directorjamescameron act...,1997 titan american directorjamescameron actor...
16392,2012,titanic 3d,american,directorjamescameron,actorleonardodicaprio actorkatewinslet actorbi...,drama,https://en.wikipedia.org/wiki/titanic_(1997_film),"in 1996, treasure hunter brock lovett and his ...",2012 titanic 3d american directorjamescameron ...,2012 titan american directorjamescameron actor...


In [207]:
movie_index = 13153 # Titanic Movie

df_query = df_cos_sim[[movie_index]].sort_values(by=[movie_index]).tail(5)
df_query.shape

(34886, 1)

In [217]:
df_query

Unnamed: 0,13153
6275,0.398865
16004,0.421042
14599,0.472291
16392,0.992128
13153,1.0


In [223]:
df[df.index == 6275]

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,doc,doc_clean
6275,1955,east of eden,american,directoreliakazan,actorjulieharris actorjamesdean actorraymondma...,drama,https://en.wikipedia.org/wiki/east_of_eden_(film),"the story is set during 1917 and 1918, leading...",1955 east of eden american directoreliakazan a...,1955 east eden american directoreliakazan acto...
