# CIS 3920 GROUP 11 RECOMMENDER SYSTEM PROJECT
Sahiti Kovvuri, Kamilla Sharipova, Samantha Soto, Weiguang Wu

# importing all libraries

In [1]:
import pandas as pd 
import numpy as np
import ast 
from ast import literal_eval # converts strings into tuples for further data processing
import re #used to tokenize & lowercased data

import nltk
from nltk.corpus import stopwords #used for Porter Stemmer and NLTK processing 
nltk.download('stopwords') #NLTK (Natural Language Toolkit) includes a list of 40 stop words defined as 'stopwords' 

import seaborn as sns
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer #used to call Tfid vectorizer functions
from sklearn.feature_extraction.text import CountVectorizer #used to call count vectorizer functions
from sklearn.metrics.pairwise import cosine_similarity #used to call cosine similarity function
import string 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sahitikovvuri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
! ls -l 'tmdb_5000_movies.csv' 'tmdb_5000_credits.csv'

-rw-r--r--@ 1 sahitikovvuri  staff  40044293 Nov 15 22:34 tmdb_5000_credits.csv
-rw-r--r--@ 1 sahitikovvuri  staff   5722964 Nov 15 22:34 tmdb_5000_movies.csv


In [3]:
! wc -l 'tmdb_5000_movies.csv' 'tmdb_5000_credits.csv'

    4804 tmdb_5000_movies.csv
    4804 tmdb_5000_credits.csv
    9608 total


# data preparation using csv's

In [4]:
#reading tmdb_5000 movies and credits data files
movies_df = pd.read_csv('tmdb_5000_movies.csv')
credits_df = pd.read_csv('tmdb_5000_credits.csv')

In [5]:
#merging both data sets
credits_df.columns = ['id','title','cast','crew']
movies_df= movies_df.merge(credits_df,on='id')

In [6]:
print(movies_df)

         budget                                             genres  \
0     237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1     300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2     245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3     250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4     260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
...         ...                                                ...   
4798     220000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4799       9000  [{"id": 35, "name": "Comedy"}, {"id": 10749, "...   
4800          0  [{"id": 35, "name": "Comedy"}, {"id": 18, "nam...   
4801          0                                                 []   
4802          0                [{"id": 99, "name": "Documentary"}]   

                                               homepage      id  \
0                           http://www.avatarmovie.com/   19995   
1          http://disney.

# cleaning-up data

In [7]:
#removed newline (\n) characters in 'overview'
movies_df['overview'] = movies_df['overview'].str.replace(r'\n', '', regex=True)
movies_df.sample(10)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
32,200000000,"[{""id"": 10751, ""name"": ""Family""}, {""id"": 14, ""...",http://disney.go.com/wonderland/,12155,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,Alice in Wonderland,"Alice, an unpretentious and individual 19-year...",78.530105,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,108.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,You're invited to a very important date.,Alice in Wonderland,6.4,4645,Alice in Wonderland,"[{""cast_id"": 7, ""character"": ""Alice Kingsleigh...","[{""credit_id"": ""52fe44c09251416c7503fbc3"", ""de..."
1150,40000000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",http://www.myspace.com/proposalmovie,18240,"[{""id"": 1907, ""name"": ""fictitious marriage""}, ...",en,The Proposal,When she learns she's in danger of losing her ...,36.238968,"[{""name"": ""Touchstone Pictures"", ""id"": 9195}, ...",...,108.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Here comes the bribe...,The Proposal,6.7,1804,The Proposal,"[{""cast_id"": 3, ""character"": ""Margaret Tate"", ...","[{""credit_id"": ""5635d9db925141284c019df7"", ""de..."
3942,3000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 35, ""nam...",http://www.hobowithashotgun.com/,49010,"[{""id"": 293, ""name"": ""female nudity""}, {""id"": ...",en,Hobo with a Shotgun,A vigilante homeless man pulls into a new city...,16.900433,"[{""name"": ""Rhombus Media"", ""id"": 164}, {""name""...",...,86.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Delivering justice, one shell at a time...",Hobo with a Shotgun,5.7,211,Hobo with a Shotgun,"[{""cast_id"": 2, ""character"": ""Hobo"", ""credit_i...","[{""credit_id"": ""534275200e0a2679a1003b31"", ""de..."
1893,25000000,"[{""id"": 14, ""name"": ""Fantasy""}, {""id"": 27, ""na...",http://www.dontbeafraidofthedark.com/,46261,"[{""id"": 1299, ""name"": ""monster""}, {""id"": 9714,...",en,Don't Be Afraid of the Dark,A young girl sent to live with her father and ...,14.562197,"[{""name"": ""Miramax Films"", ""id"": 14}, {""name"":...",...,99.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Fear is never just make believe,Don't Be Afraid of the Dark,5.4,341,Don't Be Afraid of the Dark,"[{""cast_id"": 10, ""character"": ""Kim"", ""credit_i...","[{""credit_id"": ""52fe46edc3a36847f811a321"", ""de..."
2704,0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 18, ""nam...",,1123,"[{""id"": 796, ""name"": ""police brutality""}, {""id...",en,Catch a Fire,The true story of anti-apartheid activists in ...,4.052219,"[{""name"": ""Mirage Enterprises"", ""id"": 932}, {""...",...,101.0,"[{""iso_639_1"": ""af"", ""name"": ""Afrikaans""}, {""i...",Released,"The spark that ignites us, unites us.",Catch a Fire,6.4,25,Catch a Fire,"[{""cast_id"": 14, ""character"": ""Nic Vos"", ""cred...","[{""credit_id"": ""52fe42e8c3a36847f802be69"", ""de..."
4534,500000,"[{""id"": 99, ""name"": ""Documentary""}]",http://www.rickyjaymovie.com/,180383,"[{""id"": 2343, ""name"": ""magic""}, {""id"": 187056,...",en,Deceptive Practice: The Mysteries and Mentors ...,The life and career of renowned magician and s...,0.425195,"[{""name"": ""Kino Lorber"", ""id"": 39134}]",...,88.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,Deceptive Practice: The Mysteries and Mentors ...,7.1,17,Deceptive Practice: The Mysteries and Mentors ...,"[{""cast_id"": 2, ""character"": ""Himself"", ""credi...","[{""credit_id"": ""547117cf92514123e30008b4"", ""de..."
3890,0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",,10162,"[{""id"": 3864, ""name"": ""false identity""}, {""id""...",en,Waking Ned,"When a lottery winner dies of shock, his fello...",5.735912,"[{""name"": ""Fox Searchlight Pictures"", ""id"": 43...",...,91.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,How far would you go to win a fortune!,Waking Ned,7.4,54,Waking Ned,"[{""cast_id"": 9, ""character"": ""Jackie O'Shea"", ...","[{""credit_id"": ""52fe43399251416c750082b9"", ""de..."
1275,50000000,"[{""id"": 878, ""name"": ""Science Fiction""}, {""id""...",,1272,"[{""id"": 83, ""name"": ""saving the world""}, {""id""...",en,Sunshine,"Fifty years into the future, the sun is dying,...",51.502884,"[{""name"": ""DNA Films"", ""id"": 284}, {""name"": ""I...",...,107.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"If the sun dies, so do we.",Sunshine,7.0,1182,Sunshine,"[{""cast_id"": 5, ""character"": ""Robert Capa"", ""c...","[{""credit_id"": ""5378a9080e0a261425004f95"", ""de..."
2809,7000000,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 53, ""nam...",http://www.jaws25.com/,578,"[{""id"": 1495, ""name"": ""fishing""}, {""id"": 1913,...",en,Jaws,An insatiable great white shark terrorizes the...,50.152327,"[{""name"": ""Universal Pictures"", ""id"": 33}, {""n...",...,124.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Don't go in the water.,Jaws,7.5,2542,Jaws,"[{""cast_id"": 15, ""character"": ""Police Chief Ma...","[{""credit_id"": ""52fe4255c3a36847f801603d"", ""de..."
586,70000000,"[{""id"": 10752, ""name"": ""War""}, {""id"": 18, ""nam...",http://www.monumentsmen.com/,152760,"[{""id"": 1956, ""name"": ""world war ii""}, {""id"": ...",en,The Monuments Men,Based on the true story of the greatest treasu...,43.873266,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,118.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,It was the greatest art heist in history,The Monuments Men,5.8,1523,The Monuments Men,"[{""cast_id"": 7, ""character"": ""James Granger"", ...","[{""credit_id"": ""53630586c3a3681568000c62"", ""de..."


# handling missing data (NaN/null values)

In [8]:
#selects all rows with NaN in the overview column (handling missing data) 
movies_df[movies_df['overview'].isna()]

#saves the list into a dataframe called null_movies
null_movies = movies_df[movies_df['overview'].isna()] 

#displays the movie titles only in Null_movies
null_movies[['original_title','overview']] 

#handles missing overview data

#null value imputation named null
null = null_movies.loc[:, ['original_title', 'overview']]

#select the column for which the description will be imputed into named col_list
col_list =  list(null["original_title"])
descrip = ["Set within a year after the events of Batman Begins, Batman, Lieutenant James Gordon, and new District Attorney Harvey Dent successfully begin to round up the criminals that plague Gotham City, until a mysterious and sadistic criminal mastermind known only as 'The Joker' appears in Gotham, creating a new wave of chaos. Batman's struggle against The Joker becomes deeply personal, forcing him to confront everything he believes and improve his technology to stop him. A love triangle develops between Bruce Wayne, Dent, and Rachel Dawes.",
          "Growing up can be a bumpy road, and it's no exception for Riley, who is uprooted from her Midwest life when her father starts a new job in San Francisco. Like all of us, Riley is guided by her emotions - Joy, Fear, Anger, Disgust and Sadness. The emotions live in Headquarters, the control center inside Riley's mind, where they help advise her through everyday life. As Riley and her emotions struggle to adjust to a new life in San Francisco, turmoil ensues in Headquarters. Although Joy, Riley's main and most important emotion, tries to keep things positive, the emotions conflict on how best to navigate a new city, house and school.",
          "After stealing a mysterious orb in the far reaches of outer space, Peter Quill from Earth is now the main target of a manhunt led by the villain known as Ronan the Accuser. To help fight Ronan and his team and save the galaxy from his power, Quill creates a team of space heroes known as the Guardians of the Galaxy to save the galaxy.",
          "In Earth's future, a global crop blight and second Dust Bowl are slowly rendering the planet uninhabitable. Professor Brand (Michael Caine), a brilliant NASA physicist, is working on plans to save mankind by transporting Earth's population to a new home via a wormhole. But first, Brand must send former NASA pilot Cooper (Matthew McConaughey) and a team of researchers through the wormhole and across the galaxy to find out which of three planets could be mankind's new home.",
          "Dom Cobb (Leonardo DiCaprio) is a thief with the rare ability to enter people's dreams and steal their secrets from their subconscious. His skill has made him a hot commodity in the world of corporate espionage but has also cost him everything he loves. Cobb gets a chance at redemption when he is offered a seemingly impossible task: Plant an idea in someone's mind. If he succeeds, it will be the perfect crime, but a dangerous enemy anticipates Cobb's every move",
          "The future of civilization rests in the fate of the One Ring, which has been lost for centuries. Powerful forces are unrelenting in their search for it. But fate has placed it in the hands of a young Hobbit named Frodo Baggins (Elijah Wood), who inherits the Ring and steps into legend. A daunting task lies ahead for Frodo when he becomes the Ringbearer - to destroy the One Ring in the fires of Mount Doom where it was forged.", 
           "Two years before the Civil War, Django (Jamie Foxx), a slave, finds himself accompanying an unorthodox German bounty hunter named Dr. King Schultz (Christoph Waltz) on a mission to capture the vicious Brittle brothers. Their mission successful, Schultz frees Django, and together they hunt the South's most-wanted criminals. Their travels take them to the infamous plantation of shady Calvin Candie (Leonardo DiCaprio), where Django's long-lost wife (Kerry Washington) is still a slave.",
          "In 1987, Jordan Belfort (Leonardo DiCaprio) takes an entry-level job at a Wall Street brokerage firm. By the early 1990s, while still in his 20s, Belfort founds his own firm, Stratton Oakmont. Together with his trusted lieutenant (Jonah Hill) and a merry band of brokers, Belfort makes a huge fortune by defrauding wealthy investors out of millions. However, while Belfort and his cronies partake in a hedonistic brew of sex, drugs and thrills, the SEC and the FBI close in on his empire of excess.",
          "The culmination of nearly 10 years' work and conclusion to Peter Jackson's epic trilogy based on the timeless J.R.R. Tolkien classic, The Lord of the Rings: The Return of the King presents the final confrontation between the forces of good and evil fighting for control of the future of Middle-earth. Hobbits Frodo and Sam reach Mordor in their quest to destroy the `one ring', while Aragorn leads the forces of good against Sauron's evil army at the stone city of Minas Tirith.",
          "The sequel to the Golden Globe-nominated and AFI Award-winning The Lord of the Rings: The Fellowship of the Ring, The Two Towers follows the continuing quest of Frodo (Elijah Wood) and the Fellowship to destroy the One Ring. Frodo and Sam (Sean Astin) discover they are being followed by the mysterious Gollum. Aragorn (Viggo Mortensen), the Elf archer Legolas and Gimli the Dwarf encounter the besieged Rohan kingdom, whose once great King Theoden has fallen under Saruman's deadly spell",
          "This Disney animated feature follows the adventures of the young lion Simba (Jonathan Taylor Thomas), the heir of his father, Mufasa (James Earl Jones). Simba's wicked uncle, Scar (Jeremy Irons), plots to usurp Mufasa's throne by luring father and son into a stampede of wildebeests. But Simba escapes, and only Mufasa is killed. Simba returns as an adult (Matthew Broderick) to take back his homeland from Scar with the help of his friends Timon (Nathan Lane) and Pumbaa (Ernie Sabella).",
          "Neo (Keanu Reeves) believes that Morpheus (Laurence Fishburne), an elusive figure considered to be the most dangerous man alive, can answer his question -- What is the Matrix? Neo is contacted by Trinity (Carrie-Anne Moss), a beautiful stranger who leads him into an underworld where he meets Morpheus. They fight a brutal battle for their lives against a cadre of viciously intelligent secret agents. It is a truth that could cost Neo something more precious than his life.",
          "A depressed man (Edward Norton) suffering from insomnia meets a strange soap salesman named Tyler Durden (Brad Pitt) and soon finds himself living in his squalid house after his perfect apartment is destroyed. The two bored men form an underground club with strict rules and fight other men who are fed up with their mundane lives. Their perfect partnership frays when Marla (Helena Bonham Carter), a fellow support group crasher, attracts Tyler's attention.",
          "Paul Edgecomb (Tom Hanks) walked the mile with a variety of cons. He had never encountered someone like John Coffey (Michael Clarke Duncan), a massive black man convicted of brutally killing a pair of young sisters. Coffey had the size and strength to kill anyone, but not the demeanor. Beyond his simple, naive nature and a deathly fear of the dark, Coffey seemed to possess a prodigious, supernatural gift. Paul began to question whether Coffey was truly guilty of murdering the two girls.",
          "Slow-witted Forrest Gump (Tom Hanks) has never thought of himself as disadvantaged, and thanks to his supportive mother (Sally Field), he leads anything but a restricted life. Whether dominating on the gridiron as a college football star, fighting in Vietnam or captaining a shrimp boat, Forrest inspires people with his childlike optimism. But one person Forrest cares about most may be the most difficult to save -- his childhood love, the sweet but troubled Jenny (Robin Wright)",
          "When retiring police Detective William Somerset (Morgan Freeman) tackles a final case with the aid of newly transferred David Mills (Brad Pitt), they discover a number of elaborate and grizzly murders. They soon realize they are dealing with a serial killer (Kevin Spacey) who is targeting people he thinks represent one of the seven deadly sins. Somerset also befriends Mills' wife, Tracy (Gwyneth Paltrow), who is pregnant and afraid to raise her child in the crime-riddled city.",
          "Businessman Oskar Schindler (Liam Neeson) arrives in Krakow in 1939, ready to make his fortune from World War II, which has just started. After joining the Nazi party primarily for political expediency, he staffs his factory with Jewish workers for similarly pragmatic reasons. When the SS begins exterminating Jews in the Krakow ghetto, Schindler arranges to have his workers protected to keep his factory in operation, but soon realizes that in so doing, he is also saving innocent lives.",
          "Andy Dufresne (Tim Robbins) is sentenced to two consecutive life terms in prison for the murders of his wife and her lover and is sentenced to a tough prison. However, only Andy knows he didn't commit the crimes. While there, he forms a friendship with Red (Morgan Freeman), experiences brutality of prison life, adapts, helps the warden, etc., all in 19 years.",
          "The adventure continues in this Star Wars sequel. Luke Skywalker (Mark Hamill), Han Solo (Harrison Ford), Princess Leia (Carrie Fisher) and Chewbacca (Peter Mayhew) face attack by the Imperial forces and its AT-AT walkers on the ice planet Hoth. While Han and Leia escape in the Millennium Falcon, Luke travels to Dagobah in search of Yoda. Only with the Jedi master's help will Luke survive when the dark side of the Force beckons him into the ultimate duel with Darth Vader (David Prowse)",
          "Jodie Foster stars as Clarice Starling, a top student at the FBI's training academy. Jack Crawford (Scott Glenn) wants Clarice to interview Dr. Hannibal Lecter (Anthony Hopkins), a brilliant psychiatrist who is also a violent psychopath, serving life behind bars for various acts of murder and cannibalism. Crawford believes that Lecter may have insight into a case and that Starling, as an attractive young woman, may be just the bait to draw him out.",
          "In this 1980s sci-fi classic, small-town California teen Marty McFly (Michael J. Fox) is thrown back into the '50s when an experiment by his eccentric scientist friend Doc Brown (Christopher Lloyd) goes awry. Traveling through time in a modified DeLorean car, Marty encounters young versions of his parents (Crispin Glover, Lea Thompson), and must make sure that they fall in love or he'll cease to exist. Even more dauntingly, Marty has to return to his own time and save the life of Doc Brown.",
          "In this animated feature by noted Japanese director Hayao Miyazaki, 10-year-old Chihiro (Rumi Hiiragi) and her parents (Takashi Naitô, Yasuko Sawaguchi) stumble upon a seemingly abandoned amusement park. After her mother and father are turned into giant pigs, Chihiro meets the mysterious Haku (Miyu Irino), who explains that the park is a resort for supernatural beings who need a break from their time spent in the earthly realm, and that she must work there to free herself and her parents.",
          "In 1939, newly created British intelligence agency MI6 recruits Cambridge mathematics alumnus Alan Turing (Benedict Cumberbatch) to crack Nazi codes, including Enigma -- which cryptanalysts had thought unbreakable. Turing's team, including Joan Clarke (Keira Knightley), analyze Enigma messages while he builds a machine to decipher them. Turing and team finally succeed and become heroes, but in 1952, the quiet genius encounters disgrace when authorities reveal he is gay and send him to prison.",
          "Following the rise of father Jorge Mario Bergoglio from his early life as a teacher in a Jesuit High School in Argentina, to archbishop and cardinal of Buenos Aires, until he was elected Pope of the Roman Catholic Church.",
          "The compelling sequel to The Godfather, contrasting the life of Corleone father and son. Traces the problems of Michael Corleone (Al Pacino) in 1958 and that of a young immigrant Vito Corleone (Robert De Niro) in 1917's Hell's Kitchen. Michael survives many misfortunes and Vito is introduced to a life of crime.",
          "The Imperial Forces -- under orders from cruel Darth Vader (David Prowse) -- hold Princess Leia (Carrie Fisher) hostage, in their efforts to quell the rebellion against the Galactic Empire. Luke Skywalker (Mark Hamill) and Han Solo (Harrison Ford), captain of the Millennium Falcon, work together with the companionable droid duo R2-D2 (Kenny Baker) and C-3PO (Anthony Daniels) to rescue the beautiful princess, help the Rebel Alliance, and restore freedom and justice to the Galaxy.",
          "Vincent Vega (John Travolta) and Jules Winnfield (Samuel L. Jackson) are hitmen with a penchant for philosophical discussions. In this ultra-hip, multi-strand crime movie, their storyline is interwoven with those of their boss, gangster Marsellus Wallace (Ving Rhames) ; his actress wife, Mia (Uma Thurman) ; struggling boxer Butch Coolidge (Bruce Willis) ; master fixer Winston Wolfe (Harvey Keitel) and a nervous pair of armed robbers, Pumpkin (Tim Roth) and Honey Bunny (Amanda Plummer).",
          "Widely regarded as one of the greatest films of all time, this mob drama, based on Mario Puzo's novel of the same name, focuses on the powerful Italian-American crime family of Don Vito Corleone (Marlon Brando). When the don's youngest son, Michael (Al Pacino), reluctantly joins the Mafia, he becomes involved in the inevitable cycle of violence and betrayal. Although Michael tries to maintain a normal relationship with his wife, Kay (Diane Keaton), he is drawn deeper into the family business",
          "Andrew Neiman (Miles Teller) is an ambitious young jazz drummer, in pursuit of rising to the top of his elite music conservatory. Terence Fletcher (J.K. Simmons), an instructor known for his terrifying teaching methods, discovers Andrew and transfers the aspiring drummer into the top jazz ensemble, forever changing the young man's life. But Andrew's passion to achieve perfection quickly spirals into obsession, as his ruthless teacher pushes him to the brink of his ability and his sanity",
          "The life of Frank Sinatra, as an actor and singer and the steps along the way that led him to become such an icon.",
          "To protest their working conditions and poor wages, farmworkers in Immokalee, Florida, start a hunger strike outside the headquarters of Publix supermarkets."]
#saves missing overviews in a dictionary 
overviews = pd.DataFrame({'original_title': col_list, 
                          'overview': descrip})
#modifies original dataframe inputting the missing overviews
movies_df['overview']=movies_df['overview'].apply(lambda x: x if x == x else overviews)


In [9]:
#displays data after inputting missing overviews
movies_df.sample(5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
3817,967686,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 80, ""nam...",http://www.four-lions.co.uk/,37495,"[{""id"": 13015, ""name"": ""terrorism""}, {""id"": 19...",en,Four Lions,Four Lions tells the story of a group of Briti...,20.544999,"[{""name"": ""Film4"", ""id"": 9349}, {""name"": ""Draf...",...,101.0,"[{""iso_639_1"": ""ar"", ""name"": ""\u0627\u0644\u06...",Released,We are 4 Lions.,Four Lions,7.0,423,Four Lions,"[{""cast_id"": 5, ""character"": ""Omar"", ""credit_i...","[{""credit_id"": ""536befdfc3a368124100a6e6"", ""de..."
4456,800000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""n...",http://www.lhp.com.sg/victor/,25461,"[{""id"": 10183, ""name"": ""independent film""}]",en,Raising Victor Vargas,"The film follows Victor, a Lower East Side tee...",3.643662,[],...,88.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,,Raising Victor Vargas,7.8,13,Raising Victor Vargas,"[{""cast_id"": 3, ""character"": ""Victor Vargas"", ...","[{""credit_id"": ""58d0f315c3a36838af008913"", ""de..."
4122,13500000,"[{""id"": 18, ""name"": ""Drama""}]",https://twitter.com/Stonewall_Movie,273899,"[{""id"": 173510, ""name"": ""stonewall riot""}]",en,Stonewall,"""Stonewall"" is a drama about a young man in Ne...",5.16023,"[{""name"": ""Centropolis Entertainment"", ""id"": 3...",...,129.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Where Pride Began,Stonewall,5.2,32,Stonewall,"[{""cast_id"": 2, ""character"": ""Danny"", ""credit_...","[{""credit_id"": ""538f48c8c3a3680d7500021b"", ""de..."
2990,10000000,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 53, ""nam...",,8922,"[{""id"": 1583, ""name"": ""mass murder""}, {""id"": 4...",en,Jeepers Creepers,A college-age brother and sister get more than...,23.316208,"[{""name"": ""United Artists"", ""id"": 60}, {""name""...",...,90.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,What’s eating you?,Jeepers Creepers,6.1,720,Jeepers Creepers,"[{""cast_id"": 10, ""character"": ""Patricia 'Trish...","[{""credit_id"": ""52fe44c8c3a36847f80a9c27"", ""de..."
1910,0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 35, ""nam...",,10074,"[{""id"": 5956, ""name"": ""stuntman""}, {""id"": 6669...",en,Hot Rod,"For Rod Kimball, performing stunts is a way of...",6.309587,"[{""name"": ""Paramount Pictures"", ""id"": 4}]",...,88.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Smack destiny in the face.,Hot Rod,6.3,313,Hot Rod,"[{""cast_id"": 22, ""character"": ""Rod Kimble"", ""c...","[{""credit_id"": ""52fe431d9251416c75003f95"", ""de..."


In [10]:
#checks if there are any null values after imputing
movies_df[movies_df['overview'].isna()]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew


In [11]:
#changed ‘overview’ type to string 
movies_df['overview'] = movies_df['overview'].astype(str)

# text mining preprocessing technique (1): removing punctuation 

In [12]:
#function to remove punctuation from 'overview'
string.punctuation
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

# Remove punctuation from data
movies_df['overview'] = movies_df['overview'].apply(lambda x: remove_punct(x))
movies_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,In the 22nd century a paraplegic Marine is dis...,150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,Captain Barbossa long believed to be dead has ...,139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,John Carter is a warweary former military capt...,43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


# text mining preprocessing technique (2): tokenizing strings

In [13]:
#function to tokenize 
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

movies_df['overview'] = movies_df['overview'].apply(lambda x: tokenize(x))

#display overview with new overview column
movies_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"[In, the, 22nd, century, a, paraplegic, Marine...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"[Captain, Barbossa, long, believed, to, be, de...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,"[A, cryptic, message, from, Bond, s, past, sen...",107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...",112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"[John, Carter, is, a, warweary, former, milita...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


# text mining preprocessing technique (3): handling lowercased strings

In [14]:
#changed ‘overview’ type to string again (was getting error “AttributeError: 'list' object has no attribute 'lower'”)
movies_df['overview'] = movies_df['overview'].astype(str)

In [15]:
# adds the column with tokenized & lowercased data 
movies_df['overview'] = movies_df['overview'].apply(lambda x: tokenize(x.lower())) 

#display overview with new overview column
movies_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"[, in, the, 22nd, century, a, paraplegic, mari...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"[, captain, barbossa, long, believed, to, be, ...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,"[, a, cryptic, message, from, bond, s, past, s...",107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,"[, following, the, death, of, district, attorn...",112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"[, john, carter, is, a, warweary, former, mili...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


# text mining preprocessing technique (4): handling stopwords using NLTK 

In [16]:
#removing english stop words such as 'a', 'the', 'was', 'in', etc
stopwords = nltk.corpus.stopwords.words('english') 

#function to remove stop words called remove_stopwords 
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords]
    return text

# removes stop words from data & adds a column to the right with data 
# no stop words present in overview_clean2
movies_df['overview'] = movies_df['overview'].apply(lambda x: remove_stopwords(x))


#display overview with new overview column
movies_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"[, 22nd, century, paraplegic, marine, dispatch...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"[, captain, barbossa, long, believed, dead, co...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,"[, cryptic, message, bond, past, sends, trail,...",107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,"[, following, death, district, attorney, harve...",112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"[, john, carter, warweary, former, military, c...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


# text mining preprocessing technique (5): cut words using Porter Stemmer

In [17]:
#defining the porter stemmer
stemmer = nltk.PorterStemmer()

#create function to apply stemmer
def stemming(tokenized_text):
    text = [stemmer.stem(word) for word in tokenized_text]
    return text

#apply Porter Stemmer
movies_df['overview'] = movies_df['overview'].apply(lambda x: stemming(x))

#display overview with new overview column
movies_df.head()


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"[, 22nd, centuri, parapleg, marin, dispatch, m...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"[, captain, barbossa, long, believ, dead, come...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,"[, cryptic, messag, bond, past, send, trail, u...",107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,"[, follow, death, district, attorney, harvey, ...",112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"[, john, carter, warweari, former, militari, c...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [18]:
#changed ‘overview’ type to string again (was getting error “AttributeError: 'list' object has no attribute 'lower'”)
movies_df['overview'] = movies_df['overview'].astype(str)

# vectorization technique (1): calculate the TF-IDF score vectorizer using overview

In [19]:
# Calculate the TF-IDF score for each word in overview, word by word
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')#removing stop words like 'the' and 'a'

# overview_matrix variable where we store the matrix containing each word and 
#its TF-IDF score with regard to each word in overview
overview_matrix = tfidf.fit_transform(movies_df['overview'])
overview_matrix

<4803x16717 sparse matrix of type '<class 'numpy.float64'>'
	with 128238 stored elements in Compressed Sparse Row format>

# creating a new feature with spoken_languages, keywords, genres, cast, crew

In [20]:
features = ['spoken_languages', 'keywords', 'genres', 'cast','crew']

In [21]:
#converting string list features in tuples
for feature in features: movies_df[feature] = movies_df[feature].apply(lambda x: literal_eval(str(x)))

In [22]:
#merged the strings by defining get_list
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        if len(names) > 2:
            names = names[:2] #only take first 2  names
        return names

# similarity metric technique (1): cosine

In [23]:
cosine_sim2 = linear_kernel(overview_matrix, overview_matrix)

In [24]:
#first three features are from tmdb_5000_credits.csv, "cast" feature is from tmdb_5000_movies.csv
for feature in features:
    movies_df[feature] = movies_df[feature].apply(get_list) #turn into list

In [25]:
movies_df[['original_title','spoken_languages', 'keywords', 'genres', 'cast','crew']].sample(5)
#these are the features we are going to work with (excluding original_title)

Unnamed: 0,original_title,spoken_languages,keywords,genres,cast,crew
1274,Just Visiting,"[English, Français]","[time travel, remake]","[Comedy, Fantasy]","[Jean Reno, Christina Applegate]","[Suzanne Smith, Patrice Ledoux]"
3979,Freakonomics,[English],"[female nudity, corruption]",[Documentary],"[Zoe Sloane, Jade Viggiano]","[Heidi Ewing, Heidi Ewing]"
3254,Akeelah and the Bee,[English],"[black people, spelling]",[Drama],"[Keke Palmer, Laurence Fishburne]","[Jaki Brown, Kim Coleman]"
3222,Bright Star,"[Français, English]","[poet, independent film]","[Drama, Romance]","[Abbie Cornish, Ben Whishaw]","[David M. Thompson, François Ivernel]"
2193,Secret in Their Eyes,[English],"[fbi, missing child]","[Crime, Drama]","[Chiwetel Ejiofor, Nicole Kidman]","[Mark Johnson, Billy Ray]"


In [26]:
# reference --> O'Reilly "Hands on Reccomendation Systems"
#creating a soup out of the metadata to join the combined features in a single string
def rec_features(x):
    return ' '.join(x['spoken_languages']) + '  '.join(x['keywords']) + '  '.join(x['genres']) + '  '.join(x['cast']) + '  '.join(x['crew'])# + ' ' + ' '.join(x['overview'])
movies_df['rec_features'] = movies_df.apply(rec_features, axis=1)

In [27]:
print(movies_df)

         budget                genres  \
0     237000000   [Action, Adventure]   
1     300000000  [Adventure, Fantasy]   
2     245000000   [Action, Adventure]   
3     250000000       [Action, Crime]   
4     260000000   [Action, Adventure]   
...         ...                   ...   
4798     220000       [Action, Crime]   
4799       9000     [Comedy, Romance]   
4800          0       [Comedy, Drama]   
4801          0                    []   
4802          0         [Documentary]   

                                               homepage      id  \
0                           http://www.avatarmovie.com/   19995   
1          http://disney.go.com/disneypictures/pirates/     285   
2           http://www.sonypictures.com/movies/spectre/  206647   
3                    http://www.thedarkknightrises.com/   49026   
4                  http://movies.disney.com/john-carter   49529   
...                                                 ...     ...   
4798                                  

# vectorization technique (2): count vectorizer using features

In [28]:
countVec = CountVectorizer(analyzer='word', stop_words='english') 
#removing unnecessary stop words
#in order to extract features from our data, must use fit.transform to 
#count the number of texts and make teh transformed matrix (keyword_matrix) into an array
keyword_matrix = countVec.fit_transform(movies_df['rec_features'])

In [29]:
#measures distance between similarities of movie features chosen
cosine_sim = cosine_similarity(keyword_matrix, keyword_matrix) 

In [30]:
# Reset index of main dataframe and reconstruct data frame to mirror output results
movies_df = movies_df.reset_index()
movies = pd.Series(movies_df.index, index=movies_df['original_title'])

In [31]:
cosine_sim3 = cosine_similarity(cosine_sim, cosine_sim2)

# define recommendation function get_recommendations

In [32]:
def get_recommendations(title, cosine_sim3=cosine_sim3):
    # index of the movie with that title
    recommendation = movies[title]
    # pairwise similarity scores of all movies with that movie
    similarity = list(enumerate(cosine_sim3[recommendation]))
    # sorts movies based on similarity score
    similarity = sorted(similarity, key=lambda x: x[1], reverse=True)
    # gets scores of 5 most similar movies
    similarity = similarity[1:6]
    # gets the index of the similar movies
    movie_recommendation = [i[0] for i in similarity]
    # return top 5 most similar movies
    return movies_df['original_title'].iloc[movie_recommendation]

# allow user to input movie for recommendations > user interface (creativity)

In [None]:
movie = input('Enter a movie for recommendations(and press enter): ')

In [None]:
#display recommendations
get_recommendations(movie, cosine_sim)

# get recommendations for The Dark Knight, The Shawshank Redemption and Frozen

In [None]:
get_recommendations('The Dark Knight', cosine_sim)

In [None]:
get_recommendations('The Shawshank Redemption', cosine_sim)

In [None]:
get_recommendations('Frozen', cosine_sim)

In [None]:
#REFERENCES
#https://stackoverflow.com/questions/12453580/how-to-concatenate-join-items-in-a-list-to-a-single-string
#https://www.nltk.org/_modules/nltk/stem/wordnet.html
#25_NB_2.pynb 
#13_TM_2.pynb for preprocessing techniques