In [1]:
import json
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="talk")

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_distances

In [19]:
df = pd.read_csv("dataset/IMDb movies.csv", low_memory=False)
df.loc[df["year"] == "TV Movie 2019", "year"] = "2019"
df["year"] = df["year"].astype("int")

In [20]:
df.head(3)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          85855 non-null  object 
 1   title                  85855 non-null  object 
 2   original_title         85855 non-null  object 
 3   year                   85855 non-null  int32  
 4   date_published         85855 non-null  object 
 5   genre                  85855 non-null  object 
 6   duration               85855 non-null  int64  
 7   country                85791 non-null  object 
 8   language               85022 non-null  object 
 9   director               85768 non-null  object 
 10  writer                 84283 non-null  object 
 11  production_company     81400 non-null  object 
 12  actors                 85786 non-null  object 
 13  description            83740 non-null  object 
 14  avg_vote               85855 non-null  float64
 15  vo

In [5]:
df.dropna(subset=["year"], inplace=True)

In [6]:
df["year"].unique()

array([1894, 1906, 1911, 1912, 1919, 1913, 1914, 1915, 1916, 1917, 1918,
       1920, 1921, 1924, 1922, 1923, 1925, 1926, 1935, 1927, 1928, 1983,
       1929, 1930, 1932, 1931, 1937, 1938, 1933, 1934, 1936, 1940, 1939,
       1942, 1943, 1941, 1948, 1944, 2001, 1946, 1945, 1947, 1973, 1949,
       1950, 1952, 1951, 1962, 1953, 1954, 1955, 1961, 1956, 1958, 1957,
       1959, 1960, 1963, 1965, 1971, 1964, 1966, 1968, 1967, 1969, 1976,
       1970, 1979, 1972, 1981, 1978, 2000, 1989, 1975, 1974, 1986, 1990,
       2018, 1977, 1982, 1980, 1993, 1984, 1985, 1988, 1987, 2005, 1991,
       2002, 1994, 1992, 1995, 2017, 1997, 1996, 2006, 1999, 1998, 2007,
       2008, 2003, 2004, 2010, 2009, 2011, 2013, 2012, 2016, 2015, 2014,
       2019, 2020])

In [7]:
rekomen = df[(df["year"] > 2019) & (df["avg_vote"] > 9)].sort_values(by="avg_vote", ascending=False).head(10).to_json(orient="records")

In [8]:
data = json.loads(rekomen)

In [9]:
data[0]

{'imdb_title_id': 'tt11207902',
 'title': 'Lejos de Casa pelicula Venezolana',
 'original_title': 'Lejos de Casa pelicula Venezolana',
 'year': 2020,
 'date_published': '2020-03-19',
 'genre': 'Drama, History',
 'duration': 87,
 'country': 'Venezuela',
 'language': 'Spanish',
 'director': 'Abner Official',
 'writer': 'Dariana Jozh, Abner Official',
 'production_company': None,
 'actors': 'Angibell, Gabriel Buitrago, Darwing, Dariana Jozh, Marko, Abner Official, Yroski',
 'description': 'Samuel, a young Venezuelan, emigrates from his country in search of a better life and to help his family, without knowing the difficult situations that he will face as an immigrant.',
 'avg_vote': 9.8,
 'votes': 133,
 'budget': '$ 5000',
 'usa_gross_income': None,
 'worlwide_gross_income': None,
 'metascore': None,
 'reviews_from_users': None,
 'reviews_from_critics': None}

# Content Based -> Description

In [10]:
df.dropna(subset=["description"], inplace=True)

In [11]:
df.head(30)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0
5,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,484,,,,,13.0,5.0
6,tt0002423,Madame DuBarry,Madame DuBarry,1919,1919-11-26,"Biography, Drama, Romance",85,Germany,German,Ernst Lubitsch,...,"Pola Negri, Emil Jannings, Harry Liedtke, Edua...","The story of Madame DuBarry, the mistress of L...",6.8,753,,,,,12.0,9.0
7,tt0002445,Quo Vadis?,Quo Vadis?,1913,1913-03-01,"Drama, History",120,Italy,Italian,Enrico Guazzoni,...,"Amleto Novelli, Gustavo Serena, Carlo Cattaneo...","An epic Italian film ""Quo Vadis"" influenced ma...",6.2,273,ITL 45000,,,,7.0,5.0
8,tt0002452,Independenta Romaniei,Independenta Romaniei,1912,1912-09-01,"History, War",120,Romania,,"Aristide Demetriade, Grigore Brezeanu",...,"Aristide Demetriade, Constanta Demetriade, Con...",The movie depicts the Romanian War of Independ...,6.7,198,ROL 400000,,,,4.0,1.0
9,tt0002461,Richard III,Richard III,1912,1912-10-15,Drama,55,"France, USA",English,"André Calmettes, James Keane",...,"Robert Gemp, Frederick Warde, Albert Gardner, ...",Richard of Gloucester uses manipulation and mu...,5.5,225,$ 30000,,,,8.0,1.0


In [12]:
bow = CountVectorizer(stop_words="english", tokenizer=word_tokenize)
bank = bow.fit_transform(df["description"])

## Step 1: Encode what use watch

In [13]:
user_imdb_title_id = "tt0003772"

In [14]:
konten = df.loc[df["imdb_title_id"] == user_imdb_title_id, "description"]

In [15]:
code = bow.transform(konten)

In [30]:
konten

25    Though mistreated by her cruel stepmother and ...
Name: description, dtype: object

In [31]:
df.reindex(columns=["description"], index=df[df["imdb_title_id"] == "tt0042229"].index)["description"]

3    The fabled queen of Egypt's affair with Roman ...
Name: description, dtype: object

In [40]:
konten = df.reindex(columns=["description"], index=df[df["imdb_title_id"] == "tt0342556"].index)["description"]
code = bow.transform(konten)
distance = cosine_distances(code, bank)
rec_idx = distance.argsort()[0, 1:11]
hasil_rekomendasi = df.loc[rec_idx].to_json(orient="records")
hasil_rekomendasi

'[{"imdb_title_id":"tt2340888","title":"Survivors","original_title":"Survivors","year":2015,"date_published":"2015-10-18","genre":"Drama, Horror, Thriller","duration":92,"country":"UK","language":"English","director":"Adam Spinks","writer":"Adam Spinks, Laurence Timms","production_company":"Initiative Motion Pictures","actors":"David Anderson, Adrian Annis, Simon Burbage, Lucy Chappell, Ali Currey, Steve Davis, Nathaniel Francis, Joanne Gale, David Anthony Green, Alan Harman, Lauren Hutchings, Luke Kaile, Lydia Kay, Rich Keeble, Antony D. Lane","description":"The outbreak of a deadly virus sends the UK into a state of emergency, into a war it appears destined to lose. In a world without laws, without order and without anybody watching... how far would you go to survive?","avg_vote":4.5,"votes":109,"budget":"GBP 10000","usa_gross_income":null,"worlwide_gross_income":null,"metascore":null,"reviews_from_users":3.0,"reviews_from_critics":22.0},{"imdb_title_id":"tt0098391","title":"Stradiva

In [37]:
df.reindex(index=rec_idx).to_json(orient="records")

'[{"imdb_title_id":"tt0083367","title":"Il cuore del tiranno","original_title":"A zsarnok sz\\u00edve, avagy Boccaccio Magyarorsz\\u00e1gon","year":1981,"date_published":"1981-10-26","genre":"Drama, History","duration":96,"country":"Hungary, Italy","language":"Hungarian","director":"Mikl\\u00f3s Jancs\\u00f3","writer":"Giovanna Gagliardo, Gyula Hern\\u00e1di","production_company":"Mafilm","actors":"Teresa Ann Savoy, L\\u00e1szl\\u00f3 G\\u00e1lffi, J\\u00f3zsef Madaras, Ninetto Davoli, L\\u00e1szl\\u00f3 M\\u00e1rkus, G\\u00e9za D. Heged\\u00fcs, Gy\\u00f6rgy Cserhalmi, Mikl\\u00f3s K\\u00f6ll\\u00f5, G\\u00e1bor D\\u00f3ra, Mih\\u00e1ly Csik\\u00f3s, Katalin D\\u00f5ry, \\u00c9va Preisinger, Ir\\u00e9n Kisp\\u00e1l, Katalin Preisinger-Keller, G\\u00e9za Melczer-Luk\\u00e1cs","description":"A historical drama set in the 1400s, about a young man sent to Italy but is forced back after his father\'s mysterious death.","avg_vote":6.8,"votes":118,"budget":null,"usa_gross_income":null,"worlw

## Step 2: Mencari Similarity

In [16]:
distance = cosine_distances(code, bank)
distance

array([[0.88047714, 0.9194177 , 0.8511125 , ..., 0.83281654, 0.79955407,
        0.72996914]])

In [17]:
rec_idx = distance.argsort()[0, 1:11]
rec_idx

array([ 6399, 11919, 17727, 10042, 19686, 78079, 37423,  1974, 56526,
       55308], dtype=int64)

## Step 3: Rekomen

In [18]:
df.loc[rec_idx].to_json(orient="records")

'[{"imdb_title_id":"tt0042229","title":"Il barone dell\'Arizona","original_title":"The Baron of Arizona","year":1950,"date_published":"1951-01-24","genre":"Biography, Crime, Drama","duration":97,"country":"USA","language":"English","director":"Samuel Fuller","writer":"Samuel Fuller","production_company":"Deputy Corporation","actors":"Vincent Price, Ellen Drew, Vladimir Sokoloff, Beulah Bondi, Reed Hadley, Robert Barrat, Robin Short, Tina Pine, Karen Kester, Margia Dean, Jonathan Hale, Edward Keane, Barbara Wooddell, I. Stanford Jolley, Fred Kohler Jr.","description":"Master swindler James Reavis painstakingly spends years forging documents and land grants that will make his wife and him undisputed owners of the entire state of Arizona.","avg_vote":7.0,"votes":1877,"budget":"$ 135000","usa_gross_income":null,"worlwide_gross_income":null,"metascore":null,"reviews_from_users":35.0,"reviews_from_critics":29.0},{"imdb_title_id":"tt0058312","title":"Maciste, gladiatore di Sparta","original_t