### Import libraries

In [1]:
import nmslib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys
import os
sys.path.append(os.path.join(sys.path[0], '..'))

from tools.corpus import PlotCorpus
from utils.tokenizers import lemma_tokenizer
from tools.film_card import FilmCard

import pickle
from tqdm import tqdm

import logging
# Emit records as "LEVEL: message". The root level is set separately because
# basicConfig() is a no-op when the root logger already has handlers.
logging.basicConfig(format='%(levelname)s: %(message)s')
logging.getLogger().setLevel(logging.INFO)
logging.info('Data preparing start')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO: Data preparing start


### Data preparation

In [2]:
# Load the Wikipedia movie table; the path is relative to the working directory.
df_movies_wiki = pd.read_csv('./data/dataset/wiki_movies.csv')
print(df_movies_wiki.shape)

# Build one FilmCard per row. FilmCard receives, in order: release year, title,
# origin, genre, plot. Iterating the needed columns in lockstep avoids
# DataFrame.iterrows(), which allocates a full Series for every row.
films_wiki = [
    FilmCard(release_year, title, origin, genre, plot)
    for release_year, title, origin, genre, plot in zip(
        df_movies_wiki['Release Year'],
        df_movies_wiki['Title'],
        df_movies_wiki['Origin/Ethnicity'],
        df_movies_wiki['Genre'],
        df_movies_wiki['Plot'],
    )
]

# Persist the list of cards to disk as a pickle.
with open(r'./data/dataset/films_wiki.pickle', 'wb') as f:
    pickle.dump(films_wiki, f)

logging.info('Wiki data is prepared!')


# Load the IMDB movie table. The unfiltered frame keeps its own name so the raw
# data stays available after filtering.
df_movies_imdb_raw = pd.read_csv('./data/dataset/movies.csv')
print(df_movies_imdb_raw.shape)

# Keep only films whose title also appears in the wiki table (exact string
# match) and which have a non-missing IMDB plot.
common_titles = list(set(df_movies_wiki.Title) & set(df_movies_imdb_raw.title))
df_movies_imdb = (
    df_movies_imdb_raw
    .loc[df_movies_imdb_raw.title.isin(common_titles)]
    .dropna(subset=['imdb_plot'])
)
print(df_movies_imdb.shape)

# Build one FilmCard per remaining row. Only title and plot are supplied here;
# the release year, origin and genre positions are passed as None.
# Column-wise iteration replaces the slower DataFrame.iterrows().
imdb_films = [
    FilmCard(None, title, None, None, plot)
    for title, plot in zip(df_movies_imdb['title'], df_movies_imdb['imdb_plot'])
]

# Persist the list of cards to disk as a pickle.
with open(r'./data/dataset/films_imdb.pickle', 'wb') as f:
    pickle.dump(imdb_films, f)

logging.info('IMDB data is prepared!')

(34886, 8)


INFO: Wiki data is prepared!
INFO: IMDB data is prepared!


(100, 5)
(86, 5)


In [3]:
# Sanity check: earliest and latest release year, plus the distinct origin labels.
release_years = df_movies_wiki['Release Year']
origins = df_movies_wiki['Origin/Ethnicity']
min(release_years), max(release_years), set(origins)

(1901,
 2017,
 {'American',
  'Assamese',
  'Australian',
  'Bangladeshi',
  'Bengali',
  'Bollywood',
  'British',
  'Canadian',
  'Chinese',
  'Egyptian',
  'Filipino',
  'Hong Kong',
  'Japanese',
  'Kannada',
  'Malayalam',
  'Malaysian',
  'Maldivian',
  'Marathi',
  'Punjabi',
  'Russian',
  'South_Korean',
  'Tamil',
  'Telugu',
  'Turkish'})

In [4]:
# Show the rows whose origin label is exactly 'Russian'.
is_russian = df_movies_wiki['Origin/Ethnicity'] == 'Russian'
df_movies_wiki.loc[is_russian]

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
34062,2000,Brother 2,Russian,Aleksei Balabanov,"Sergei Bodrov, Jr., Viktor Sukhorukov, Sergei ...",crime,https://en.wikipedia.org/wiki/Brother_2,The film opens with Danila Bagrov (Sergei Bodr...
34063,2000,Demobbed,Russian,Roman Kachanov,"Juozas Budraitis, Viktor Pavlov, Aleksandr Ded...",comedy,https://en.wikipedia.org/wiki/Demobbed_(film),Three young Russians from very different walks...
34064,2000,His Wife's Diary,Russian,Alexei Uchitel,Andrei Smirnov,historical drama,https://en.wikipedia.org/wiki/His_Wife%27s_Diary,A tragic story of love and loneliness - this i...
34065,2000,House for the Rich,Russian,Vladimir Fokin,"Valentin Gaft, Vladimir Eremin, Konstantin Kha...",drama,https://en.wikipedia.org/wiki/House_for_the_Rich,The film is set in the apartment of an old Mos...
34066,2000,The New Bremen Town Musicians,Russian,Alexander Gorlenko,"Sergei Penkin, Grigoriy Mamikonov, Gennady Gla...",animation,https://en.wikipedia.org/wiki/The_New_Bremen_T...,The plot does not reveal how much time has pas...
...,...,...,...,...,...,...,...,...
34289,2017,The Last Warrior,Russian,Dmitriy Dyachenko,"Victor Chorinyak,\r\nMila Sivatskaya,\r\nEkate...",unknown,https://en.wikipedia.org/wiki/The_Last_Warrior...,"The film opens with sorceress Varvara, princes..."
34290,2017,Mathilda,Russian,Aleksey Uchitel,"Danila Kozlovsky,\r\nGrigoriy Dobrygin,\r\nLar...",unknown,https://en.wikipedia.org/wiki/Matilda_(2017_film),The film tells the story of the romantic relat...
34291,2017,Furious,Russian,Ivan Shurkhovetskiy,"Ilya Malakov,\r\nPolina Chernyshova,\r\nAlekse...",unknown,https://en.wikipedia.org/wiki/Furious_(2017_film),Film will tell about Golden Horde times and th...
34292,2017,Yolki 6,Russian,"Zhora Kryzhovnikov, Dmitri Kiselyov","Ivan Urgant,\r\nSergey Svetlakov,\r\nDmitry Na...",unknown,https://en.wikipedia.org/wiki/Yolki_6,High-school hipster Andrei (Daniela Vakhrushev...


In [5]:
# Titles that survived the filtering against the wiki table.
df_movies_imdb['title']

0                The Godfather
1     The Shawshank Redemption
2             Schindler's List
3                  Raging Bull
4                   Casablanca
                ...           
94            Double Indemnity
95       Rebel Without a Cause
96                 Rear Window
97               The Third Man
98          North by Northwest
Name: title, Length: 86, dtype: object