## Data Load into MondoDB

This script loads all movie data (TMDB and OMDB) to MongoDB

In [2]:
# Import all dependencies
from pymongo import MongoClient
import pandas as pd

In [3]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = MongoClient(conn)

In [4]:
# Define database and collection
db = client.moviesdb

# drop collections if already existing
db.omdb_api.drop()
db.tmdb_data.drop()

#create / re-create collections
omdb_coll = db.omdb_api
tmdb_coll = db.tmdb_data
movies_coll = db.all_movies_data

### Load OMDB data to MongoDB

In [4]:
omdb_api_data= pd.read_csv("omdb_api_cleaned.csv")

omdb_api_data.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Genre,Awards,Metascore,BoxOffice
0,0,The Widow of Saint-Pierre,2000,"Drama, History, Romance",Nominated for 1 Golden Globe. Another 3 wins &...,73.0,$0
1,1,Nurse Betty,2000,"Comedy, Crime, Drama, Romance",Won 1 Golden Globe. Another 4 wins & 14 nomina...,69.0,$0
2,2,Fiza,2000,"Action, Crime, Drama",4 wins & 18 nominations.,0.0,$0
3,3,Duets,2000,"Comedy, Drama, Music, Romance",1 nomination.,40.0,"$4,262,782"
4,4,Brother,2000,"Crime, Drama, Thriller",1 win & 1 nomination.,47.0,$0


In [16]:
omdb_api_data.count()

omdb_movie_id    3354
Title            3354
Year             3354
Genre            3354
Awards           3354
Metascore        3354
BoxOffice        3354
dtype: int64

In [7]:
# rename col 1 to ID
omdb_api_data.rename(columns = {'Unnamed: 0': 'omdb_movie_id'}, inplace = True)
omdb_api_data.columns

Index(['omdb_movie_id', 'Title', 'Year', 'Genre', 'Awards', 'Metascore',
       'BoxOffice'],
      dtype='object')

In [8]:
omdb_dict=omdb_api_data.to_dict(orient="records")
print(omdb_dict)
    

[{'omdb_movie_id': 0, 'Title': 'The Widow of Saint-Pierre', 'Year': 2000, 'Genre': 'Drama, History, Romance', 'Awards': 'Nominated for 1 Golden Globe. Another 3 wins & 8 nominations.', 'Metascore': 73.0, 'BoxOffice': '$0'}, {'omdb_movie_id': 1, 'Title': 'Nurse Betty', 'Year': 2000, 'Genre': 'Comedy, Crime, Drama, Romance', 'Awards': 'Won 1 Golden Globe. Another 4 wins & 14 nominations.', 'Metascore': 69.0, 'BoxOffice': '$0'}, {'omdb_movie_id': 2, 'Title': 'Fiza', 'Year': 2000, 'Genre': 'Action, Crime, Drama', 'Awards': '4 wins & 18 nominations.', 'Metascore': 0.0, 'BoxOffice': '$0'}, {'omdb_movie_id': 3, 'Title': 'Duets', 'Year': 2000, 'Genre': 'Comedy, Drama, Music, Romance', 'Awards': '1 nomination.', 'Metascore': 40.0, 'BoxOffice': '$4,262,782'}, {'omdb_movie_id': 4, 'Title': 'Brother', 'Year': 2000, 'Genre': 'Crime, Drama, Thriller', 'Awards': '1 win & 1 nomination.', 'Metascore': 47.0, 'BoxOffice': '$0'}, {'omdb_movie_id': 5, 'Title': 'Sexy Beast', 'Year': 2000, 'Genre': 'Crime, D

In [11]:
omdb_ins_res = omdb_coll.insert_many(omdb_dict)

### check if the load has been successfull
- check the total records inserted
- query a for a random record

In [19]:
# check the total records inserted
if(len(omdb_ins_res.inserted_ids) == omdb_api_data.Title.count()):
    print(f"Success!!! Total of {len(omdb_ins_res.inserted_ids)} inserted and it matches the DF count.")
else:
    print(f"Warning!!! Total records inserted {len(omdb_ins_res.inserted_ids)} is less than DF count of {omdb_api_data.Title.count()}")

Success!!! Total of 3354 inserted and it matches the DF count.


In [26]:
# Query a random record


print("Record pulled from Mongo")
print(omdb_coll.find_one({'Title' : omdb_api_data.Title[20]}))

print("Row from dataframe")

omdb_api_data.loc[omdb_api_data.Title == omdb_api_data.Title[20],]

Record pulled from Mongo
{'_id': ObjectId('5c70678c2b0322428caa1b37'), 'omdb_movie_id': 19, 'Title': 'Dungeons & Dragons', 'Year': 2000, 'Genre': 'Action, Adventure, Fantasy', 'Awards': '11 nominations.', 'Metascore': 14.0, 'BoxOffice': '$14,813,627'}
Row from dataframe


Unnamed: 0,omdb_movie_id,Title,Year,Genre,Awards,Metascore,BoxOffice
20,19,Dungeons & Dragons,2000,"Action, Adventure, Fantasy",11 nominations.,14.0,"$14,813,627"


### TMDB Load to MongoDB

In [27]:
dataFile = "TMDB_Cleaned.csv"
tmdb_df = pd.read_csv(dataFile)
tmdb_df.head()

Unnamed: 0.1,Unnamed: 0,title,budget,popularity,production_companies,release_date,runtime,revenue,status,vote_average,vote_count,Year
0,2772,The Widow of Saint-Pierre,0,1.780065,"[{""name"": ""Cin\u00e9maginaire Inc."", ""id"": 280...",2000-01-01,112.0,0,Released,6.7,11,2000
1,3141,Next Friday,11000000,9.337388,"[{""name"": ""New Line Cinema"", ""id"": 12}]",2000-01-12,98.0,59827328,Released,6.4,135,2000
2,3344,My Dog Skip,7000000,5.675535,"[{""name"": ""Alcon Entertainment"", ""id"": 1088}, ...",2000-01-14,95.0,0,Released,6.5,69,2000
3,775,Supernova,90000000,5.762037,"[{""name"": ""United Artists"", ""id"": 60}, {""name""...",2000-01-14,91.0,14828081,Released,4.9,109,2000
4,4636,Chuck & Buck,0,0.812855,[],2000-01-21,96.0,0,Released,5.7,16,2000


In [28]:
# rename col 1 to ID
tmdb_df.rename(columns = {'Unnamed: 0': 'tmdb_movie_id'}, inplace = True)
tmdb_df.columns

Index(['tmdb_movie_id', 'title', 'budget', 'popularity',
       'production_companies', 'release_date', 'runtime', 'revenue', 'status',
       'vote_average', 'vote_count', 'Year'],
      dtype='object')

In [29]:
insert_dict = tmdb_df.to_dict(orient = 'records')

In [30]:
len(insert_dict)

3494

In [32]:
insert_tmdb_res = tmdb_coll.insert_many(insert_dict)

### Verification : check if the load has been successfull
- check the total records inserted
- query a for a random record

In [33]:
# check the total records inserted
if(len(insert_tmdb_res.inserted_ids) == tmdb_df.title.count()):
    print(f"Success!!! Total of {len(omdb_ins_res.inserted_ids)} inserted and it matches the DF count.")
else:
    print(f"Warning!!! Total records inserted {len(omdb_ins_res.inserted_ids)} is less than DF count of {omdb_api_data.Title.count()}")

Success!!! Total of 3354 inserted and it matches the DF count.


In [34]:
# Query a random record


print("Record pulled from Mongo")
print(tmdb_coll.find_one({'title' : tmdb_df.title[370]}))

print("Row from dataframe")

tmdb_df.loc[tmdb_df.title == tmdb_df.title[370],]

Record pulled from Mongo
{'_id': ObjectId('5c706a982b0322428caa29af'), 'tmdb_movie_id': 1496, 'title': 'Snow Dogs', 'budget': 33000000, 'popularity': 13.655735, 'production_companies': '[{"name": "Walt Disney Pictures", "id": 2}]', 'release_date': '2002-01-18', 'runtime': 99.0, 'revenue': 0, 'status': 'Released', 'vote_average': 5.3, 'vote_count': 185, 'Year': 2002}
Row from dataframe


Unnamed: 0,tmdb_movie_id,title,budget,popularity,production_companies,release_date,runtime,revenue,status,vote_average,vote_count,Year
370,1496,Snow Dogs,33000000,13.655735,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",2002-01-18,99.0,0,Released,5.3,185,2002


### Load OMDB data to MongoDB

In [4]:
omdb_api_data= pd.read_csv("omdb_api_cleaned.csv")

omdb_api_data.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Genre,Awards,Metascore,BoxOffice
0,0,The Widow of Saint-Pierre,2000,"Drama, History, Romance",Nominated for 1 Golden Globe. Another 3 wins &...,73.0,$0
1,1,Nurse Betty,2000,"Comedy, Crime, Drama, Romance",Won 1 Golden Globe. Another 4 wins & 14 nomina...,69.0,$0
2,2,Fiza,2000,"Action, Crime, Drama",4 wins & 18 nominations.,0.0,$0
3,3,Duets,2000,"Comedy, Drama, Music, Romance",1 nomination.,40.0,"$4,262,782"
4,4,Brother,2000,"Crime, Drama, Thriller",1 win & 1 nomination.,47.0,$0


In [16]:
omdb_api_data.count()

omdb_movie_id    3354
Title            3354
Year             3354
Genre            3354
Awards           3354
Metascore        3354
BoxOffice        3354
dtype: int64

In [7]:
# rename col 1 to ID
omdb_api_data.rename(columns = {'Unnamed: 0': 'omdb_movie_id'}, inplace = True)
omdb_api_data.columns

Index(['omdb_movie_id', 'Title', 'Year', 'Genre', 'Awards', 'Metascore',
       'BoxOffice'],
      dtype='object')

In [8]:
omdb_dict=omdb_api_data.to_dict(orient="records")
print(omdb_dict)
    

[{'omdb_movie_id': 0, 'Title': 'The Widow of Saint-Pierre', 'Year': 2000, 'Genre': 'Drama, History, Romance', 'Awards': 'Nominated for 1 Golden Globe. Another 3 wins & 8 nominations.', 'Metascore': 73.0, 'BoxOffice': '$0'}, {'omdb_movie_id': 1, 'Title': 'Nurse Betty', 'Year': 2000, 'Genre': 'Comedy, Crime, Drama, Romance', 'Awards': 'Won 1 Golden Globe. Another 4 wins & 14 nominations.', 'Metascore': 69.0, 'BoxOffice': '$0'}, {'omdb_movie_id': 2, 'Title': 'Fiza', 'Year': 2000, 'Genre': 'Action, Crime, Drama', 'Awards': '4 wins & 18 nominations.', 'Metascore': 0.0, 'BoxOffice': '$0'}, {'omdb_movie_id': 3, 'Title': 'Duets', 'Year': 2000, 'Genre': 'Comedy, Drama, Music, Romance', 'Awards': '1 nomination.', 'Metascore': 40.0, 'BoxOffice': '$4,262,782'}, {'omdb_movie_id': 4, 'Title': 'Brother', 'Year': 2000, 'Genre': 'Crime, Drama, Thriller', 'Awards': '1 win & 1 nomination.', 'Metascore': 47.0, 'BoxOffice': '$0'}, {'omdb_movie_id': 5, 'Title': 'Sexy Beast', 'Year': 2000, 'Genre': 'Crime, D

In [11]:
omdb_ins_res = omdb_coll.insert_many(omdb_dict)

### check if the load has been successfull
- check the total records inserted
- query a for a random record

In [19]:
# check the total records inserted
if(len(omdb_ins_res.inserted_ids) == omdb_api_data.Title.count()):
    print(f"Success!!! Total of {len(omdb_ins_res.inserted_ids)} inserted and it matches the DF count.")
else:
    print(f"Warning!!! Total records inserted {len(omdb_ins_res.inserted_ids)} is less than DF count of {omdb_api_data.Title.count()}")

Success!!! Total of 3354 inserted and it matches the DF count.


In [26]:
# Query a random record


print("Record pulled from Mongo")
print(omdb_coll.find_one({'Title' : omdb_api_data.Title[20]}))

print("Row from dataframe")

omdb_api_data.loc[omdb_api_data.Title == omdb_api_data.Title[20],]

Record pulled from Mongo
{'_id': ObjectId('5c70678c2b0322428caa1b37'), 'omdb_movie_id': 19, 'Title': 'Dungeons & Dragons', 'Year': 2000, 'Genre': 'Action, Adventure, Fantasy', 'Awards': '11 nominations.', 'Metascore': 14.0, 'BoxOffice': '$14,813,627'}
Row from dataframe


Unnamed: 0,omdb_movie_id,Title,Year,Genre,Awards,Metascore,BoxOffice
20,19,Dungeons & Dragons,2000,"Action, Adventure, Fantasy",11 nominations.,14.0,"$14,813,627"
