In [15]:
# Import dependencies
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import ast
from sqlalchemy import create_engine

# BEST MOVIES

In [16]:
oscars_csv_file = "oscar.csv" #dont forget to edit
oscars_df = pd.read_csv(oscars_csv_file)

In [17]:
best_movie = oscars_df.loc[(oscars_df['Award'] == 'Best Motion Picture') | (oscars_df['Award'] == 'Best Picture')]
best_movie.head()

Unnamed: 0,Year,Ceremony,Award,Winner,Name,Film
1752,1944,17,Best Motion Picture,,Double Indemnity,Paramount
1753,1944,17,Best Motion Picture,,Gaslight,Metro-Goldwyn-Mayer
1754,1944,17,Best Motion Picture,1.0,Going My Way,Paramount
1755,1944,17,Best Motion Picture,,Since You Went Away,Selznick International Pictures
1756,1944,17,Best Motion Picture,,Wilson,20th Century-Fox


In [18]:
# Cleaning the data
best_movie = best_movie[['Year', 'Name', 'Film', 'Winner']]

best_movie = best_movie.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

best_movie.head()

Unnamed: 0,Year,Name,Film,Winner
1752,1944,Double Indemnity,Paramount,
1753,1944,Gaslight,Metro-Goldwyn-Mayer,
1754,1944,Going My Way,Paramount,1.0
1755,1944,Since You Went Away,Selznick International Pictures,
1756,1944,Wilson,20th Century-Fox,


In [19]:
best_movie = best_movie.rename(columns = {'Year' : 'release_year', 
                                          'Name':'title', 
                                          'Film':'production_companies',
                                          'Winner':'win'})

best_movie['win'] = np.where(best_movie['win']==1.0, 'Y', 'N')

In [20]:
best_movie.head()

Unnamed: 0,release_year,title,production_companies,win
1752,1944,Double Indemnity,Paramount,N
1753,1944,Gaslight,Metro-Goldwyn-Mayer,N
1754,1944,Going My Way,Paramount,Y
1755,1944,Since You Went Away,Selznick International Pictures,N
1756,1944,Wilson,20th Century-Fox,N


In [21]:
# Adding the imdb_id variables
movie_ids = pd.read_csv("movies_metadata.csv")
movie_ids = movie_ids[['imdb_id', 'title', 'release_date']]
#for windows use:
movie_ids['release_year'] = movie_ids['release_date'].str[-4:]
#for macs use:
#movie_ids['release_year'] = movie_ids['release_date'].str[:4]
movie_ids.head() 

Unnamed: 0,imdb_id,title,release_date,release_year
0,tt0114709,Toy Story,30/10/1995,1995
1,tt0113497,Jumanji,15/12/1995,1995
2,tt0113228,Grumpier Old Men,22/12/1995,1995
3,tt0114885,Waiting to Exhale,22/12/1995,1995
4,tt0113041,Father of the Bride Part II,10/02/1995,1995


## Adding the IDs


In [22]:
best_movie_ids = pd.merge(best_movie, 
                          movie_ids, on=['title', 'release_year'], how='left')
best_movie = best_movie_ids
best_movie.loc[best_movie['title'] == "Double Indemnity"]

Unnamed: 0,release_year,title,production_companies,win,imdb_id,release_date
0,1944,Double Indemnity,Paramount,N,tt0036775,24/04/1944


# RATINGS

In [25]:
# Loading the ratings data
ratings_csv_file = 'ratings.csv' #dont forget to edit
ratings_df = pd.read_csv(ratings_csv_file)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [26]:
# Selectiong relevant variables
clean_ratings_df = ratings_df[['movieId', 'rating', 'userId']].copy()
clean_ratings_df.head()

Unnamed: 0,movieId,rating,userId
0,110,1.0,1
1,147,4.5,1
2,858,5.0,1
3,1221,5.0,1
4,1246,5.0,1


In [27]:
# Adding the movie id
links_csv_file = ('links.csv') #dont forget to edit
links_df = pd.read_csv(links_csv_file)
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [28]:
# Cleaning IDs
links_clean_df = links_df[["movieId","imdbId"]].copy()
links_clean_df.head()

Unnamed: 0,movieId,imdbId
0,1,114709
1,2,113497
2,3,113228
3,4,114885
4,5,113041


In [29]:
# Reviews with movie ids
reviews_w_movie_ids = pd.merge(links_clean_df, clean_ratings_df, on ='movieId', how = 'left')
reviews_w_movie_ids.head()

Unnamed: 0,movieId,imdbId,rating,userId
0,1,114709,4.0,8.0
1,1,114709,4.5,9.0
2,1,114709,4.0,12.0
3,1,114709,4.0,20.0
4,1,114709,4.0,24.0


In [30]:
# Making the movie ids the same across all tables
reviews_w_movie_ids['imdbId'] = 'tt' + links_df['imdbId'].astype(str)
reviews_w_movie_ids = reviews_w_movie_ids.rename(columns = {'userId' : 'reviewer_id', 
                                                            'imdbId':'imdb_id'})
reviews_w_movie_ids = reviews_w_movie_ids[['imdb_id','rating', 'reviewer_id']]
reviews_w_movie_ids.head()
ratings = reviews_w_movie_ids
ratings.head()

Unnamed: 0,imdb_id,rating,reviewer_id
0,tt114709,4.0,8.0
1,tt113497,4.5,9.0
2,tt113228,4.0,12.0
3,tt114885,4.0,20.0
4,tt113041,4.0,24.0


# ALL MOVIES

In [32]:
movie_csv_file = "movies_metadata.csv"#dont forget to edit this
movie_df = pd.read_csv(movie_csv_file)
movie_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,30/10/1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,t,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,15/12/1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,22/12/1995,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,22/12/1995,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,10/02/1995,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [33]:
# Selecting relevant variables
clean_movie_df = movie_df[['budget','imdb_id', 'title', 'release_date','revenue', 'production_companies']]
clean_movie_df.head()

Unnamed: 0,budget,imdb_id,title,release_date,revenue,production_companies
0,30000000,tt0114709,Toy Story,30/10/1995,373554033.0,"[{""name"": ""Pixar Animation Studios"", ""id"": 3}]"
1,65000000,tt0113497,Jumanji,15/12/1995,262797249.0,"[{""name"": ""TriStar Pictures"", ""id"": 559}, {""na..."
2,0,tt0113228,Grumpier Old Men,22/12/1995,0.0,"[{""name"": ""Warner Bros."", ""id"": 6194}, {""name""..."
3,16000000,tt0114885,Waiting to Exhale,22/12/1995,81452156.0,"[{""name"": ""Twentieth Century Fox Film Corporat..."
4,0,tt0113041,Father of the Bride Part II,10/02/1995,76578911.0,"[{""name"": ""Sandollar Productions"", ""id"": 5842}..."


In [34]:
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('name ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('id ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('1 ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('2 ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('3 ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('4 ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('5 ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('6 ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('7 ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('8 ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('9 ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('9 ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('0 ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace(': ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('""" ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace(', "" ?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('"?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('\[?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('\]?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('\{?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace('\}?' , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace("'''?" , '')
clean_movie_df['production_companies'] = clean_movie_df.production_companies.str.replace("', ?" , '')

clean_movie_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

Unnamed: 0,budget,imdb_id,title,release_date,revenue,production_companies
0,30000000,tt0114709,Toy Story,30/10/1995,373554033.0,Pixar Animation Studios
1,65000000,tt0113497,Jumanji,15/12/1995,262797249.0,"TriStar Pictures, Teitler Film, Interscope Com..."
2,0,tt0113228,Grumpier Old Men,22/12/1995,0.0,"Warner Bros., Lancaster Gate"
3,16000000,tt0114885,Waiting to Exhale,22/12/1995,81452156.0,Twentieth Century Fox Film Corporation
4,0,tt0113041,Father of the Bride Part II,10/02/1995,76578911.0,"Sandollar Productions, Touchstone Pictures"


In [35]:
all_movies = clean_movie_df.dropna()
all_movies = all_movies.drop_duplicates()
all_movies.count()

budget                  45332
imdb_id                 45332
title                   45332
release_date            45332
revenue                 45332
production_companies    45332
dtype: int64

# EXPORTING THE DATASETS


### Connect to Local DataBase

In [40]:
user_name = input('Enter user name:')
password = input('Password:')
rds_connection_string = "{0}:{1}@localhost:5432/Magic_Table_db".format(user_name, password)
engine = create_engine(f'postgresql://{rds_connection_string}')

Enter user name:postgres
Password:rebounD29


### Check for tables

In [41]:
engine.table_names()

['all_movies', 'best_movies', 'ratings']

### Use pandas to load csv converted DataFrame into database

In [42]:
#load all_movies to SQL data base
all_movies.to_sql(name='all_movies', con=engine, if_exists='append', index=False)


### Check to see if load was successful

In [43]:
pd.read_sql_query('select * from all_movies', con=engine).head()

Unnamed: 0,budget,imdb_id,title,release_date,revenue,production_companies
0,30000000.0,tt0114709,Toy Story,30/10/1995,373554033.0,Pixar Animation Studios
1,65000000.0,tt0113497,Jumanji,15/12/1995,262797249.0,"TriStar Pictures, Teitler Film, Interscope Com..."
2,0.0,tt0113228,Grumpier Old Men,22/12/1995,0.0,"Warner Bros., Lancaster Gate"
3,16000000.0,tt0114885,Waiting to Exhale,22/12/1995,81452156.0,Twentieth Century Fox Film Corporation
4,0.0,tt0113041,Father of the Bride Part II,10/02/1995,76578911.0,"Sandollar Productions, Touchstone Pictures"


In [44]:
best_movie.to_sql(name='best_movies', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from best_movies', con=engine).head()

Unnamed: 0,release_year,title,production_companies,win,imdb_id,release_date
0,1944,Double Indemnity,Paramount,N,tt0036775,24/04/1944
1,1944,Gaslight,Metro-Goldwyn-Mayer,N,tt0036855,04/05/1944
2,1944,Going My Way,Paramount,Y,tt0036872,15/05/1944
3,1944,Since You Went Away,Selznick International Pictures,N,tt0037280,20/07/1944
4,1944,Wilson,20th Century-Fox,N,tt0037465,01/08/1944


In [None]:
ratings.to_sql(name='ratings', con=engine, if_exists='append', index=False)
pd.read_sql_query('select * from ratings', con=engine).head()