## Data Cleaning

First import necessary packages for cleaning and anything else that is needed.

In [1]:
import pandas as pd
import numpy as np
import os
from pandasql import sqldf

pysqldf = lambda q: sqldf(q, globals())
pd.set_option('display.max_rows', 500)

Loading data into dataframes

In [10]:
Bommoviegross = pd.read_csv('./zippedData/bom.movie_gross.csv.gz')
BomGross = pd.DataFrame(Bommoviegross)

imdbname = pd.read_csv('./zippedData/imdb.name.basics.csv.gz')
imdbName = pd.DataFrame(imdbname)

imdbtitlea = pd.read_csv('./zippedData/imdb.title.akas.csv.gz')
imdbTitleAkas = pd.DataFrame(imdbtitlea)

imdbtitleb = pd.read_csv('./zippedData/imdb.title.basics.csv.gz')
imdbTitleBasic = pd.DataFrame(imdbtitleb)

imdbtitlec = pd.read_csv('./zippedData/imdb.title.crew.csv.gz')
imdbTitleCrew = pd.DataFrame(imdbtitlec)

imdbtitlep = pd.read_csv('./zippedData/imdb.title.principals.csv.gz')
imdbTitlePrin = pd.DataFrame(imdbtitlep)

imdbtitler = pd.read_csv('./zippedData/imdb.title.ratings.csv.gz')
imdbTitleRat = pd.DataFrame(imdbtitler)

rtmov = pd.read_csv('./zippedData/rt.movie_info.tsv.gz', sep='\t')
rtmovie = pd.DataFrame(rtmov)

rtrev = pd.read_csv('./zippedData/rt.reviews.tsv.gz', sep='\t', encoding= 'unicode_escape')
rtreview = pd.DataFrame(rtrev)

tmdb = pd.read_csv('./zippedData/tmdb.movies.csv.gz')
tmdbmov = pd.DataFrame(tmdb)

tnm = pd.read_csv('./zippedData/tn.movie_budgets.csv.gz')
tnmmov = pd.DataFrame(tnm)

In [3]:
#Joining imdb tables to make a master table
q = '''SELECT *
       FROM imdbTitleBasic
       LEFT JOIN imdbTitleRat
       USING (tconst)
       Left JOIN imdbTitleCrew
       USING (tconst)
       ;
    '''
imdbMaster = pysqldf(q)

#cleaning up directors and writers columns, putting in the actual names for imdbMaster table
imdbMaster['directors'] = imdbMaster['directors'].astype(str)
imdbMaster['writers'] = imdbMaster['writers'].astype(str)
imdbMaster['directors'] = imdbMaster['directors'].map(lambda x: x.split(','))
imdbMaster['writers'] = imdbMaster['writers'].map(lambda x: x.split(','))
All_names = imdbName.set_index('nconst')['primary_name'].to_dict()
DirectorNamesList = []
for entry in imdbMaster['directors']:
    DirectorNamesList.append([All_names.get(item,item) for item in entry])
imdbMaster['Director_Names'] = DirectorNamesList
WritersNamesList = []
for entry in imdbMaster['writers']:
    WritersNamesList.append([All_names.get(item,item) for item in entry])
imdbMaster['Writers_Names'] = WritersNamesList

In [4]:
#Cleaning up tnmfile

#Dropping columns with $0 for domestic_gross and worldwide_gross (only 367 out of 5782 data entries: ~5%)
df_to_drop1 = tnmmov.loc[(tnmmov['domestic_gross'] == '$0') & (tnmmov['worldwide_gross'] == '$0')] #throw these out
newtnm = pd.concat([tnmmov,df_to_drop1]).drop_duplicates(keep=False)

#Making production_budget, domestic_gross and worldwide_gross values to int instead of string
newtnm['production_budget'] = newtnm.production_budget.map(lambda x: int(x.replace('$','').replace(',','')))
newtnm['domestic_gross'] = newtnm.domestic_gross.map(lambda x: int(x.replace('$','').replace(',','')))
newtnm['worldwide_gross'] = newtnm.worldwide_gross.map(lambda x: int(x.replace('$','').replace(',','')))

#Dec 31 is a placeholder date (only 74 entries, lets throw out instead of looking up each movie date)
#Total thrown out is 7.62% of original data
df_to_drop2 = newtnm[newtnm.release_date.map(lambda x: x.startswith('Dec 31,'))]
newtnmForDateComp = pd.concat([newtnm,df_to_drop2]).drop_duplicates(keep=False)
#Lets make the release month and release year into new columns, will be easier to work with
newtnmForDateComp['Release_Month'] = newtnmForDateComp['release_date'].map(lambda x: x[0:3])
newtnmForDateComp['Release_Year'] = newtnmForDateComp['release_date'].map(lambda x: x[-4:])
#Drop the id and release_date column to make it look nicer
newtnmForDateComp = newtnmForDateComp.drop(columns = ['id','release_date'])

In [11]:
#Cleaning up TMDB File

#First drop the unnamed first column
tmdbmov = tmdbmov.drop(tmdbmov.columns[0], axis = 1)
#Lets drop exact duplicate columns
newtmdb = tmdbmov.drop_duplicates()

#Next lets fill genre column with actual names
#Drop blank rows for genre



#Dropping Bad "Eden" movies
#todrop1 = newtmdb[(newtmdb['id'] == 409371) | (newtmdb['id'] == 388223)].index
#newtmdb.drop(todrop1 , inplace=True)
#Dropping Bad "Lucky" movies
#todrop2 = newtmdb[(newtmdb['id'] == 602984) | (newtmdb['id'] == 461974)].index
#newtmdb.drop(todrop2 , inplace=True)

In [18]:
newtmdb.original_title.value_counts()

Lucky                   5
Aftermath               5
Truth or Dare           5
Eden                    5
Delirium                4
                       ..
Below Zero              1
The Arrival             1
My Sex Robot            1
Revolt                  1
The Millionaire Tour    1
Name: original_title, Length: 24835, dtype: int64

In [44]:
test = newtmdb[newtmdb.vote_count < 2]

In [47]:
newtmdb.popularity.max

<bound method Series.max of 0        33.533
1        28.734
2        28.515
3        28.005
4        27.920
          ...  
26512     0.600
26513     0.600
26514     0.600
26515     0.600
26516     0.600
Name: popularity, Length: 25495, dtype: float64>

In [28]:
len(newtmdb)

25495

In [43]:
imdbMaster.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,averagerating,numvotes,directors,writers,Director_Names,Writers_Names
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",7.0,77.0,[nm0712540],"[nm0023551, nm1194313, nm0347899, nm1391276]",[Harnam Singh Rawail],"[Abrar Alvi, Mahasweta Devi, Gulzar, Anjana Ra..."
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama",7.2,43.0,[nm0002411],[None],[Mani Kaul],[None]
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama,6.9,4517.0,[nm0000080],"[nm0000080, nm0462648]",[Orson Welles],"[Orson Welles, Oja Kodar]"
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama",6.1,13.0,[nm0611531],[nm0347899],[Hrishikesh Mukherjee],[Gulzar]
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy",6.5,119.0,"[nm0765384, nm0749914]","[nm1360635, nm0749914]","[Valeria Sarmiento, Raoul Ruiz]","[Pía Rey, Raoul Ruiz]"


Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,averagerating,numvotes,directors,writers,Director_Names,Writers_Names
