# Data Analysis of Movies

#### Dataset for movies - https://grouplens.org/datasets/movielens/
#### Summary about dataset - http://files.grouplens.org/datasets/movielens/ml-latest-README.html


In [None]:
import graphlab as gl
import pandas as pd
import numpy as np
import requests as rq
import json,os
#import matplotlib.pyplot as plt
#import matplotlib.cbook as cbook

In [None]:
#File that we used
#links_file_path = r'C:\Users\naveen.sanka\Documents\Data Viz Project\Individual\ml-latest\ml-latest\links.csv'
#links_data = gl.SFrame(links_file_path)

# API to get Actors and more detailed information about each movie

### We use the links data to look up each movie in the Open Movie Database (OMDb) by its IMDb movie ID
### Sample url - http://www.omdbapi.com/?i=tt0113497

#### This is the piece of code we used to extract data from omdb 

In [None]:
# base_url = 'http://www.omdbapi.com/?i=tt'
# output_df = pd.DataFrame()
# for x in links_data:
#     query_url = base_url+str('{0:0{width}}'.format(x['imdbId'], width=7))
#     try:
#         ret = rq.get(query_url)
#     except rq.exceptions as e:
#         print("Error in Post method",e)
#         raise
#     except rq.exceptions.Timeout as e:
#         print("Timeout Error",str(e))
#         raise
#     except rq.exceptions.TooManyRedirects as e:
#         print("redirects Error",str(e))
#         raise
#     except rq.exceptions.RequestException as e:
#         print('Exception occured', str(e))
#         raise
#     temp = pd.read_json(ret.text,typ='series')
#     temp = pd.DataFrame(temp).T
#     temp = temp.sort_index(axis=1,ascending=True)
#     output_df = output_df.append(temp)

#### For convenience, the extracted data has been saved to a JSON file

In [None]:
# Saving file to local
# temp_final_df.to_json(r'C:\Users\naveen.sanka\Documents\Data Viz Project\Individual\Imdb_Json_File\final_movies_data_omdb.json'
#                       ,orient='split')

In [None]:
# Read the previously saved OMDb export (written with orient='split').
omdb_df = pd.read_json(
    r'C:\Users\naveen.sanka\Documents\Data Viz Project\Individual\Imdb_Json_File\final_movies_data_omdb.json',
    orient='split')

We need to perform some data cleaning operations before using this data:
    1. Remove the index column from the data frame, as we don't need it.
    2. We observed duplicate imdbID values, so we remove the duplicates.

In [None]:
# Drop the column named 'index', then keep only the first row seen
# for every imdbID value.
omdb_df = (omdb_df
           .drop(labels='index', axis=1)
           .drop_duplicates(subset=['imdbID'], keep='first'))

# We have extracted Highest Paid Actors list from Statista

In [None]:
#Loading CSV Files
highest_paid_df = pd.read_csv(r'C:\Users\naveen.sanka\Documents\Data Viz Project\Individual\Highest_Paid_2016.csv')
# Stack the actress column underneath the actor column. pd.concat is used
# because Series.append was removed in pandas 2.0; the result is the same.
Highest_Paid_Actors = pd.concat([highest_paid_df['Highest Paid Actors'],
                                 highest_paid_df['Highest Paid Actress']])
# Parameters identified for an actor's success. An explicit copy is taken so
# the flag columns added below are written to this frame itself.
Actor_Success = omdb_df[['imdbID','Title','Actors','Country','Released','Director','BoxOffice','imdbRating']].copy()
# Add one flag column per star: 1 where the star's name occurs in 'Actors',
# NaN otherwise.
# NOTE(review): the slice skips the final entry of the stacked series,
# presumably a blank cell because the two CSV columns differ in length -
# confirm against the CSV.
for x in Highest_Paid_Actors[0:len(Highest_Paid_Actors)-1]:
    def actor_present(data):
        """Return 1 if the current name ``x`` occurs in ``data``, else NaN."""
        # A missing 'Actors' value is not a string and cannot be searched.
        if pd.isnull(data):
            return np.nan
        # Substring match on the raw credits string.
        return 1 if x in data else np.nan
    Actor_Success[x] = Actor_Success.Actors.apply(actor_present)

In [None]:
# Discard every film in which none of the tracked stars appears, i.e. rows
# whose flag columns listed below are all NaN.
Actor_Success = Actor_Success.dropna(
    how='all',
    subset=[
        'Dwayne Johnson',
        'Jackie Chan',
        'Matt Damon',
        'Tom Cruise',
        'Johnny Depp',
        'Ben Affleck',
        'Vin Diesel',
        'Shah Rukh Khan',
        'Robert Downey Jr',
        'Akshay Kumar',
        'Brad Pitt',
        'Jennifer Lawrence',
        'Melissa McCarthy',
        'Scarlett Johansson',
        'Jennifer Aniston',
        'Fan Bingbing',
        'Charlize Theron',
        'Amy Adams',
        'Julia Roberts',
        'Mila Kunis',
        'Deepika Padukone',
    ])

In [None]:
# Convert the Released column to datetime; values that do not match the
# "%d %b %Y" format become NaT.
Actor_Success.Released = pd.to_datetime(Actor_Success.Released, format="%d %b %Y",errors='coerce')
# Blank out the 'N/A' placeholder before the numeric conversion (same
# treatment the file gives imdbRating elsewhere); to_numeric raises on it.
Actor_Success.imdbRating = pd.to_numeric(Actor_Success.imdbRating.replace({'N/A':np.nan}))
Actor_Success.BoxOffice = Actor_Success.BoxOffice.replace({'N/A':np.nan})
# Strip '$' and ',' so the amounts parse as floats. The pattern is a raw
# string because '\$' is not a valid escape in a normal string literal.
Actor_Success.BoxOffice = Actor_Success.BoxOffice.replace(r'[\$,]', '', regex=True).astype(float)
# DataFrame.sort(columns=...) no longer exists in pandas; sort_values is the
# replacement and is what the rest of this file already uses.
Actor_Success = Actor_Success.sort_values(by='Released')

# Summary table with one row per tracked star; the final entry of the
# stacked name series is left out.
avg_rat_box = pd.DataFrame(Highest_Paid_Actors[:-1], columns=['Actors'])
def box_mean(column):
    """Return the outlier-trimmed mean BoxOffice for one flag column.

    ``column`` names a flag column of the module-level ``Actor_Success``
    frame; rows where that column equals 1 are selected. BoxOffice values
    lying 2 or more ``np.std`` away from their mean are dropped before the
    mean is taken. NaN BoxOffice values never pass the filter.
    """
    takings = Actor_Success.loc[Actor_Success[str(column)] == 1, 'BoxOffice']
    deviation = abs(takings - np.mean(takings))
    limit = 2 * np.std(takings)
    # With a single film (or identical values) the spread is 0, and a strict
    # "<" test alone would discard every row and yield NaN. Values sitting
    # exactly on the mean are therefore always kept.
    kept = takings[(deviation < limit) | (deviation == 0)]
    return kept.mean()

def rating_mean(column):
    """Return the outlier-trimmed mean imdbRating for one flag column.

    ``column`` names a flag column of the module-level ``Actor_Success``
    frame; rows where that column equals 1 are selected. Ratings lying 2 or
    more ``np.std`` away from their mean are dropped before the mean is
    taken. NaN ratings never pass the filter.
    """
    ratings = Actor_Success.loc[Actor_Success[str(column)] == 1, 'imdbRating']
    deviation = abs(ratings - np.mean(ratings))
    limit = 2 * np.std(ratings)
    # With a single film (or identical ratings) the spread is 0, and a strict
    # "<" test alone would discard every row and yield NaN. Values sitting
    # exactly on the mean are therefore always kept.
    kept = ratings[(deviation < limit) | (deviation == 0)]
    return kept.mean()

# Attach the trimmed box-office and rating averages for every star.
avg_rat_box['BoxOffice'] = avg_rat_box['Actors'].apply(box_mean)
avg_rat_box['imdbRating'] = avg_rat_box['Actors'].apply(rating_mean)

#removing outliers
def remove_outliers(temp_j, m=2):
    """Return the rows of ``temp_j`` whose BoxOffice is not an outlier.

    A row is kept when its 'BoxOffice' value lies less than ``m`` times
    ``np.std`` away from the column mean. Rows with a NaN BoxOffice are
    dropped because they fail the comparison.

    Args:
        temp_j: DataFrame with a numeric 'BoxOffice' column.
        m: width of the accepted band in standard deviations (default 2).
    """
    box_office = temp_j['BoxOffice']
    deviation = abs(box_office - np.mean(box_office))
    limit = m * np.std(box_office)
    # Values exactly on the mean are always kept; otherwise a zero spread
    # (one row, or identical values) would filter out every row.
    return temp_j[(deviation < limit) | (deviation == 0)]

# Bare expression: notebook-style display of the summary table (it has no
# effect when the file is run as a plain script).
avg_rat_box

In [None]:
# Rank the stars: highest average box office first, with the average rating
# (also descending) as the secondary key.
avg_rat_box = avg_rat_box.sort_values(by=['BoxOffice', 'imdbRating'],
                                      ascending=[False, False])

#Mapping
def find_earnings(act):
    """Return the amount listed for ``act`` in ``highest_paid_df`` as an int.

    The name is looked up first in the 'Highest Paid Actors' column (amount
    read from 'Amount') and then in 'Highest Paid Actress' (amount read from
    'Amount.1').

    Raises:
        ValueError: if ``act`` appears in neither name column.
    """
    lookups = (('Highest Paid Actors', 'Amount'),
               ('Highest Paid Actress', 'Amount.1'))
    for name_col, amount_col in lookups:
        matches = highest_paid_df.loc[highest_paid_df[name_col] == act, amount_col]
        if len(matches):
            # Convert the first matching cell explicitly: calling int() on a
            # whole Series is deprecated and fails unless it has one element.
            return int(matches.iloc[0])
    raise ValueError('%r is not in the highest paid list' % (act,))

# Look up each star's listed amount and scale it by 1,000,000
# (presumably the list is quoted in millions - TODO confirm).
avg_rat_box['Earnings'] = avg_rat_box['Actors'].apply(find_earnings) * 1000000

# Export the finished summary table.
avg_rat_box.to_excel(r'C:\Users\naveen.sanka\Documents\Data Viz Project\Individual\high_paid_actors.xlsx')

# Highest BoxOffice collection based on Genre

In [None]:
# NOTE: this is an alias, not a copy - the column changes below are made to
# omdb_df as well.
omdb_box_df = omdb_df
#Correcting data from omdb
# Overwrite BoxOffice for the row labelled 1661. (The original note mentions
# Titanic and Avatar, but only this one row is patched here.)
# .at replaces DataFrame.set_value, which was removed in pandas 1.0.
omdb_box_df.at[1661, 'BoxOffice'] = 658672302
omdb_box_df.BoxOffice = omdb_box_df.BoxOffice.replace({'N/A':np.nan})
# One value uses a 'k' suffix that the symbol stripping below cannot parse.
omdb_box_df.BoxOffice = omdb_box_df.BoxOffice.replace({'$25.7k':25700})
# Strip '$' and ',' so the amounts parse as floats. The pattern is a raw
# string because '\$' is not a valid escape in a normal string literal.
omdb_box_df.BoxOffice = omdb_box_df.BoxOffice.replace(r'[\$,]', '', regex=True).astype(float)
# Values that do not match the "%d %b %Y" format become NaT.
omdb_box_df.Released = pd.to_datetime(omdb_box_df.Released, format="%d %b %Y",errors='coerce')

imdb_genre_list = ['Action','Adventure','Animation','Biography','Comedy','Crime','Documentary','Drama',
                   'Family','Fantasy','Film-Noir','History','Horror','Music','Musical','Mystery',
                   'Romance','Sci-Fi','Sport','Thriller','War','Western']

# Add one flag column per genre: 1 where the film lists that genre, NaN
# otherwise.
for x in imdb_genre_list:
    def genre_present(data):
        """Return 1 if genre ``x`` is one of the genres listed in ``data``, else NaN."""
        # A missing Genre value is not a string and cannot be split.
        if pd.isnull(data):
            return np.nan
        # Compare whole genre names. A plain substring test would flag every
        # 'Musical' film as 'Music' too.
        # NOTE(review): assumes Genre is a comma-separated string such as
        # "Comedy, Musical" - confirm against the OMDb data.
        listed = [name.strip() for name in data.split(',')]
        return 1 if x in listed else np.nan
    omdb_box_df[x] = omdb_box_df.Genre.apply(genre_present)

# Highest BoxOffice first, then keep the first 100 rows.
omdb_box_df = omdb_box_df.sort_values(by='BoxOffice',ascending=False)
omdb_box_df_op = omdb_box_df[0:100]

# Result table with one row per genre name.
gross_by_gen_dif = pd.DataFrame({'Genre': imdb_genre_list})
def find_gen(gen):
    """Return the sum of the ``gen`` column of the module-level ``omdb_box_df_op`` frame."""
    genre_flags = omdb_box_df_op[gen]
    return genre_flags.sum()
    
# For every genre, store the value find_gen returns for it (the sum of that
# genre's flag column), then export the table.
gross_by_gen_dif['High_BoxOfiice'] = gross_by_gen_dif['Genre'].apply(find_gen)
gross_by_gen_dif.to_excel(
    r'C:\Users\naveen.sanka\Documents\Data Viz Project\Individual\omdb_genre_boxoffice.xlsx')

# Influence of movie title on imdb ratings

### Influence of length of Title

In [None]:
# Alias, not a copy: the column added to title_omdb_df below is also added
# to omdb_df.
title_omdb_df = omdb_df
def find_len(data):
    """Return the length of ``data`` as reported by ``len``."""
    size = len(data)
    return size
# Length of every title, in characters.
title_omdb_df['Title_Length'] = title_omdb_df['Title'].apply(find_len)

# Turn the 'N/A' placeholder into NaN, then make the ratings numeric.
title_omdb_df['imdbRating'] = title_omdb_df['imdbRating'].replace({'N/A': np.nan})
title_omdb_df['imdbRating'] = pd.to_numeric(title_omdb_df['imdbRating'])
# Transposed view of the first three rows.
title_omdb_df.iloc[0:3].T

xxxx = title_omdb_df[['imdbRating', 'Title_Length']]

# Correlation between rating and title length; no significant correlation
# was observed.
xxxx.corr()

# Oscar nomination and winning

In [None]:
# Alias, not a copy: the 'Oscar' column is added to omdb_df as well.
omdb_oscar_df = omdb_df
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed; it is
# also the spelling used in the rest of this file.
omdb_oscar_df['Oscar'] = np.nan
def find_oscar(col):
    """Classify an awards string by its Oscar mention.

    Returns:
        2 if ``col`` mentions an Oscar and also contains 'Nominated' or
        'nominated'; 1 if it mentions an Oscar without either word
        (presumably a win - confirm against the data); NaN if it mentions no
        Oscar or is missing.
    """
    # A missing awards value is not a string and cannot be searched.
    if pd.isnull(col):
        return np.nan
    # 'Oscars' / 'oscars' contain 'Oscar' / 'oscar', so two checks suffice.
    if ('Oscar' not in col) and ('oscar' not in col):
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    if ('Nominated' in col) or ('nominated' in col):
        return 2
    return 1
# Code every film by its awards text.
omdb_oscar_df['Oscar'] = omdb_oscar_df['Awards'].apply(find_oscar)

# Keep only the rows with an Oscar code (NaN fails the comparison), ordered
# by year and then rating, both descending.
omdb_oscar_df = (omdb_oscar_df
                 .loc[omdb_oscar_df['Oscar'] >= 1]
                 .sort_values(by=['Year', 'imdbRating'], ascending=False))
# Remove the 'min' unit text from the runtime strings.
omdb_oscar_df['Runtime'] = omdb_oscar_df['Runtime'].str.replace('min', '')
omdb_oscar_df.T

omdb_oscar_df.to_excel(
    r'C:\Users\naveen.sanka\Documents\Data Viz Project\Individual\omdb_oscar_corr.xlsx')