# Criteria for High-Rated Movies
## Loading Data

In [13]:
import pandas as pd
# Source CSV lives in the project's GitHub repository; it is read with latin1 encoding.
data_url = 'https://raw.githubusercontent.com/scalabretta/GroupProject--IMDB/master/Dataset/IMDB-database-update.csv'
imdb = pd.read_csv(data_url, encoding='latin1')
# Preview the first five rows.
imdb.head(5)

Unnamed: 0,movie_ID,movie_title,duration,color,title_year,country,language,content_rating,budget_USD,gross_USD,...,actor_3_facebook?,actor_3_facebook_likes,cast_total_facebook_likes,movie_facebook?,movie_facebook_likes;,facenumber_in_poster,num_voted_users,num_user_for_reviews,imdb_score,imdb_top_250
0,1,Avatar,178,1,2009,USA,English,PG-13,237000000,760505847,...,1,855,4834,1,33000,0,886204,3054,7.9,0
1,2,Pirates of the Caribbean: At World's End,169,1,2007,USA,English,PG-13,300000000,309404152,...,1,1000,48350,0,0,0,471220,1238,7.1,0
2,3,The Dark Knight Rises,164,1,2012,USA,English,PG-13,250000000,448130642,...,1,23000,106759,1,164000,0,1144337,2701,8.5,1
3,4,Spider-Man 3,156,1,2007,USA,English,PG-13,258000000,336530303,...,1,4000,46055,0,0,0,383056,1902,6.2,0
4,5,Batman v Superman: Dawn of Justice,183,1,2016,USA,English,PG-13,250000000,330249062,...,1,2000,24450,1,197000,0,371639,3018,6.9,0


## Data Cleaning

In [44]:
# Number of missing values in each column.
imdb.isna().sum()

movie_ID                     0
movie_title                  0
duration                     0
color                        0
title_year                   0
country                      0
language                     0
content_rating               0
budget_USD                   0
gross_USD                    0
net_USD                      0
profitable                   0
director_name                0
director_top20?              0
director_facebook?           0
director_facebook_likes      0
actor_1_name                 0
actor_top20?                 0
actor_1_facebook?            0
actor_1_facebook_likes       0
actor_2_name                 0
actor_top20?.1               0
actor_2_facebook?            0
actor_2_facebook_likes       0
actor_3_name                 0
actor_top20?.2               0
actor_3_facebook?            0
actor_3_facebook_likes       0
cast_total_facebook_likes    0
movie_facebook?              0
movie_facebook_likes;        0
facenumber_in_poster         0
num_vote

In [42]:
# Keep only the rows whose 'director_top20?' flag is 0 or 1, then store the flag as int.
# The check is done on the string form of the column so the cell can be re-run safely:
# after the first run the column is already int, and comparing ints against the
# strings '0' / '1' would match nothing and silently empty the frame.
director_flag = imdb['director_top20?'].astype(str)
# .copy() detaches the filtered rows from the original frame, so the column
# assignment below does not trigger SettingWithCopyWarning.
imdb = imdb.loc[director_flag.isin(['0', '1'])].copy()
imdb['director_top20?'] = imdb['director_top20?'].astype(int)

## Features Selection

In [43]:
# List every column whose dtype is not 'object', i.e. the non-text columns.
imdb.select_dtypes(exclude=['object']).columns

Index(['movie_ID', 'duration', 'color', 'title_year', 'budget_USD',
       'gross_USD', 'net_USD', 'profitable', 'director_top20?',
       'director_facebook?', 'director_facebook_likes', 'actor_top20?',
       'actor_1_facebook?', 'actor_1_facebook_likes', 'actor_top20?.1',
       'actor_2_facebook?', 'actor_2_facebook_likes', 'actor_top20?.2',
       'actor_3_facebook?', 'actor_3_facebook_likes',
       'cast_total_facebook_likes', 'movie_facebook?', 'movie_facebook_likes;',
       'facenumber_in_poster', 'num_voted_users', 'num_user_for_reviews',
       'imdb_score', 'imdb_top_250'],
      dtype='object')

In [7]:
# Columns used for the t-tests and the correlation ranking below.
# This is the non-object column listing shown above without 'movie_ID'.
numeric_variable = [
    # movie attributes and financials
    'duration',
    'color',
    'title_year',
    'budget_USD',
    'gross_USD',
    'net_USD',
    'profitable',
    # director
    'director_top20?',
    'director_facebook?',
    'director_facebook_likes',
    # first-billed actor
    'actor_top20?',
    'actor_1_facebook?',
    'actor_1_facebook_likes',
    # second-billed actor
    'actor_top20?.1',
    'actor_2_facebook?',
    'actor_2_facebook_likes',
    # third-billed actor
    'actor_top20?.2',
    'actor_3_facebook?',
    'actor_3_facebook_likes',
    # cast / movie popularity
    'cast_total_facebook_likes',
    'movie_facebook?',
    'movie_facebook_likes;',
    'facenumber_in_poster',
    'num_voted_users',
    'num_user_for_reviews',
    # rating columns
    'imdb_score',
    'imdb_top_250',
]

### t-test

In [61]:
from scipy import stats

In [71]:
# Welch's t-test (equal_var=False) on every column in numeric_variable, comparing
# movies scored above 7.5 with all other movies; prints each column whose group
# means differ at the 5% significance level.
#
# The two groups are derived here from 'imdb_score' instead of using the
# imdb_high / imdb_low frames: those are only created further down the notebook,
# so referencing them here fails with NameError on a top-to-bottom run.
#
# NOTE: 'imdb_score' is itself in numeric_variable; since it defines the groups,
# it is significant by construction.
is_high_score = imdb['imdb_score'] > 7.5  # same cut-off the score_group cell uses
for item in numeric_variable:
    result = stats.ttest_ind(
        imdb.loc[is_high_score, item],
        imdb.loc[~is_high_score, item],
        equal_var=False,
    )
    if result.pvalue < 0.05:
        print(item)

duration
color
gross_USD
net_USD
profitable
director_top20?
director_facebook?
director_facebook_likes
actor_top20?
actor_1_facebook_likes
actor_top20?.1
actor_2_facebook_likes
actor_top20?.2
actor_3_facebook_likes
cast_total_facebook_likes
movie_facebook?
movie_facebook_likes;
facenumber_in_poster
num_voted_users
num_user_for_reviews
imdb_score
imdb_top_250


### Correlation value

In [78]:
# Absolute correlation of every numeric column with 'imdb_score', weakest first.
imdb[numeric_variable].corr()['imdb_score'].abs().sort_values()

movie_facebook?              0.005919
actor_1_facebook?            0.007474
budget_USD                   0.029899
net_USD                      0.035675
actor_2_facebook?            0.041763
actor_3_facebook?            0.043454
actor_top20?.2               0.060296
actor_3_facebook_likes       0.066476
facenumber_in_poster         0.067903
actor_1_facebook_likes       0.092281
actor_top20?.1               0.099312
director_top20?              0.103105
actor_2_facebook_likes       0.104257
cast_total_facebook_likes    0.106400
color                        0.118755
actor_top20?                 0.131350
director_facebook?           0.140049
profitable                   0.191339
director_facebook_likes      0.193517
gross_USD                    0.215737
movie_facebook_likes;        0.281011
num_user_for_reviews         0.323491
duration                     0.366436
imdb_top_250                 0.371100
num_voted_users              0.479143
imdb_score                   1.000000
Name: imdb_s

Chosen features/variables (those whose absolute correlation with `imdb_score` is above 0.2, excluding `imdb_score` itself):
* gross_USD                   
* movie_facebook_likes;        
* num_user_for_reviews
* duration
* imdb_top_250
* num_voted_users

## Creating New Variable

In [45]:
# Label every movie by its IMDb score: strictly above 7.5 is "High", anything else is "Low".
score_group = imdb['imdb_score'].map(lambda score: "High" if score > 7.5 else "Low").tolist()

In [46]:
# Attach the High/Low labels as a new 'score_group' column.
# assign() returns a new frame rather than writing into `imdb` in place; `imdb` was
# produced by boolean filtering earlier, and an in-place column assignment on such
# a frame triggers SettingWithCopyWarning.
imdb = imdb.assign(score_group=score_group)

## Slicing Dataframe

In [48]:
# Split the frame into one sub-frame per score group.
is_high = imdb['score_group'].eq("High")
is_low = imdb['score_group'].eq("Low")
imdb_high = imdb.loc[is_high]
imdb_low = imdb.loc[is_low]

## The Differences between high and low rated movies

In [76]:
# Mean of each chosen feature among the high-rated movies, rounded to 2 decimals.
chosen_features = ['gross_USD', 'movie_facebook_likes;', 'num_user_for_reviews',
                   'duration', 'imdb_top_250', 'num_voted_users']
imdb_high.loc[:, chosen_features].mean().round(2)

gross_USD                84321188.79
movie_facebook_likes;       24933.83
num_user_for_reviews          690.18
duration                      125.52
imdb_top_250                    0.29
num_voted_users            295918.34
dtype: float64

In [77]:
# Mean of each chosen feature among the low-rated movies, rounded to 2 decimals.
chosen_features = ['gross_USD', 'movie_facebook_likes;', 'num_user_for_reviews',
                   'duration', 'imdb_top_250', 'num_voted_users']
imdb_low.loc[:, chosen_features].mean().round(2)

gross_USD                46266640.45
movie_facebook_likes;        6684.64
num_user_for_reviews          272.21
duration                      107.37
imdb_top_250                    0.00
num_voted_users             72233.04
dtype: float64

From the group means above, we can conclude that, on average:
* High-rated movies gross more money
* High-rated movies receive more *'likes'* on Facebook
* High-rated movies receive more user reviews on IMDb
* High-rated movies have a longer duration
* High-rated movies receive more votes from users