# Steven's Question: Does title length affect rating?

In [1]:
import numpy as np
import pandas as pd
from glob import glob
import sqlite3
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# This pulls all the .csv files into one list of dataframes.

csv_files = glob('zippedData/*.csv.gz')
csv_dfs = [pd.read_csv(f) for f in csv_files]
len(csv_dfs)

9

In [4]:
# Assign the .csv files to human-readable variables.

# Has tconst, title, and release year.
imdb_basics = csv_dfs[3]

# Has user ratings.
imdb_ratings = csv_dfs[6]

In [5]:
filter_ = imdb_ratings.numvotes >= 1000
imdb_ratings[filter_]

Unnamed: 0,tconst,averagerating,numvotes
3,tt1043726,4.2,50352
6,tt1094666,7.0,1613
10,tt1171222,5.1,8296
11,tt1174693,5.8,2381
12,tt1181840,7.0,5494
...,...,...,...
73763,tt8443704,7.5,1947
73771,tt8564902,4.7,5863
73772,tt8574252,7.1,1526
73792,tt8948790,9.0,1778


In [6]:
# Create a SQL database for easy joins.

conn = sqlite3.connect('steven2.db')
cur = conn.cursor()

In [7]:
# Check to make sure SQL database was created.

In [10]:
ls

 Volume in drive C is OS
 Volume Serial Number is C084-E836

 Directory of C:\Users\Steven\class\mod1\movie-analysis

06/16/2020  11:47 AM    <DIR>          .
06/16/2020  11:47 AM    <DIR>          ..
06/16/2020  10:30 AM             1,973 .gitignore
06/16/2020  11:44 AM    <DIR>          .ipynb_checkpoints
06/16/2020  10:26 AM                16 README.md
06/14/2020  03:46 PM            60,165 steven_data_exploration.ipynb
06/16/2020  10:46 AM            34,649 steven_group_q_profit_runtime_cleaning.ipynb
06/16/2020  11:44 AM            43,476 steven_personal_q_cleaning.ipynb
06/16/2020  11:14 AM       127,225,856 steven_q.db
06/16/2020  11:47 AM        12,980,224 steven2.db
06/16/2020  11:46 AM            37,343 uuuuuuuuuuuuugggggggggggggggggggghhhhhhhhhhhhhhhhhh.ipynb
06/16/2020  10:28 AM    <DIR>          zippedData
               8 File(s)    140,383,702 bytes
               4 Dir(s)  42,870,190,080 bytes free


In [9]:
# Turn dataframes into tables on database.

imdb_basics.to_sql('basics', conn, if_exists='replace', index=False)
imdb_ratings.to_sql('ratings', conn, if_exists='replace', index=False)

In [11]:
# Commit database changes

conn.commit()

In [9]:
a = pd.DataFrame(cur.execute("""SELECT primary_title, start_year, runtime_minutes FROM basics WHERE start_year > 2017""").fetchall())
a.columns = [x[0] for x in cur.description]
a.head(15)

Unnamed: 0,primary_title,start_year,runtime_minutes
0,One Day Before the Rainy Season,2019,114.0
1,The Other Side of the Wind,2018,122.0
2,Sabse Bada Sukh,2018,
3,A Thin Life,2018,75.0
4,T.G.M. - osvoboditel,2018,60.0
5,Heaven & Hell,2018,104.0
6,Gangavataran,2018,134.0
7,Seven Jews from My Class,2018,40.0
8,On kadin,2019,
9,To Chase a Million,2018,97.0


In [14]:
b = pd.DataFrame(cur.execute("""SELECT *
                                FROM basics
                                JOIN budgets
                                ON basics.primary_title = budgets.movie""").fetchall())
b.columns = [x[0] for x in cur.description]
b

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,tt0249516,Foodfight!,Foodfight!,2012,91.0,"Action,Animation,Comedy",26,"Dec 31, 2012",Foodfight!,"$45,000,000",$0,"$73,706"
1,tt0293429,Mortal Kombat,Mortal Kombat,2021,,"Action,Adventure,Fantasy",10,"Aug 18, 1995",Mortal Kombat,"$20,000,000","$70,433,227","$122,133,227"
2,tt0326592,The Overnight,The Overnight,2010,88.0,,21,"Jun 19, 2015",The Overnight,"$200,000","$1,109,808","$1,165,996"
3,tt0337692,On the Road,On the Road,2012,124.0,"Adventure,Drama,Romance",17,"Mar 22, 2013",On the Road,"$25,000,000","$720,828","$9,313,302"
4,tt0359950,The Secret Life of Walter Mitty,The Secret Life of Walter Mitty,2013,114.0,"Adventure,Comedy,Drama",37,"Dec 25, 2013",The Secret Life of Walter Mitty,"$91,000,000","$58,236,838","$187,861,183"
...,...,...,...,...,...,...,...,...,...,...,...,...
3810,tt9877596,Trapped,Trapped,2016,,,87,"Sep 20, 2002",Trapped,"$30,000,000","$6,916,869","$6,916,869"
3811,tt9889072,The Promise,The Promise,2017,,Drama,78,"Apr 21, 2017",The Promise,"$90,000,000","$8,224,288","$10,551,417"
3812,tt9893078,Sublime,Sublime,2019,,Documentary,30,"Mar 13, 2007",Sublime,"$1,800,000",$0,$0
3813,tt9899880,Columbus,Columbus,2018,85.0,Comedy,93,"Aug 4, 2017",Columbus,"$700,000","$1,017,107","$1,110,511"


In [50]:
c = pd.DataFrame(cur.execute("""SELECT p.tconst, r.numvotes, r.averagerating, p.category
                                FROM principals p
                                JOIN ratings r
                                ON p.tconst = r.tconst""").fetchall())
c.columns = [x[0] for x in cur.description]
c

Unnamed: 0,tconst,numvotes,averagerating,category
0,tt0323808,2328,3.9,editor
1,tt0323808,2328,3.9,actress
2,tt0323808,2328,3.9,actor
3,tt0323808,2328,3.9,actor
4,tt0323808,2328,3.9,actress
...,...,...,...,...
629750,tt9681728,34,6.6,director
629751,tt9681728,34,6.6,producer
629752,tt9681728,34,6.6,cinematographer
629753,tt9681728,34,6.6,actor


In [58]:
filt = (c.category == 'actor')
c[filt]

Unnamed: 0,tconst,numvotes,averagerating,category
2,tt0323808,2328,3.9,actor
3,tt0323808,2328,3.9,actor
11,tt0417610,80,6.4,actor
13,tt0417610,80,6.4,actor
23,tt0469152,88,7.2,actor
...,...,...,...,...
629747,tt9681728,34,6.6,actor
629748,tt9681728,34,6.6,actor
629749,tt9681728,34,6.6,actor
629753,tt9681728,34,6.6,actor


In [71]:
c.groupby('tconst').sum()

Unnamed: 0_level_0,numvotes,averagerating
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0063540,770,70.0
tt0066787,301,50.4
tt0069049,45170,69.0
tt0069204,130,61.0
tt0100275,1190,65.0
...,...,...
tt9913084,60,62.0
tt9914286,1224,78.3
tt9914642,56,59.5
tt9914942,40,52.8
