# IMDB movie ratings

#### Import python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd

## Read our data

In [3]:
url = "https://raw.githubusercontent.com/stiles/usc/main/data/raw/imdb_1000.csv"

#### Create a movies dataframe, and sort it by title

In [4]:
movies_df = pd.read_csv(url).sort_values("title")

#### First five rows?

In [5]:
movies_df.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
542,7.8,(500) Days of Summer,PG-13,Comedy,95,"[u'Zooey Deschanel', u'Joseph Gordon-Levitt', ..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
201,8.1,12 Years a Slave,R,Biography,134,"[u'Chiwetel Ejiofor', u'Michael Kenneth Willia..."
698,7.6,127 Hours,R,Adventure,94,"[u'James Franco', u'Amber Tamblyn', u'Kate Mara']"
110,8.3,2001: A Space Odyssey,G,Mystery,160,"[u'Keir Dullea', u'Gary Lockwood', u'William S..."


#### Use the `describe()` method to see the distribution

In [6]:
movies_df.describe()

Unnamed: 0,star_rating,duration
count,979.0,979.0
mean,7.889785,120.979571
std,0.336069,26.21801
min,7.4,64.0
25%,7.6,102.0
50%,7.8,117.0
75%,8.1,134.0
max,9.3,242.0


---

## Categorization exercise

#### What do you consider a 'long' movie? 

In [7]:
movie_long = 150

In [8]:
movie_long = movies_df["duration"].mean() + movies_df["duration"].std()

#### Boolean categorization function

In [9]:
def categorize_long_movies(time):
    if time >= movie_long:
        return True
    else:
        return False

#### Apply that to our duration column as a new column

In [10]:
movies_df["is_long"] = movies_df["duration"].apply(categorize_long_movies)

In [11]:
movies_df.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long
542,7.8,(500) Days of Summer,PG-13,Comedy,95,"[u'Zooey Deschanel', u'Joseph Gordon-Levitt', ...",False
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...",False
201,8.1,12 Years a Slave,R,Biography,134,"[u'Chiwetel Ejiofor', u'Michael Kenneth Willia...",False
698,7.6,127 Hours,R,Adventure,94,"[u'James Franco', u'Amber Tamblyn', u'Kate Mara']",False
110,8.3,2001: A Space Odyssey,G,Mystery,160,"[u'Keir Dullea', u'Gary Lockwood', u'William S...",True


In [12]:
movies_df.is_long.value_counts()

False    845
True     134
Name: is_long, dtype: int64

---

## Looping exercise

In [13]:
for g in movies_df["genre"].unique():
    print(g)

Comedy
Drama
Biography
Adventure
Mystery
Crime
Horror
Action
Animation
History
Sci-Fi
Western
Family
Thriller
Film-Noir
Fantasy


In [14]:
# https://www.imdb.com/search/title/?genres=comedy

In [15]:
base_url = "https://www.imdb.com/search/title/?genres"

In [16]:
for g in movies_df["genre"].unique():
    print("https://www.imdb.com/search/title/?genres=" + g.lower())

https://www.imdb.com/search/title/?genres=comedy
https://www.imdb.com/search/title/?genres=drama
https://www.imdb.com/search/title/?genres=biography
https://www.imdb.com/search/title/?genres=adventure
https://www.imdb.com/search/title/?genres=mystery
https://www.imdb.com/search/title/?genres=crime
https://www.imdb.com/search/title/?genres=horror
https://www.imdb.com/search/title/?genres=action
https://www.imdb.com/search/title/?genres=animation
https://www.imdb.com/search/title/?genres=history
https://www.imdb.com/search/title/?genres=sci-fi
https://www.imdb.com/search/title/?genres=western
https://www.imdb.com/search/title/?genres=family
https://www.imdb.com/search/title/?genres=thriller
https://www.imdb.com/search/title/?genres=film-noir
https://www.imdb.com/search/title/?genres=fantasy


In [17]:
movies_df["genre_url"] = "https://www.imdb.com/search/title/?genres=" + g.lower()

In [18]:
movies_df.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url
542,7.8,(500) Days of Summer,PG-13,Comedy,95,"[u'Zooey Deschanel', u'Joseph Gordon-Levitt', ...",False,https://www.imdb.com/search/title/?genres=fantasy
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...",False,https://www.imdb.com/search/title/?genres=fantasy
201,8.1,12 Years a Slave,R,Biography,134,"[u'Chiwetel Ejiofor', u'Michael Kenneth Willia...",False,https://www.imdb.com/search/title/?genres=fantasy
698,7.6,127 Hours,R,Adventure,94,"[u'James Franco', u'Amber Tamblyn', u'Kate Mara']",False,https://www.imdb.com/search/title/?genres=fantasy
110,8.3,2001: A Space Odyssey,G,Mystery,160,"[u'Keir Dullea', u'Gary Lockwood', u'William S...",True,https://www.imdb.com/search/title/?genres=fantasy


---

### Questions

#### How many crime movies vs. other types? 

In [19]:
movies_df.genre.value_counts()

Drama        278
Comedy       156
Action       136
Crime        124
Biography     77
Adventure     75
Animation     62
Horror        29
Mystery       16
Western        9
Thriller       5
Sci-Fi         5
Film-Noir      3
Family         2
Fantasy        1
History        1
Name: genre, dtype: int64

#### Which movie featuring Jack Nicholson had the highest star rating? 

In [20]:
movies_df[movies_df["actors_list"].str.contains("Jack Nicholson")].sort_values(
    "star_rating", ascending=False
)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url
16,8.7,One Flew Over the Cuckoo's Nest,R,Drama,133,"[u'Jack Nicholson', u'Louise Fletcher', u'Mich...",False,https://www.imdb.com/search/title/?genres=fantasy
49,8.5,The Departed,R,Crime,151,"[u'Leonardo DiCaprio', u'Matt Damon', u'Jack N...",True,https://www.imdb.com/search/title/?genres=fantasy
62,8.5,The Shining,R,Horror,146,"[u'Jack Nicholson', u'Shelley Duvall', u'Danny...",False,https://www.imdb.com/search/title/?genres=fantasy
130,8.3,Chinatown,R,Drama,130,"[u'Jack Nicholson', u'Faye Dunaway', u'John Hu...",False,https://www.imdb.com/search/title/?genres=fantasy
578,7.8,As Good as It Gets,PG-13,Comedy,139,"[u'Jack Nicholson', u'Helen Hunt', u'Greg Kinn...",False,https://www.imdb.com/search/title/?genres=fantasy
716,7.6,A Few Good Men,R,Drama,138,"[u'Tom Cruise', u'Jack Nicholson', u'Demi Moore']",False,https://www.imdb.com/search/title/?genres=fantasy
801,7.6,Batman,PG-13,Action,126,"[u'Michael Keaton', u'Jack Nicholson', u'Kim B...",False,https://www.imdb.com/search/title/?genres=fantasy
944,7.4,Terms of Endearment,R,Comedy,132,"[u'Shirley MacLaine', u'Debra Winger', u'Jack ...",False,https://www.imdb.com/search/title/?genres=fantasy
943,7.4,The Bucket List,PG-13,Adventure,97,"[u'Jack Nicholson', u'Morgan Freeman', u'Sean ...",False,https://www.imdb.com/search/title/?genres=fantasy


#### How does the average duration of dramas compare with other genres? 

In [21]:
movies_df.groupby(["genre"])["duration"].mean()

genre
Action       126.485294
Adventure    134.840000
Animation     96.596774
Biography    131.844156
Comedy       107.602564
Crime        122.298387
Drama        126.539568
Family       107.500000
Fantasy      112.000000
Film-Noir     97.333333
History       66.000000
Horror       102.517241
Mystery      115.625000
Sci-Fi       109.000000
Thriller     114.200000
Western      136.666667
Name: duration, dtype: float64

#### How many movies contain Al Pacino?

In [22]:
movies_df[movies_df["actors_list"].str.contains("Al Pacino")].sort_values(
    "star_rating", ascending=False
)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']",True,https://www.imdb.com/search/title/?genres=fantasy
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv...",True,https://www.imdb.com/search/title/?genres=fantasy
135,8.3,Heat,R,Action,170,"[u'Al Pacino', u'Robert De Niro', u'Val Kilmer']",True,https://www.imdb.com/search/title/?genres=fantasy
115,8.3,Scarface,R,Crime,170,"[u'Al Pacino', u'Michelle Pfeiffer', u'Steven ...",True,https://www.imdb.com/search/title/?genres=fantasy
278,8.1,Dog Day Afternoon,R,Crime,125,"[u'Al Pacino', u'John Cazale', u'Penelope Allen']",False,https://www.imdb.com/search/title/?genres=fantasy
374,8.0,Scent of a Woman,R,Drama,156,"[u'Al Pacino', u""Chris O'Donnell"", u'James Reb...",True,https://www.imdb.com/search/title/?genres=fantasy
436,7.9,Carlito's Way,R,Crime,144,"[u'Al Pacino', u'Sean Penn', u'Penelope Ann Mi...",False,https://www.imdb.com/search/title/?genres=fantasy
463,7.9,Glengarry Glen Ross,R,Drama,100,"[u'Al Pacino', u'Jack Lemmon', u'Alec Baldwin']",False,https://www.imdb.com/search/title/?genres=fantasy
423,7.9,The Insider,R,Biography,157,"[u'Russell Crowe', u'Al Pacino', u'Christopher...",True,https://www.imdb.com/search/title/?genres=fantasy
560,7.8,Donnie Brasco,R,Biography,127,"[u'Al Pacino', u'Johnny Depp', u'Michael Madsen']",False,https://www.imdb.com/search/title/?genres=fantasy


In [23]:
len(
    movies_df[movies_df["actors_list"].str.contains("Al Pacino")].sort_values(
        "star_rating", ascending=False
    )
)

13

#### What's the longest PG-13 action movie? 

In [24]:
movies_df[
    (movies_df["genre"] == "Action") & (movies_df["content_rating"] == "PG-13")
].sort_values("duration", ascending=False).head(1)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url
385,8.0,Spartacus,PG-13,Action,197,"[u'Kirk Douglas', u'Laurence Olivier', u'Jean ...",True,https://www.imdb.com/search/title/?genres=fantasy


#### Which genre has the highest star rating?

In [25]:
movies_df.groupby(["genre"])["star_rating"].mean().sort_values(ascending=False)

genre
Western      8.255556
Film-Noir    8.033333
History      8.000000
Mystery      7.975000
Adventure    7.933333
Sci-Fi       7.920000
Crime        7.916935
Animation    7.914516
Drama        7.902518
Action       7.884559
Biography    7.862338
Family       7.850000
Comedy       7.822436
Horror       7.806897
Fantasy      7.700000
Thriller     7.680000
Name: star_rating, dtype: float64

#### How many movies contain the word 'city'? 

In [26]:
movies_df[movies_df["genre"].str.contains("Action")]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url
570,7.8,300,R,Action,117,"[u'Gerard Butler', u'Lena Headey', u'David Wen...",False,https://www.imdb.com/search/title/?genres=fantasy
276,8.1,A Fistful of Dollars,R,Action,99,"[u'Clint Eastwood', u'Gian Maria Volont\xe9', ...",False,https://www.imdb.com/search/title/?genres=fantasy
75,8.4,Aliens,R,Action,137,"[u'Sigourney Weaver', u'Michael Biehn', u'Carr...",False,https://www.imdb.com/search/title/?genres=fantasy
529,7.8,Apocalypto,R,Action,139,"[u'Gerardo Taracena', u'Raoul Trujillo', u'Dal...",False,https://www.imdb.com/search/title/?genres=fantasy
433,7.9,Avatar,PG-13,Action,162,"[u'Sam Worthington', u'Zoe Saldana', u'Sigourn...",True,https://www.imdb.com/search/title/?genres=fantasy
...,...,...,...,...,...,...,...,...
532,7.8,X-Men: First Class,PG-13,Action,132,"[u'James McAvoy', u'Michael Fassbender', u'Jen...",False,https://www.imdb.com/search/title/?genres=fantasy
871,7.5,X2,PG-13,Action,134,"[u'Patrick Stewart', u'Hugh Jackman', u'Halle ...",False,https://www.imdb.com/search/title/?genres=fantasy
403,7.9,Ying xiong,PG-13,Action,99,"[u'Jet Li', u'Tony Chiu Wai Leung', u'Maggie C...",False,https://www.imdb.com/search/title/?genres=fantasy
235,8.1,Yip Man,R,Action,106,"[u'Donnie Yen', u'Simon Yam', u'Siu-Wong Fan']",False,https://www.imdb.com/search/title/?genres=fantasy


In [27]:
len(movies_df[movies_df["genre"].str.contains("Action")])

136

#### Do long movies have a higher average star rating than not long movies?

In [28]:
movies_df.groupby(["is_long"])["star_rating"].mean()

is_long
False    7.858698
True     8.085821
Name: star_rating, dtype: float64

#### Are "G" rated movies more popular than "R" rated movies? 

In [29]:
movies_df.groupby(["content_rating"])["star_rating"].mean().sort_values(ascending=False)

content_rating
PASSED       8.157143
NOT RATED    8.123077
TV-MA        8.100000
APPROVED     8.027660
UNRATED      7.994737
G            7.990625
GP           7.933333
X            7.925000
PG           7.879675
R            7.854783
PG-13        7.828571
NC-17        7.614286
Name: star_rating, dtype: float64

---

## Bonus questions! 

#### How many movies start with "The"?

In [30]:
the_movies = movies_df[movies_df["title"].str[:3] == "The"]

In [31]:
len(the_movies)

223

#### What movie title has the most characters in its name? 

In [32]:
movies_df["title_length"] = movies_df["title"].str.len()

In [33]:
movies_df[movies_df["title_length"] == movies_df["title_length"].max()]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url,title_length
48,8.5,Dr. Strangelove or: How I Learned to Stop Worr...,PG,Comedy,95,"[u'Peter Sellers', u'George C. Scott', u'Sterl...",False,https://www.imdb.com/search/title/?genres=fantasy,68


In [34]:
movies_df.sort_values("title_length", ascending=False).head(1)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url,title_length
48,8.5,Dr. Strangelove or: How I Learned to Stop Worr...,PG,Comedy,95,"[u'Peter Sellers', u'George C. Scott', u'Sterl...",False,https://www.imdb.com/search/title/?genres=fantasy,68


#### What percentage of all the movies are in the crime genre? 

In [35]:
movies_df.genre.value_counts(normalize=True) * 100

Drama        28.396323
Comedy       15.934627
Action       13.891726
Crime        12.665986
Biography     7.865169
Adventure     7.660878
Animation     6.332993
Horror        2.962206
Mystery       1.634321
Western       0.919305
Thriller      0.510725
Sci-Fi        0.510725
Film-Noir     0.306435
Family        0.204290
Fantasy       0.102145
History       0.102145
Name: genre, dtype: float64

#### What percentage of movies featuring Meryl Streep are 'long' vs. 'not long'?

In [36]:
meryl_streep = (
    movies_df[movies_df["actors_list"].str.contains("Meryl Streep")]
    .groupby(["is_long"])
    .size()
).reset_index(name="count")

In [37]:
meryl_streep["is_long_share"] = (
    meryl_streep["count"] / meryl_streep["count"].sum()
).round(2)

In [38]:
meryl_streep

Unnamed: 0,is_long,count,is_long_share
0,False,6,0.86
1,True,1,0.14
