# IMDB movie ratings

#### Import python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd

## Read our data

In [4]:
url = "https://raw.githubusercontent.com/stiles/usc/main/data/raw/imdb_1000.csv"

#### Create a movies dataframe, and sort it by title

In [5]:
movies_df = pd.read_csv(url)

#### First five rows?

In [6]:
movies_df.head(5)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


#### Use the `describe()` method to see the distribution

In [7]:
movies_df.describe()

Unnamed: 0,star_rating,duration
count,979.0,979.0
mean,7.889785,120.979571
std,0.336069,26.21801
min,7.4,64.0
25%,7.6,102.0
50%,7.8,117.0
75%,8.1,134.0
max,9.3,242.0


---

## Categorization exercise

#### What do you consider a 'long' movie? 

In [8]:
movies_long = 175

In [9]:
movies_long = movies_df["duration"].mean() + movies_df["duration"].std()

In [10]:
movies_long

147.19758083721905

#### Boolean categorization function

In [13]:
def categorize_long_movies(time):
    if time >= movies_long:
        return True
    else:
        return False

#### Apply that to our duration column as a new column

In [14]:
movies_df["is_long"] = movies_df["duration"].apply(categorize_long_movies)

In [15]:
movies_df.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...",False
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']",True
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv...",True
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E...",True
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L....",True


In [16]:
movies_df.is_long.value_counts()

False    845
True     134
Name: is_long, dtype: int64

---

## Looping exercise

In [4]:
# https://www.imdb.com/search/title/?genres=comedy

In [18]:
for g in movies_df["genre"].unique():
    print("https://www.imdb.com/search/title/?genres=" + g)

https://www.imdb.com/search/title/?genres=Crime
https://www.imdb.com/search/title/?genres=Action
https://www.imdb.com/search/title/?genres=Drama
https://www.imdb.com/search/title/?genres=Western
https://www.imdb.com/search/title/?genres=Adventure
https://www.imdb.com/search/title/?genres=Biography
https://www.imdb.com/search/title/?genres=Comedy
https://www.imdb.com/search/title/?genres=Animation
https://www.imdb.com/search/title/?genres=Mystery
https://www.imdb.com/search/title/?genres=Horror
https://www.imdb.com/search/title/?genres=Film-Noir
https://www.imdb.com/search/title/?genres=Sci-Fi
https://www.imdb.com/search/title/?genres=History
https://www.imdb.com/search/title/?genres=Thriller
https://www.imdb.com/search/title/?genres=Family
https://www.imdb.com/search/title/?genres=Fantasy


In [20]:
movies_df["genre_url"] = (
    "https://www.imdb.com/search/title/?genres=" + movies_df["genre"]
)

In [21]:
movies_df.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...",False,https://www.imdb.com/search/title/?genres=Crime
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']",True,https://www.imdb.com/search/title/?genres=Crime
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv...",True,https://www.imdb.com/search/title/?genres=Crime
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E...",True,https://www.imdb.com/search/title/?genres=Action
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L....",True,https://www.imdb.com/search/title/?genres=Crime


---

### Questions

#### How many crime movies vs. other types? 

In [28]:
movies_df[movies_df["genre"] == "Crime"].value_counts()

star_rating  title                     content_rating  genre  duration  actors_list                                                          is_long  genre_url                                      
7.4          Bound                     R               Crime  108       [u'Jennifer Tilly', u'Gina Gershon', u'Joe Pantoliano']              False    https://www.imdb.com/search/title/?genres=Crime    1
8.0          Rope                      PG              Crime  80        [u'James Stewart', u'John Dall', u'Farley Granger']                  False    https://www.imdb.com/search/title/?genres=Crime    1
8.1          Strangers on a Train      APPROVED        Crime  101       [u'Farley Granger', u'Robert Walker', u'Ruth Roman']                 False    https://www.imdb.com/search/title/?genres=Crime    1
             Sin City                  R               Crime  124       [u'Mickey Rourke', u'Clive Owen', u'Bruce Willis']                   False    https://www.imdb.com/search/title/?genres=C

In [26]:
movies_df.genre.value_counts()

Drama        278
Comedy       156
Action       136
Crime        124
Biography     77
Adventure     75
Animation     62
Horror        29
Mystery       16
Western        9
Sci-Fi         5
Thriller       5
Film-Noir      3
Family         2
History        1
Fantasy        1
Name: genre, dtype: int64

#### Which movie featuring Jack Nicholson had the highest star rating? 

In [43]:
movies_df[movies_df["actors_list"].str.contains("Jack Nicholson")].sort_values(
    "star_rating", ascending=True
)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url
943,7.4,The Bucket List,PG-13,Adventure,97,"[u'Jack Nicholson', u'Morgan Freeman', u'Sean ...",False,https://www.imdb.com/search/title/?genres=Adve...
944,7.4,Terms of Endearment,R,Comedy,132,"[u'Shirley MacLaine', u'Debra Winger', u'Jack ...",False,https://www.imdb.com/search/title/?genres=Comedy
716,7.6,A Few Good Men,R,Drama,138,"[u'Tom Cruise', u'Jack Nicholson', u'Demi Moore']",False,https://www.imdb.com/search/title/?genres=Drama
801,7.6,Batman,PG-13,Action,126,"[u'Michael Keaton', u'Jack Nicholson', u'Kim B...",False,https://www.imdb.com/search/title/?genres=Action
578,7.8,As Good as It Gets,PG-13,Comedy,139,"[u'Jack Nicholson', u'Helen Hunt', u'Greg Kinn...",False,https://www.imdb.com/search/title/?genres=Comedy
130,8.3,Chinatown,R,Drama,130,"[u'Jack Nicholson', u'Faye Dunaway', u'John Hu...",False,https://www.imdb.com/search/title/?genres=Drama
49,8.5,The Departed,R,Crime,151,"[u'Leonardo DiCaprio', u'Matt Damon', u'Jack N...",True,https://www.imdb.com/search/title/?genres=Crime
62,8.5,The Shining,R,Horror,146,"[u'Jack Nicholson', u'Shelley Duvall', u'Danny...",False,https://www.imdb.com/search/title/?genres=Horror
16,8.7,One Flew Over the Cuckoo's Nest,R,Drama,133,"[u'Jack Nicholson', u'Louise Fletcher', u'Mich...",False,https://www.imdb.com/search/title/?genres=Drama


In [39]:
###actors = movies_df["actors_list"]

#### How does the average duration of dramas compare with other genres? 

In [44]:
movies_df.groupby(["genre"]).mean("duration")

Unnamed: 0_level_0,star_rating,duration,is_long
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Action,7.884559,126.485294,0.139706
Adventure,7.933333,134.84,0.293333
Animation,7.914516,96.596774,0.0
Biography,7.862338,131.844156,0.233766
Comedy,7.822436,107.602564,0.044872
Crime,7.916935,122.298387,0.145161
Drama,7.902518,126.539568,0.165468
Family,7.85,107.5,0.0
Fantasy,7.7,112.0,0.0
Film-Noir,8.033333,97.333333,0.0


In [45]:
movies_df.groupby(["genre"])["duration"].mean()

genre
Action       126.485294
Adventure    134.840000
Animation     96.596774
Biography    131.844156
Comedy       107.602564
Crime        122.298387
Drama        126.539568
Family       107.500000
Fantasy      112.000000
Film-Noir     97.333333
History       66.000000
Horror       102.517241
Mystery      115.625000
Sci-Fi       109.000000
Thriller     114.200000
Western      136.666667
Name: duration, dtype: float64

#### How many movies contain Al Pacino?

In [46]:
len(movies_df[movies_df["actors_list"].str.contains("Al Pacino")])

13

#### What's the longest PG-13 action movie? 

In [51]:
action = movies_df[movies_df["genre"] == "Action"]

In [66]:
action[action["content_rating"] == "PG-13"].sort_values(
    "duration", ascending=False
).head(5)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url
385,8.0,Spartacus,PG-13,Action,197,"[u'Kirk Douglas', u'Laurence Olivier', u'Jean ...",True,https://www.imdb.com/search/title/?genres=Action
684,7.7,The Big Blue,PG-13,Action,168,"[u'Jean-Marc Barr', u'Jean Reno', u'Rosanna Ar...",True,https://www.imdb.com/search/title/?genres=Action
43,8.5,The Dark Knight Rises,PG-13,Action,165,"[u'Christian Bale', u'Tom Hardy', u'Anne Hatha...",True,https://www.imdb.com/search/title/?genres=Action
433,7.9,Avatar,PG-13,Action,162,"[u'Sam Worthington', u'Zoe Saldana', u'Sigourn...",True,https://www.imdb.com/search/title/?genres=Action
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E...",True,https://www.imdb.com/search/title/?genres=Action


#### Which genre has the highest star rating?

In [65]:
movies_df.groupby(["genre"])["star_rating"].mean().sort_values(ascending=False)

genre
Western      8.255556
Film-Noir    8.033333
History      8.000000
Mystery      7.975000
Adventure    7.933333
Sci-Fi       7.920000
Crime        7.916935
Animation    7.914516
Drama        7.902518
Action       7.884559
Biography    7.862338
Family       7.850000
Comedy       7.822436
Horror       7.806897
Fantasy      7.700000
Thriller     7.680000
Name: star_rating, dtype: float64

#### How many movies contain the word 'city'? 

In [74]:
len(movies_df[movies_df["title"].str.contains("City", case=False)])

5

#### Do long movies have a higher average star rating than not long movies?

In [76]:
movies_df.groupby(["is_long"])["star_rating"].mean()

is_long
False    7.858698
True     8.085821
Name: star_rating, dtype: float64

#### Are "G" rated movies more popular than "R" rated movies? 

In [81]:
ratings_movies = movies_df.groupby(["content_rating"])["star_rating"]

In [82]:
ratings_movies.sort_values(["star_rating"])

AttributeError: 'SeriesGroupBy' object has no attribute 'sort_values'

---

## Bonus questions! 

#### How many movies start with "The"?

#### What movie title has the most characters in its name? 

#### What percentage of all the movies are in the crime genre? 

#### What percentage of movies featuring Meryl Streep are 'long' vs. 'not long'?