# IMDB movie ratings

#### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd

## Read our data

In [3]:
# Location of the raw IMDB CSV (served from GitHub) that the notebook loads below.
url = "https://raw.githubusercontent.com/stiles/usc/main/data/raw/imdb_1000.csv"

#### Create a movies dataframe from the CSV

In [4]:
# Fetch the CSV over HTTPS and parse it into a DataFrame (needs network access).
movies = pd.read_csv(url)

In [5]:
movies

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."
...,...,...,...,...,...,...
974,7.4,Tootsie,PG,Comedy,116,"[u'Dustin Hoffman', u'Jessica Lange', u'Teri G..."
975,7.4,Back to the Future Part III,PG,Adventure,118,"[u'Michael J. Fox', u'Christopher Lloyd', u'Ma..."
976,7.4,Master and Commander: The Far Side of the World,PG-13,Action,138,"[u'Russell Crowe', u'Paul Bettany', u'Billy Bo..."
977,7.4,Poltergeist,PG,Horror,114,"[u'JoBeth Williams', u""Heather O'Rourke"", u'Cr..."


#### First row?

In [6]:
movies.head(1)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."


#### Use the `info()` method to see column types and non-null counts

In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 979 entries, 0 to 978
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   star_rating     979 non-null    float64
 1   title           979 non-null    object 
 2   content_rating  976 non-null    object 
 3   genre           979 non-null    object 
 4   duration        979 non-null    int64  
 5   actors_list     979 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 46.0+ KB


---

## Categorization exercise

#### What do you consider a 'long' movie? 

In [8]:
# "Long" threshold: one standard deviation above the mean running time.
durations = movies["duration"]
movies_long = durations.mean() + durations.std()

In [9]:
movies_long

147.19758083721905

#### Boolean categorization function

In [10]:
def categorize_long_movies(time, threshold=None):
    """Return True when a running time counts as 'long'.

    Parameters
    ----------
    time : number
        Running time to classify.
    threshold : number, optional
        Cut-off at or above which a movie is considered long. When omitted,
        the notebook-level ``movies_long`` value is used, so existing calls
        such as ``movies["duration"].apply(categorize_long_movies)`` keep
        working unchanged.

    Returns
    -------
    bool
        True if ``time >= threshold``, otherwise False.
    """
    if threshold is None:
        # Fall back to the global computed earlier in the notebook.
        threshold = movies_long
    # bool() keeps the return type a plain Python bool even for numpy inputs.
    return bool(time >= threshold)

#### Apply that to our duration column as a new column

In [11]:
# Classify every running time, then store the flags as a new boolean column.
long_flags = movies["duration"].apply(categorize_long_movies)
movies["is_long"] = long_flags

In [12]:
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...",False
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']",True
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv...",True
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E...",True
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L....",True


In [13]:
# Tally long vs. not-long movies (bracket access, like the rest of the notebook).
movies["is_long"].value_counts()

False    845
True     134
Name: is_long, dtype: int64

---

## Looping exercise

In [14]:
# Print one IMDB search link per distinct genre, in order of first appearance.
genre_search_base = "https://www.imdb.com/search/title/?genres="
for genre_name in movies["genre"].unique():
    print(genre_search_base + genre_name)

https://www.imdb.com/search/title/?genres=Crime
https://www.imdb.com/search/title/?genres=Action
https://www.imdb.com/search/title/?genres=Drama
https://www.imdb.com/search/title/?genres=Western
https://www.imdb.com/search/title/?genres=Adventure
https://www.imdb.com/search/title/?genres=Biography
https://www.imdb.com/search/title/?genres=Comedy
https://www.imdb.com/search/title/?genres=Animation
https://www.imdb.com/search/title/?genres=Mystery
https://www.imdb.com/search/title/?genres=Horror
https://www.imdb.com/search/title/?genres=Film-Noir
https://www.imdb.com/search/title/?genres=Sci-Fi
https://www.imdb.com/search/title/?genres=History
https://www.imdb.com/search/title/?genres=Thriller
https://www.imdb.com/search/title/?genres=Family
https://www.imdb.com/search/title/?genres=Fantasy


In [15]:
# Same link as the loop above, built for every row at once (vectorised concat).
genre_url_prefix = "https://www.imdb.com/search/title/?genres="
movies["genre_url"] = genre_url_prefix + movies["genre"]

In [16]:
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...",False,https://www.imdb.com/search/title/?genres=Crime
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']",True,https://www.imdb.com/search/title/?genres=Crime
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv...",True,https://www.imdb.com/search/title/?genres=Crime
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E...",True,https://www.imdb.com/search/title/?genres=Action
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L....",True,https://www.imdb.com/search/title/?genres=Crime


In [17]:
# https://www.imdb.com/search/title/?genres=comedy

---

### Questions

#### How many crime movies vs. other types? 

In [18]:
# Number of (non-null) titles in each genre.
titles_by_genre = movies.groupby("genre")["title"]
titles_by_genre.count()

genre
Action       136
Adventure     75
Animation     62
Biography     77
Comedy       156
Crime        124
Drama        278
Family         2
Fantasy        1
Film-Noir      3
History        1
Horror        29
Mystery       16
Sci-Fi         5
Thriller       5
Western        9
Name: title, dtype: int64

In [19]:
# Count rows whose genre is Crime by summing the boolean mask.
is_crime = movies["genre"] == "Crime"
int(is_crime.sum())

124

In [20]:
# Count rows whose genre is anything other than Crime.
is_not_crime = movies["genre"] != "Crime"
int(is_not_crime.sum())

855

#### Which movie featuring Jack Nicholson had the highest star rating? 

In [21]:
# Keep rows whose cast string mentions Jack Nicholson, then take the best-rated.
has_nicholson = movies["actors_list"].str.contains("Jack Nicholson")
movies[has_nicholson].sort_values("star_rating", ascending=False).head(1)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url
16,8.7,One Flew Over the Cuckoo's Nest,R,Drama,133,"[u'Jack Nicholson', u'Louise Fletcher', u'Mich...",False,https://www.imdb.com/search/title/?genres=Drama


#### How does the average duration of dramas compare with other genres? 

In [22]:
# Average running time per genre.
duration_by_genre = movies.groupby("genre")["duration"]
duration_by_genre.mean()

genre
Action       126.485294
Adventure    134.840000
Animation     96.596774
Biography    131.844156
Comedy       107.602564
Crime        122.298387
Drama        126.539568
Family       107.500000
Fantasy      112.000000
Film-Noir     97.333333
History       66.000000
Horror       102.517241
Mystery      115.625000
Sci-Fi       109.000000
Thriller     114.200000
Western      136.666667
Name: duration, dtype: float64

#### How many movies contain Al Pacino?

In [23]:
# Count the titles of rows whose cast string mentions Al Pacino.
has_pacino = movies["actors_list"].str.contains("Al Pacino")
movies.loc[has_pacino, "title"].count()

13

#### What's the longest PG-13 action movie? 

In [24]:
# Build each condition separately, combine them, then pick the longest match.
is_pg13 = movies["content_rating"] == "PG-13"
is_action = movies["genre"] == "Action"
movies[is_pg13 & is_action].sort_values("duration", ascending=False).head(1)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url
385,8.0,Spartacus,PG-13,Action,197,"[u'Kirk Douglas', u'Laurence Olivier', u'Jean ...",True,https://www.imdb.com/search/title/?genres=Action


#### Which genre has the highest star rating?

In [25]:
# Mean star rating per genre, highest first; keep only the top entry.
mean_rating_by_genre = movies.groupby("genre")["star_rating"].mean()
mean_rating_by_genre.sort_values(ascending=False).head(1)

genre
Western    8.255556
Name: star_rating, dtype: float64

#### How many movies contain the word 'city'? 

In [26]:
# Match "city" as a whole word regardless of capitalisation: lowercase
# occurrences are counted, and longer words that merely contain the letters
# (e.g. "capacity") are not. Missing titles count as non-matches.
has_city_word = movies["title"].str.contains(r"\bcity\b", case=False, na=False)
int(has_city_word.sum())

5

#### Do long movies have a higher average star rating than not long movies?

In [31]:
# Compare the average star rating of long and not-long movies.
rating_by_length = movies.groupby("is_long")["star_rating"]
rating_by_length.mean()

is_long
False    7.858698
True     8.085821
Name: star_rating, dtype: float64

#### Long movies are rated higher on average (about 8.09 vs. 7.86)

#### Are "G" rated movies more popular than "R" rated movies? 

In [36]:
# Restrict to G- and R-rated movies, then rank the two by mean star rating.
g_or_r = movies["content_rating"].isin(["G", "R"])
(
    movies[g_or_r]
    .groupby("content_rating")["star_rating"]
    .mean()
    .sort_values(ascending=False)
)

content_rating
G    7.990625
R    7.854783
Name: star_rating, dtype: float64

---

## Bonus questions! 

#### How many movies start with "The"?

In [40]:
# Require "The " including the trailing space so that only titles whose first
# word is "The" match; a bare three-character prefix check would also accept
# titles such as "There ..." or "Them". Missing titles count as non-matches.
movies[movies["title"].str.startswith("The ", na=False)]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,is_long,genre_url
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...",False,https://www.imdb.com/search/title/?genres=Crime
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']",True,https://www.imdb.com/search/title/?genres=Crime
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv...",True,https://www.imdb.com/search/title/?genres=Crime
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E...",True,https://www.imdb.com/search/title/?genres=Action
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ...",True,https://www.imdb.com/search/title/?genres=Western
...,...,...,...,...,...,...,...,...
960,7.4,The Way Way Back,PG-13,Comedy,103,"[u'Steve Carell', u'Toni Collette', u'Allison ...",False,https://www.imdb.com/search/title/?genres=Comedy
966,7.4,The Simpsons Movie,PG-13,Animation,87,"[u'Dan Castellaneta', u'Julie Kavner', u'Nancy...",False,https://www.imdb.com/search/title/?genres=Anim...
967,7.4,The Rock,R,Action,136,"[u'Sean Connery', u'Nicolas Cage', u'Ed Harris']",False,https://www.imdb.com/search/title/?genres=Action
968,7.4,The English Patient,R,Drama,162,"[u'Ralph Fiennes', u'Juliette Binoche', u'Will...",True,https://www.imdb.com/search/title/?genres=Drama


#### What movie title has the most characters in its name? 

#### What percentage of all the movies are in the crime genre? 

#### What percentage of movies featuring Meryl Streep are 'long' vs. 'not long'?