In [1]:
import os

import pandas as pd
import pymysql

In [2]:
# Connection settings are read from the environment so real credentials never have to be
# written into the notebook; the fallbacks are the values this notebook originally used
# (a local MySQL server with the Sakila sample database).
conn = pymysql.connect(
    host=os.environ.get("SAKILA_DB_HOST", "127.0.0.1"),
    user=os.environ.get("SAKILA_DB_USER", "root"),
    password=os.environ.get("SAKILA_DB_PASSWORD", "root"),  # `passwd=` is a deprecated alias
    database=os.environ.get("SAKILA_DB_NAME", "sakila"),  # `db=` is a deprecated alias
    port=int(os.environ.get("SAKILA_DB_PORT", "3306")),
    charset="utf8",
)
try:
    # Pull each table into memory once; the queries in this notebook run against these frames.
    film_df = pd.read_sql("select * from film;", conn)
    film_actor_df = pd.read_sql("select * from film_actor;", conn)
    actor_df = pd.read_sql("select * from actor;", conn)
finally:
    # Always release the connection, even if one of the reads fails.
    # NOTE(review): assumes no later cell queries through `conn` -- confirm, or reconnect there.
    conn.close()


# desc

In [3]:
# desc film;
# film_id	smallint(5) unsigned	NO	PRI		auto_increment
# title	varchar(255)	NO	MUL		
# description	text	YES			
# release_year	year(4)	YES			
# language_id	tinyint(3) unsigned	NO	MUL		
# original_language_id	tinyint(3) unsigned	YES	MUL		
# rental_duration	tinyint(3) unsigned	NO		3	
# rental_rate	decimal(4,2)	NO		4.99	
# length	smallint(5) unsigned	YES			
# replacement_cost	decimal(5,2)	NO		19.99	
# rating	enum('G','PG','PG-13','R','NC-17')	YES		G	
# special_features	set('Trailers','Commentaries','Deleted Scenes','Behind the Scenes')	YES			
# last_update	timestamp	NO		CURRENT_TIMESTAMP	on update CURRENT_TIMESTAMP

In [5]:
# pandas counterpart of `desc film;`: prints each column's dtype and non-null count.
film_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
film_id                 1000 non-null int64
title                   1000 non-null object
description             1000 non-null object
release_year            1000 non-null int64
language_id             1000 non-null int64
original_language_id    0 non-null object
rental_duration         1000 non-null int64
rental_rate             1000 non-null float64
length                  1000 non-null int64
replacement_cost        1000 non-null float64
rating                  1000 non-null object
special_features        1000 non-null object
last_update             1000 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(5), object(5)
memory usage: 101.6+ KB


In [6]:
# Column labels as a NumPy array.
film_df.columns.values

array(['film_id', 'title', 'description', 'release_year', 'language_id',
       'original_language_id', 'rental_duration', 'rental_rate', 'length',
       'replacement_cost', 'rating', 'special_features', 'last_update'], dtype=object)

In [7]:
# (number of rows, number of columns)
film_df.shape

(1000, 13)

# select 

In [None]:
# select * 
# from film;
# The frame itself is the full, unfiltered result set.
film_df

In [9]:
# select * 
# from film 
# limit 3;
film_df.head(3) # the preview can also be transposed by appending .T: film_df.head(3).T

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,1,,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42


In [18]:
# select film_id, title, description
# from film
# limit 10, 5;

# MySQL's `limit 10, 5` means offset 10, row count 5, so the positional
# slice runs from 10 up to (but not including) 10 + 5.
film_df.iloc[10:10 + 5][["film_id", "title", "description"]]

Unnamed: 0,film_id,title,description
10,11,ALAMO VIDEOTAPE,A Boring Epistle of a Butler And a Cat who mus...
11,12,ALASKA PHANTOM,A Fanciful Saga of a Hunter And a Pastry Chef ...
12,13,ALI FOREVER,A Action-Packed Drama of a Dentist And a Croco...
13,14,ALICE FANTASIA,A Emotional Drama of a A Shark And a Database ...
14,15,ALIEN CENTER,A Brilliant Drama of a Cat And a Mad Scientist...


In [29]:
# select distinct rating
# from film;

# Distinct values in order of first appearance.
pd.unique(film_df["rating"])

array(['PG', 'G', 'NC-17', 'PG-13', 'R'], dtype=object)

# count

In [19]:
# select count(*)
# from film;

# The first entry of .shape is the row count.
film_df.shape[0]

1000

In [30]:
# select count(distinct rating)
# from film;

# COUNT(DISTINCT ...) ignores NULLs. Series.nunique() skips NaN by default and so matches it,
# whereas len(Series.unique()) would count NaN as one extra value (`rating` is nullable).
film_df["rating"].nunique()

5

# where

In [20]:
# select film_id, title, description
# from film
# where film_id = 10;

# Row filter (boolean mask) and column projection in a single .loc lookup.
film_df.loc[film_df["film_id"] == 10, ["film_id", "title", "description"]]

Unnamed: 0,film_id,title,description
9,10,ALADDIN CALENDAR,A Action-Packed Tale of a Man And a Lumberjack...


In [None]:
# select film_id, title, description
# from film
# where rental_rate > 2
#       and length < 120;

# Each condition needs its own parentheses because `&` binds tighter than the comparisons.
film_df.loc[
    (film_df["rental_rate"] > 2) & (film_df["length"] < 120),
    ["film_id", "title", "description"],
]

# order by

In [None]:
# Sort films by rental rate, lowest first.
# select rental_rate, film_id, title, description
# from film
# order by rental_rate

# sort_values() is ascending by default and returns a new frame; film_df is not modified.
film_df.sort_values(by="rental_rate")[["rental_rate", "film_id", "title", "description"]]

In [None]:
# Sort films by rental rate, highest first.
# select rental_rate, film_id, title, description
# from film
# order by rental_rate desc

# ascending=False gives descending order; the result is a new frame, film_df is not modified.
film_df.sort_values(by="rental_rate", ascending=False)[["rental_rate", "film_id", "title", "description"]]

# group by

In [43]:
# select rating, count(*)
# from film
# group by rating

# value_counts() tallies the rows per distinct rating, most frequent first.
film_df["rating"].value_counts()

PG-13    223
NC-17    210
R        195
PG       194
G        178
Name: rating, dtype: int64

In [37]:
# select release_year, rating, count(*)
# from film
# group by release_year, rating

# size() yields a Series indexed by (release_year, rating);
# reset_index() would turn it into a DataFrame.
film_df.groupby(by=["release_year", "rating"]).size()

release_year  rating
2006          G         178
              NC-17     210
              PG        194
              PG-13     223
              R         195
dtype: int64

In [39]:
# select release_year, rating, count(*) as counts
# from film
# group by release_year, rating

# reset_index(name=...) converts the grouped Series into a DataFrame and
# labels the count column in the same step.
(
    film_df
    .groupby(by=["release_year", "rating"])
    .size()
    .reset_index(name="counts")
)

Unnamed: 0,release_year,rating,counts
0,2006,G,178
1,2006,NC-17,210
2,2006,PG,194
3,2006,PG-13,223
4,2006,R,195


In [45]:
# select  rating , count(distinct rental_duration)  as rental_duration_type_count
# from  film
# group by rating;

# nunique() counts the distinct rental_duration values inside each rating group.
(
    film_df
    .groupby("rating")["rental_duration"]
    .nunique()
    .reset_index(name="rental_duration_type_count")
)

Unnamed: 0,rating,rental_duration_type_count
0,G,5
1,NC-17,5
2,PG,5
3,PG-13,5
4,R,5


In [76]:
# select rating, count(distinct length) as length_distinct_count, avg(length) as length_mean, avg(rental_rate) as rental_rate_mean
# from film
# group by rating;

# Nested-dict renaming inside .agg() ({"col": {"alias": func}}) was deprecated in pandas 0.20
# and raises SpecificationError from pandas 1.0 on. Aggregate with a plain
# column -> [functions] mapping instead, then relabel the header so it keeps the same
# two-level (source column, alias) layout as before.
new_df = film_df.groupby("rating").agg({"length": ["nunique", "mean"], "rental_rate": ["mean"]})
# Labels are assigned by position: length/nunique, length/mean, rental_rate/mean.
new_df.columns = pd.MultiIndex.from_tuples([
    ("length", "length_distinct_count"),
    ("length", "length_mean"),
    ("rental_rate", "rental_rate_mean"),
])
new_df

Unnamed: 0_level_0,length,length,rental_rate
Unnamed: 0_level_1,length_distinct_count,length_mean,rental_rate_mean
rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
G,109,111.050562,2.888876
NC-17,110,113.228571,2.970952
PG,106,112.005155,3.051856
PG-13,116,120.443946,3.034843
R,103,118.661538,2.938718


In [77]:
# Drop the outer (source column) level so only the alias names remain as column labels.
# The guard makes the cell safe to re-run: an unguarded droplevel fails once the header
# is already flat.
if new_df.columns.nlevels > 1:
    new_df.columns = new_df.columns.droplevel(0)
new_df

Unnamed: 0_level_0,length_distinct_count,length_mean,rental_rate_mean
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G,109,111.050562,2.888876
NC-17,110,113.228571,2.970952
PG,106,112.005155,3.051856
PG-13,116,120.443946,3.034843
R,103,118.661538,2.938718


In [78]:
# Move `rating` out of the index into a regular column; this returns a new frame.
new_df.reset_index()

Unnamed: 0,rating,length_distinct_count,length_mean,rental_rate_mean
0,G,109,111.050562,2.888876
1,NC-17,110,113.228571,2.970952
2,PG,106,112.005155,3.051856
3,PG-13,116,120.443946,3.034843
4,R,103,118.661538,2.938718
