In [1]:
import os

import pandas as pd
import pymysql

In [2]:
# Connect to the local MySQL server and load the three Sakila tables used below.
# Sakila is MySQL's sample database; install it first if it is missing.
# This cell can be skipped if you only read the SQL/pandas pairs without running them.
#
# Connection settings can be overridden through MYSQL_* environment variables so
# real credentials never have to be written into the notebook; the fallbacks are
# the throw-away defaults of a local tutorial install.
conn = pymysql.connect(
    host=os.environ.get("MYSQL_HOST", "127.0.0.1"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD", "root"),
    database=os.environ.get("MYSQL_DATABASE", "sakila"),
    port=int(os.environ.get("MYSQL_PORT", "3306")),
    charset="utf8",
)
# NOTE(review): conn is left open because later cells may still use it;
# call conn.close() once nothing else needs the connection.
film_df = pd.read_sql("select * from film;", conn)
film_actor_df = pd.read_sql("select * from film_actor;", conn)
actor_df = pd.read_sql("select * from actor;", conn)


# desc
*注：一般每个code cell里前面的是sql语句（用#注释起来了），随后是对应的pandas语句*

In [5]:
# desc film;
# Prints each column's name, non-null count and dtype for the film table.
film_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
film_id                 1000 non-null int64
title                   1000 non-null object
description             1000 non-null object
release_year            1000 non-null int64
language_id             1000 non-null int64
original_language_id    0 non-null object
rental_duration         1000 non-null int64
rental_rate             1000 non-null float64
length                  1000 non-null int64
replacement_cost        1000 non-null float64
rating                  1000 non-null object
special_features        1000 non-null object
last_update             1000 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(5), object(5)
memory usage: 101.6+ KB


In [6]:
# Column names of the film table as an array.
film_df.columns.values

array(['film_id', 'title', 'description', 'release_year', 'language_id',
       'original_language_id', 'rental_duration', 'rental_rate', 'length',
       'replacement_cost', 'rating', 'special_features', 'last_update'], dtype=object)

In [7]:
# (number of rows, number of columns) of the film table.
film_df.shape

(1000, 13)

# select 

In [None]:
# select *
# from film;
# Evaluating the frame as the cell's last expression displays it.
film_df

In [9]:
# select *
# from film
# limit 3;
# First three rows by position; append .T (film_df.iloc[:3].T) to transpose the result.
film_df.iloc[:3]

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,1,,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42


In [18]:
# select film_id, title, description
# from film
# limit 10, 5;
# In SQL "limit 10, 5" means offset 10, row count 5, i.e. positional rows 10..14.
offset, row_count = 10, 5
film_df.loc[:, ["film_id", "title", "description"]].iloc[offset:offset + row_count]

Unnamed: 0,film_id,title,description
10,11,ALAMO VIDEOTAPE,A Boring Epistle of a Butler And a Cat who mus...
11,12,ALASKA PHANTOM,A Fanciful Saga of a Hunter And a Pastry Chef ...
12,13,ALI FOREVER,A Action-Packed Drama of a Dentist And a Croco...
13,14,ALICE FANTASIA,A Emotional Drama of a A Shark And a Database ...
14,15,ALIEN CENTER,A Brilliant Drama of a Cat And a Mad Scientist...


In [29]:
# select distinct rating
# from film;

# Distinct ratings, in order of first appearance.
pd.unique(film_df["rating"])

array(['PG', 'G', 'NC-17', 'PG-13', 'R'], dtype=object)

# count

In [19]:
# select count(*)
# from film;

# The first entry of shape is the row count.
film_df.shape[0]

1000

In [30]:
# select count(distinct rating)
# from film;

# count(distinct ...) ignores NULLs in SQL. Series.nunique() skips missing values
# by default, whereas len(Series.unique()) would count NaN as one more value.
film_df["rating"].nunique()

5

# where

In [20]:
# select film_id, title, description
# from film
# where film_id = 10;

# Row filter and column selection in a single .loc lookup.
film_df.loc[film_df["film_id"] == 10, ["film_id", "title", "description"]]

Unnamed: 0,film_id,title,description
9,10,ALADDIN CALENDAR,A Action-Packed Tale of a Man And a Lumberjack...


In [None]:
# select film_id, title, description
# from film
# where rental_rate > 2
#       and length < 120;

# Build each condition as a boolean mask, then combine them with & (SQL "and").
rate_above_2 = film_df["rental_rate"] > 2
shorter_than_120 = film_df["length"] < 120
film_df.loc[rate_above_2 & shorter_than_120, ["film_id", "title", "description"]]

# in

In [4]:
# SELECT  *
# FROM sakila.film
# where rental_duration in (3,6,7);

# isin() is the pandas counterpart of SQL "in (...)".
film_df.loc[film_df["rental_duration"].isin([3, 6, 7])]

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,1,,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,2006,1,,6,2.99,130,22.99,G,Deleted Scenes,2006-02-15 05:03:42
5,6,AGENT TRUMAN,A Intrepid Panorama of a Robot And a Boy who m...,2006,1,,3,2.99,169,17.99,PG,Deleted Scenes,2006-02-15 05:03:42
6,7,AIRPLANE SIERRA,A Touching Saga of a Hunter And a Butler who m...,2006,1,,6,4.99,62,28.99,PG-13,"Trailers,Deleted Scenes",2006-02-15 05:03:42
7,8,AIRPORT POLLOCK,A Epic Tale of a Moose And a Girl who must Con...,2006,1,,6,4.99,54,15.99,R,Trailers,2006-02-15 05:03:42
8,9,ALABAMA DEVIL,A Thoughtful Panorama of a Database Administra...,2006,1,,3,2.99,114,21.99,PG-13,"Trailers,Deleted Scenes",2006-02-15 05:03:42
9,10,ALADDIN CALENDAR,A Action-Packed Tale of a Man And a Lumberjack...,2006,1,,6,4.99,63,24.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42
10,11,ALAMO VIDEOTAPE,A Boring Epistle of a Butler And a Cat who mus...,2006,1,,6,0.99,126,16.99,G,"Commentaries,Behind the Scenes",2006-02-15 05:03:42


In [5]:
# SELECT  *
# FROM sakila.film
# where rental_duration not in (3,6,7);

# Negate the isin() mask with ~ to express "not in (...)".
in_listed_durations = film_df["rental_duration"].isin([3, 6, 7])
film_df.loc[~in_listed_durations]

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,2006,1,,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes",2006-02-15 05:03:42
12,13,ALI FOREVER,A Action-Packed Drama of a Dentist And a Croco...,2006,1,,4,4.99,150,21.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
14,15,ALIEN CENTER,A Brilliant Drama of a Cat And a Mad Scientist...,2006,1,,5,2.99,46,10.99,NC-17,"Trailers,Commentaries,Behind the Scenes",2006-02-15 05:03:42
19,20,AMELIE HELLFIGHTERS,A Boring Drama of a Woman And a Squirrel who m...,2006,1,,4,4.99,79,23.99,R,"Commentaries,Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
27,28,ANTHEM LUKE,A Touching Panorama of a Waitress And a Woman ...,2006,1,,5,4.99,91,16.99,PG-13,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
28,29,ANTITRUST TOMATOES,A Fateful Yarn of a Womanizer And a Feminist w...,2006,1,,5,2.99,168,11.99,NC-17,"Trailers,Commentaries,Deleted Scenes",2006-02-15 05:03:42
29,30,ANYTHING SAVANNAH,A Epic Story of a Pastry Chef And a Woman who ...,2006,1,,4,2.99,82,27.99,R,"Trailers,Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
30,31,APACHE DIVINE,A Awe-Inspiring Reflection of a Pastry Chef An...,2006,1,,5,4.99,92,16.99,NC-17,"Commentaries,Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
32,33,APOLLO TEEN,A Action-Packed Reflection of a Crocodile And ...,2006,1,,5,2.99,153,15.99,PG-13,"Trailers,Commentaries,Deleted Scenes,Behind th...",2006-02-15 05:03:42
34,35,ARACHNOPHOBIA ROLLERCOASTER,A Action-Packed Reflection of a Pastry Chef An...,2006,1,,4,2.99,147,24.99,PG-13,"Trailers,Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42


# NULL

In [6]:
# SELECT *
# FROM sakila.film
# where original_language_id is null;

# pd.isnull() marks the missing entries; keep only those rows.
film_df.loc[pd.isnull(film_df["original_language_id"])]

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,1,,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,2006,1,,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes",2006-02-15 05:03:42
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,2006,1,,6,2.99,130,22.99,G,Deleted Scenes,2006-02-15 05:03:42
5,6,AGENT TRUMAN,A Intrepid Panorama of a Robot And a Boy who m...,2006,1,,3,2.99,169,17.99,PG,Deleted Scenes,2006-02-15 05:03:42
6,7,AIRPLANE SIERRA,A Touching Saga of a Hunter And a Butler who m...,2006,1,,6,4.99,62,28.99,PG-13,"Trailers,Deleted Scenes",2006-02-15 05:03:42
7,8,AIRPORT POLLOCK,A Epic Tale of a Moose And a Girl who must Con...,2006,1,,6,4.99,54,15.99,R,Trailers,2006-02-15 05:03:42
8,9,ALABAMA DEVIL,A Thoughtful Panorama of a Database Administra...,2006,1,,3,2.99,114,21.99,PG-13,"Trailers,Deleted Scenes",2006-02-15 05:03:42
9,10,ALADDIN CALENDAR,A Action-Packed Tale of a Man And a Lumberjack...,2006,1,,6,4.99,63,24.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42


# fill NULL
补全NULL值的操作pandas比sql方便

In [10]:
# SELECT (case when original_language_id is null then 999 else original_language_id end ) as original_language_id
#  FROM sakila.film
# where original_language_id is null;

# Assign the filled column back through the frame itself. Calling
# fillna(..., inplace=True) on a column reached by attribute access is chained
# assignment: recent pandas warns about it, and under Copy-on-Write it no longer
# writes through to film_df at all.
# Note: unlike the SQL select, this still modifies film_df, so cells that rely
# on the NULLs (e.g. the "is null" filter) must run before this one.
film_df["original_language_id"] = film_df["original_language_id"].fillna(999)
film_df

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,999,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,999,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,1,999,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,2006,1,999,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes",2006-02-15 05:03:42
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,2006,1,999,6,2.99,130,22.99,G,Deleted Scenes,2006-02-15 05:03:42
5,6,AGENT TRUMAN,A Intrepid Panorama of a Robot And a Boy who m...,2006,1,999,3,2.99,169,17.99,PG,Deleted Scenes,2006-02-15 05:03:42
6,7,AIRPLANE SIERRA,A Touching Saga of a Hunter And a Butler who m...,2006,1,999,6,4.99,62,28.99,PG-13,"Trailers,Deleted Scenes",2006-02-15 05:03:42
7,8,AIRPORT POLLOCK,A Epic Tale of a Moose And a Girl who must Con...,2006,1,999,6,4.99,54,15.99,R,Trailers,2006-02-15 05:03:42
8,9,ALABAMA DEVIL,A Thoughtful Panorama of a Database Administra...,2006,1,999,3,2.99,114,21.99,PG-13,"Trailers,Deleted Scenes",2006-02-15 05:03:42
9,10,ALADDIN CALENDAR,A Action-Packed Tale of a Man And a Lumberjack...,2006,1,999,6,4.99,63,24.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42


# order by

In [None]:
# Sort the films by rental rate, lowest first.
# select rental_rate, film_id, title, description
# from film
# order by rental_rate

# sort_values sorts ascending by default and returns a new frame (not in place).
film_df[["rental_rate", "film_id", "title", "description"]].sort_values("rental_rate")

In [None]:
# Sort the films by rental rate, highest first.
# select rental_rate, film_id, title, description
# from film
# order by rental_rate desc

# ascending expects a bool; False gives descending order. Returns a new frame (not in place).
film_df.sort_values(["rental_rate"], ascending=False)[["rental_rate", "film_id", "title", "description"]]

# group by

In [43]:
# select rating, count(*)
# from film
# group by rating

# value_counts() returns one count per distinct rating, largest count first.
film_df["rating"].value_counts()

PG-13    223
NC-17    210
R        195
PG       194
G        178
Name: rating, dtype: int64

In [37]:
# select release_year, rating, count(*)
# from film
# group by release_year, rating

# size() yields a Series indexed by the group keys; reset_index would turn it into a DataFrame.
group_keys = ["release_year", "rating"]
film_df.groupby(group_keys).size()

release_year  rating
2006          G         178
              NC-17     210
              PG        194
              PG-13     223
              R         195
dtype: int64

In [39]:
# select release_year, rating, count(*) as counts
# from film
# group by release_year, rating

# reset_index(name=...) converts the Series to a DataFrame and names the count column.
rows_per_group = film_df.groupby(["release_year", "rating"]).size()
rows_per_group.reset_index(name="counts")

Unnamed: 0,release_year,rating,counts
0,2006,G,178
1,2006,NC-17,210
2,2006,PG,194
3,2006,PG-13,223
4,2006,R,195


In [45]:
# select  rating , count(distinct rental_duration)  as rental_duration_type_count
# from  film
# group by rating;

# Number of distinct rental_duration values within each rating group.
distinct_durations = film_df.groupby("rating")["rental_duration"].nunique()
distinct_durations.reset_index(name="rental_duration_type_count")

Unnamed: 0,rating,rental_duration_type_count
0,G,5
1,NC-17,5
2,PG,5
3,PG-13,5
4,R,5


In [80]:
# select rating, count(distinct length) as length_distinct_count, avg(length) as length_mean, avg(rental_rate) as rental_rate_mean
# from film
# group by rating;

## [DEPRECATED] Dictionary groupby format
使用一种叫Dictionary groupby format的方式，然后droplevel(0)，最后reset_index

In [79]:
# Nested-dict ("dictionary groupby") form: outer keys select the source column,
# inner keys name each aggregate, giving two-level result columns
# (source column, aggregate name).
# NOTE(review): this is the deprecated renaming form this section is about;
# newer pandas releases are expected to reject it (SpecificationError) — confirm
# against the installed pandas version before re-running.
new_df = film_df.groupby("rating").agg({"length": {"length_distinct_count": lambda x: x.nunique(), 
                                                   "length_mean": "mean"},
                                        "rental_rate": {"rental_rate_mean": "mean"}})
new_df

Unnamed: 0_level_0,length,length,rental_rate
Unnamed: 0_level_1,length_distinct_count,length_mean,rental_rate_mean
rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
G,109,111.050562,2.888876
NC-17,110,113.228571,2.970952
PG,106,112.005155,3.051856
PG-13,116,120.443946,3.034843
R,103,118.661538,2.938718


In [77]:
# Keep only the inner column level (the aggregate names), discarding the outer
# level that holds the source column names.
new_df.columns = new_df.columns.get_level_values(1)
new_df

Unnamed: 0_level_0,length_distinct_count,length_mean,rental_rate_mean
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G,109,111.050562,2.888876
NC-17,110,113.228571,2.970952
PG,106,112.005155,3.051856
PG-13,116,120.443946,3.034843
R,103,118.661538,2.938718


In [78]:
# Move the rating group key out of the index into an ordinary column.
new_df.reset_index()

Unnamed: 0,rating,length_distinct_count,length_mean,rental_rate_mean
0,G,109,111.050562,2.888876
1,NC-17,110,113.228571,2.970952
2,PG,106,112.005155,3.051856
3,PG-13,116,120.443946,3.034843
4,R,103,118.661538,2.938718


## Use 'named' functions instead of lambdas:
但是上面用嵌套字典来重命名计算字段的方式已经被弃用（[DEPRECATED]，pandas >= 0.20.1），推荐的方式是用具名函数代替匿名函数

In [84]:
def length_distinct_count(group):
    """Return the number of distinct values in ``group`` (via ``nunique()``).

    When passed to ``agg``, the function's name is used as the result column label.
    """
    n_distinct = group.nunique()
    return n_distinct


def length_mean(group):
    """Return the mean of ``group``; the function name labels the result column."""
    average = group.mean()
    return average


def rental_rate_mean(group):
    """Return the mean of ``group``; the function name labels the result column."""
    average = group.mean()
    return average
    
# Map each source column to its aggregate function(s); every function's name
# becomes the inner column label of the result.
agg_spec = {
    "length": [length_distinct_count, length_mean],
    "rental_rate": rental_rate_mean,
}
new_df = film_df.groupby("rating").agg(agg_spec)
new_df

# The remaining steps (droplevel(0), then reset_index) are the same as above.

Unnamed: 0_level_0,length,length,rental_rate
Unnamed: 0_level_1,length_distinct_count,length_mean,rental_rate_mean
rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
G,109,111.050562,2.888876
NC-17,110,113.228571,2.970952
PG,106,112.005155,3.051856
PG-13,116,120.443946,3.034843
R,103,118.661538,2.938718


# group_concat
*Oracle 里对应的函数是 wm_concat*

In [34]:
# SELECT actor_id, group_concat(film_id order by film_id separator ',')  as film_ids
# FROM sakila.film_actor
# group by actor_id;

# Helper column (kept for compatibility): film_id rendered as text so it can be joined.
film_actor_df['film_id_id_str'] = film_actor_df['film_id'].map(str)
# Sort by the numeric film_id first so each actor's ids are concatenated in the
# order requested by "order by film_id"; groupby keeps the row order within a group.
(
    film_actor_df
    .sort_values("film_id")
    .groupby("actor_id")["film_id_id_str"]
    .agg(",".join)
    .reset_index(name="film_ids")
)

Unnamed: 0,actor_id,film_ids
0,1,"1,23,25,106,140,166,277,361,438,499,506,509,60..."
1,2,"3,31,47,105,132,145,226,249,314,321,357,369,39..."
2,3,"17,40,42,87,111,185,289,329,336,341,393,441,45..."
3,4,"23,25,56,62,79,87,355,379,398,463,490,616,635,..."
4,5,"19,54,85,146,171,172,202,203,286,288,316,340,3..."
5,6,"29,53,60,70,112,164,165,193,256,451,503,509,51..."
6,7,"25,27,35,67,96,170,173,217,218,225,292,351,414..."
7,8,"47,115,158,179,195,205,255,263,321,396,458,523..."
8,9,"30,74,147,148,191,200,204,434,510,514,552,650,..."
9,10,"1,9,191,236,251,366,477,480,522,530,587,694,70..."
