## 第3章 pandasでデータを処理しよう

### 3-6: データ処理

In [1]:
# Listing 3.6.1: extraction with a boolean mask
import os
from urllib.parse import urljoin

import pandas as pd

# Directory URL of the sample data set. The trailing slash matters: urljoin
# resolves the file name relative to it.
base_url = (
    "https://raw.githubusercontent.com/practical-jupyter/sample-data/master/anime/"
)
# Build the URL with urljoin instead of os.path.join: os.path.join applies the
# host operating system's filesystem path rules and is not meant for URLs.
anime_csv = urljoin(base_url, "anime.csv")
df = pd.read_csv(anime_csv)
# Keep only the rows whose "episodes" value is the literal string "Unknown".
df.loc[df["episodes"] == "Unknown"].head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
73,21,One Piece,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,Unknown,8.58,504862
248,235,Detective Conan,"Adventure, Comedy, Mystery, Police, Shounen",TV,Unknown,8.25,114702
607,1735,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,Unknown,7.94,533578
993,33157,Tanaka-kun wa Itsumo Kedaruge Specials,"Comedy, School, Slice of Life",Special,Unknown,7.72,5400
1226,21639,Yu☆Gi☆Oh! Arc-V,"Action, Fantasy, Game, Shounen",TV,Unknown,7.61,17571


In [2]:
# Listing 3.6.2: extraction with the where method
# where() keeps the frame's shape: rows that fail the condition are filled with NaN.
(
    df
    .where(df["rating"].lt(9.2))
    .head()
)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,9253.0,Steins;Gate,"Sci-Fi, Thriller",TV,24.0,9.17,673572.0
4,9969.0,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.16,151266.0


In [3]:
# Listing 3.6.3: assigning a value
import numpy as np

# Overwrite the "episodes" cell of the row labelled 74 with NaN, then read it
# back to confirm the assignment.
df.loc[74, "episodes"] = np.nan
df.loc[74, "episodes"]

nan

In [4]:
# Listing 3.6.5: assigning to multiple elements
# Every "Unknown" entry in the episodes column is replaced with NaN in one step.
df.loc[df["episodes"].eq("Unknown"), "episodes"] = np.nan

In [5]:
# Listing 3.6.6: extracting missing values
# Select the rows whose episodes value is missing and preview the first few.
df.loc[df["episodes"].isna()].head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
73,21,One Piece,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,,8.58,504862
74,801,Ghost in the Shell: Stand Alone Complex 2nd GIG,"Action, Mecha, Military, Mystery, Police, Sci-...",TV,,8.57,113993
248,235,Detective Conan,"Adventure, Comedy, Mystery, Police, Shounen",TV,,8.25,114702
607,1735,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,,7.94,533578
993,33157,Tanaka-kun wa Itsumo Kedaruge Specials,"Comedy, School, Slice of Life",Special,,7.72,5400


In [6]:
# Listing 3.6.7: excluding missing values
# dropna() returns a new DataFrame without the rows that contain missing values;
# the slice from label 70 onward shows the gap left by the dropped rows.
df.dropna().loc[70:].head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
70,578,Hotaru no Haka,"Drama, Historical",Movie,1,8.58,174878
71,16894,Kuroko no Basket 2nd Season,"Comedy, School, Shounen, Sports",TV,25,8.58,243325
72,5028,Major S5,"Comedy, Drama, Romance, Sports",TV,25,8.58,28653
75,31933,JoJo no Kimyou na Bouken: Diamond wa Kudakenai,"Action, Adventure, Comedy, Drama, Shounen, Sup...",TV,39,8.57,74074
76,5205,Kara no Kyoukai 7: Satsujin Kousatsu (Kou),"Action, Mystery, Romance, Supernatural, Thriller",Movie,1,8.57,95658


In [7]:
# Listing 3.6.8: confirming that the operation was non-destructive
# The earlier dropna() call did not modify df, so the rows with missing
# episodes values are still present here.
df.loc[70:].head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
70,578,Hotaru no Haka,"Drama, Historical",Movie,1.0,8.58,174878
71,16894,Kuroko no Basket 2nd Season,"Comedy, School, Shounen, Sports",TV,25.0,8.58,243325
72,5028,Major S5,"Comedy, Drama, Romance, Sports",TV,25.0,8.58,28653
73,21,One Piece,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,,8.58,504862
74,801,Ghost in the Shell: Stand Alone Complex 2nd GIG,"Action, Mecha, Military, Mystery, Police, Sci-...",TV,,8.57,113993


In [8]:
# Listing 3.6.9: destructive operation
# inplace=True makes dropna() modify df itself instead of returning a new
# DataFrame, so the rows with missing values are permanently removed from df.
df.dropna(inplace=True)
df.loc[70:].head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
70,578,Hotaru no Haka,"Drama, Historical",Movie,1,8.58,174878
71,16894,Kuroko no Basket 2nd Season,"Comedy, School, Shounen, Sports",TV,25,8.58,243325
72,5028,Major S5,"Comedy, Drama, Romance, Sports",TV,25,8.58,28653
75,31933,JoJo no Kimyou na Bouken: Diamond wa Kudakenai,"Action, Adventure, Comedy, Drama, Shounen, Sup...",TV,39,8.57,74074
76,5205,Kara no Kyoukai 7: Satsujin Kousatsu (Kou),"Action, Mystery, Romance, Supernatural, Thriller",Movie,1,8.57,95658


In [9]:
# Listing 3.6.10: checking the data type
# A single column is a Series, whose dtype attribute reports its element type.
df["anime_id"].dtype

dtype('int64')

In [10]:
# Listing 3.6.12: checking the data types of a DataFrame
# dtypes lists the dtype of every column at once.
df.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [11]:
# Listing 3.6.14: type conversion
# Limit how many rows pandas displays so the long Series below is truncated.
pd.set_option("display.max_rows", 10)
# astype returns a converted copy; the column stored in df is not changed.
df["episodes"].astype(np.int64)

0         1
1        64
2        51
3        24
4        51
         ..
10272     1
10273    23
10274     1
10275     1
10276    32
Name: episodes, dtype: int64

In [12]:
# Listing 3.6.16: type conversion specified with a dict
# The dict maps column name -> target dtype; columns that are not listed keep
# their current dtype in the returned DataFrame.
df.astype({"episodes": np.int64, "rating": np.float64})

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
10272,11095,Zouressha ga Yatte Kita,Adventure,Movie,1,6.06,78
10273,7808,Zukkoke Knight: Don De La Mancha,"Adventure, Comedy, Historical, Romance",TV,23,6.47,172
10274,28543,Zukkoke Sannin-gumi no Hi Asobi Boushi Daisakusen,"Drama, Kids",OVA,1,5.83,50
10275,18967,Zukkoke Sannin-gumi: Zukkoke Jikuu Bouken,"Comedy, Historical, Sci-Fi",OVA,1,6.13,76


In [13]:
# Listing 3.6.17: the DataFrame after type conversion
# episodes is still object here: the preceding astype calls returned converted
# copies and were never assigned back to df.
df.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [14]:
# Listing 3.6.19: assigning the converted values
# astype returns a new Series, so the result has to be assigned back to the
# column for the new dtype to show up in df.dtypes.
df["episodes"] = df["episodes"].astype(np.int64)
df.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes      int64
rating      float64
members       int64
dtype: object

In [15]:
# Listing 3.6.21: sorting by the rating column in descending order
# ascending=False puts the highest ratings first; head() previews the top rows.
(
    df
    .sort_values(by="rating", ascending=False)
    .head()
)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
9846,33662,Taka no Tsume 8: Yoshida-kun no X-Files,"Comedy, Parody",Movie,1,10.0,13
8985,23005,Mogura no Motoro,Slice of Life,Movie,1,9.5,62
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
8474,33607,Kahei no Umi,Historical,Movie,1,9.33,44
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665


In [16]:
# Listing 3.6.22: applying a function with the map method
import html

# Print the first names before and after decoding HTML character references
# (for example "&#039;" becomes an apostrophe).
print(
    df["name"].head(),
    df["name"].map(html.unescape).head(),
    sep="\n",
)

0                      Kimi no Na wa.
1    Fullmetal Alchemist: Brotherhood
2                            Gintama°
3                         Steins;Gate
4                       Gintama&#039;
Name: name, dtype: object
0                      Kimi no Na wa.
1    Fullmetal Alchemist: Brotherhood
2                            Gintama°
3                         Steins;Gate
4                            Gintama'
Name: name, dtype: object


In [17]:
# Listing 3.6.24: applying a function with the apply method
# With axis=0 (the default) len receives each column, yielding its row count.
df.apply(len, axis=0)

anime_id    10075
name        10075
genre       10075
type        10075
episodes    10075
rating      10075
members     10075
dtype: int64

In [18]:
# Listing 3.6.26: applying a function to rows with the apply method
# axis="columns" is the named spelling of axis=1: len receives each row,
# yielding the number of fields in it.
df.apply(len, axis="columns").head()

0    7
1    7
2    7
3    7
4    7
dtype: int64

In [19]:
# Listing 3.6.28: the type passed to the function given to apply
# Applying type shows that each column reaches the function as a pandas Series.
df.apply(type)

anime_id    <class 'pandas.core.series.Series'>
name        <class 'pandas.core.series.Series'>
genre       <class 'pandas.core.series.Series'>
type        <class 'pandas.core.series.Series'>
episodes    <class 'pandas.core.series.Series'>
rating      <class 'pandas.core.series.Series'>
members     <class 'pandas.core.series.Series'>
dtype: object

In [20]:
# Listing 3.6.30: accessing elements inside the function given to apply
# With axis=1 each row arrives as a Series, so its fields can be looked up by
# column name; here the lengths of the name and genre strings are summed.
df.apply(
    lambda row: len(row["name"]) + len(row["genre"]),
    axis=1,
).head()

0    50
1    91
2    68
3    27
4    73
dtype: int64

In [21]:
# Listing 3.6.32: applying a function with the applymap method
# applymap calls len on every individual element of the selected columns.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map — confirm the pandas version this notebook targets before switching.
df[["name", "genre"]].applymap(len).head()

Unnamed: 0,name,genre
0,14,36
1,32,59
2,8,60
3,11,16
4,13,60
