In [2]:
# Print the version of the `python` executable that the shell resolves.
# NOTE(review): `!` runs a shell command, so this may not be the interpreter
# backing the kernel — confirm if the exact kernel version matters.
!python -V

Python 3.10.0


In [3]:
import pandas as pd

# Raw score table as a nested list: three rows of three values each.
data = [
    [60, 65, 66],
    [80, 85, 88],
    [100, 100, 100],
]

In [4]:
# Wrap the nested list in a DataFrame; no labels are given, so rows and
# columns receive default integer labels.
df = pd.DataFrame(data=data)

# Preview the first rows (5 is head()'s default row count).
df.head(n=5)

Unnamed: 0,0,1,2
0,60,65,66
1,80,85,88
2,100,100,100


In [6]:
# Replace the default integer labels with meaningful ones.
# Rows: one label per student.
df.index = ["Kid A", "Kid B", "Kid C"]
# Columns: subject names (Japanese, Math, English).
df.columns = ["国語", "数学", "英語"]

In [7]:
# Display the frame with its new row and column labels.
df

Unnamed: 0,国語,数学,英語
Kid A,60,65,66
Kid B,80,85,88
Kid C,100,100,100


In [8]:
# Read the score sheet from the CSV file in the working directory.
# No index column is requested, so rows get a default integer index.
df = pd.read_csv(
    "./test.csv",
)
df.head(n=5)

Unnamed: 0,名前,国語,数学,英語,学生番号
0,A太,83,89,76,A001
1,B介,66,93,75,B001
2,C子,100,84,96,B002
3,D郎,60,73,40,A002
4,E美,92,62,84,C001


In [13]:
# Reload the same CSV, this time using its first column as the row index.
df = pd.read_csv(
    "./test.csv",
    index_col=0,
)

In [14]:
# Display the reloaded frame; the 名前 (name) column now acts as the index.
df

Unnamed: 0_level_0,国語,数学,英語,学生番号
名前,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A太,83,89,76,A001
B介,66,93,75,B001
C子,100,84,96,B002
D郎,60,73,40,A002
E美,92,62,84,C001
F菜,96,92,94,C002


In [17]:
# Column labels as a plain Python list.
list(df.columns)

['国語', '数学', '英語', '学生番号']

In [18]:
# Row labels as a plain Python list.
list(df.index)

['A太', 'B介', 'C子', 'D郎', 'E美', 'F菜']

In [19]:
# Inspect the column labels as a pandas Index object (not converted to a list).
df.columns

Index(['国語', '数学', '英語', '学生番号'], dtype='object')

In [20]:
# Show the dtype pandas inferred for each column.
df.dtypes

国語       int64
数学       int64
英語       int64
学生番号    object
dtype: object

In [22]:
# Select one column by label; the result is a Series indexed by 名前.
df.loc[:, "国語"]

名前
A太     83
B介     66
C子    100
D郎     60
E美     92
F菜     96
Name: 国語, dtype: int64

In [24]:
# Select several columns by label; a list of labels yields a DataFrame.
df.loc[:, ["国語", "数学"]]

Unnamed: 0_level_0,国語,数学
名前,Unnamed: 1_level_1,Unnamed: 2_level_1
A太,83,89
B介,66,93
C子,100,84
D郎,60,73
E美,92,62
F菜,96,92


In [25]:
# Select the first row by position; the result is a Series keyed by column.
df.iloc[0, :]

国語        83
数学        89
英語        76
学生番号    A001
Name: A太, dtype: object

In [27]:
# Select the first two rows by position; a list of positions yields a DataFrame.
df.iloc[[0, 1], :]

Unnamed: 0_level_0,国語,数学,英語,学生番号
名前,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A太,83,89,76,A001
B介,66,93,75,B001


In [28]:
# Goal of the next few cells: pull only the 国語 column out into a new object.
# Start by loading the table, using the first CSV column as the row index.
dfA = pd.read_csv(
    "test.csv",
    index_col=0,
)

In [29]:
# Display the freshly loaded source frame.
dfA

Unnamed: 0_level_0,国語,数学,英語,学生番号
名前,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A太,83,89,76,A001
B介,66,93,75,B001
C子,100,84,96,B002
D郎,60,73,40,A002
E美,92,62,84,C001
F菜,96,92,94,C002


In [30]:
# Extract the 国語 column by label. Selecting a single label yields a Series
# rather than a DataFrame.
dfB = dfA.loc[:, "国語"]

In [32]:
# Display the extracted 国語 column.
dfB

名前
A太     83
B介     66
C子    100
D郎     60
E美     92
F菜     96
Name: 国語, dtype: int64

In [33]:
# Extract a row: take the first row by position.
dfB = dfA.iloc[0, :]

In [34]:
# Display the extracted row.
dfB

国語        83
数学        89
英語        76
学生番号    A001
Name: A太, dtype: object

In [35]:
# Remove an unwanted row: look up the label at position 3 and drop that row.
# drop() returns a new frame, so dfA keeps all of its rows.
dfB = dfA.drop(index=dfA.index[3])
dfB

Unnamed: 0_level_0,国語,数学,英語,学生番号
名前,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A太,83,89,76,A001
B介,66,93,75,B001
C子,100,84,96,B002
E美,92,62,84,C001
F菜,96,92,94,C002


In [36]:
# Filter by condition: build a boolean mask that is True where 国語 exceeds 80.
dfA["国語"].gt(80)

名前
A太     True
B介    False
C子     True
D郎    False
E美     True
F菜     True
Name: 国語, dtype: bool

In [38]:
# Keep only the rows whose 国語 score is greater than 80.
dfB = dfA.loc[dfA["国語"].gt(80)]

dfB

Unnamed: 0_level_0,国語,数学,英語,学生番号
名前,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A太,83,89,76,A001
C子,100,84,96,B002
E美,92,62,84,C001
F菜,96,92,94,C002


# 欠損値の処理

In [39]:
# Small score table with gaps: each None becomes a missing value (NaN) in the
# resulting DataFrame.
idx = ["Kid A", "Kid B", "Kid C", "Kid D"]
data = {
    "国語": [90, 50, None, 40],
    "数学": [80, None, None, 50],
}

dfA = pd.DataFrame(data=data, index=idx)

dfA

Unnamed: 0,国語,数学
Kid A,90.0,80.0
Kid B,50.0,
Kid C,,
Kid D,40.0,50.0


In [40]:
# Count the missing values in each column.
dfA.isna().sum()

国語    1
数学    2
dtype: int64

In [41]:
# Removal: drop every row that contains at least one missing value.
# The result is only displayed; dfA itself is left unchanged.
dfA.dropna(axis=0, how="any")

Unnamed: 0,国語,数学
Kid A,90.0,80.0
Kid D,40.0,50.0


In [42]:
# Drop only the rows where 国語 is missing; gaps in other columns are kept.
dfA.loc[dfA["国語"].notna()]

Unnamed: 0,国語,数学
Kid A,90.0,80.0
Kid B,50.0,
Kid D,40.0,50.0


In [43]:
# Fill each missing value with the mean of its own column.
dfB = dfA.fillna(value=dfA.mean(axis=0))
dfB

Unnamed: 0,国語,数学
Kid A,90.0,80.0
Kid B,50.0,65.0
Kid C,60.0,65.0
Kid D,40.0,50.0


In [44]:
# Fill each missing value with the value from the previous row (forward fill).
# DataFrame.ffill() replaces fillna(method="ffill"): the `method` argument of
# fillna is deprecated in recent pandas releases and emits a FutureWarning.
dfB = dfA.ffill()
dfB

Unnamed: 0,国語,数学
Kid A,90.0,80.0
Kid B,50.0,80.0
Kid C,50.0,80.0
Kid D,40.0,50.0


# 重複したデータを除去する

In [45]:
# Sample rows for de-duplication: the rows at positions 1, 2 and 4 are identical.
data = [
    [10, 30, 40],
    [20, 30, 40],  # duplicated value set
    [20, 30, 40],  # duplicated value set
    [30, 30, 50],
    [20, 30, 40],  # duplicated value set
]
dfA = pd.DataFrame(data=data)

dfA

Unnamed: 0,0,1,2
0,10,30,40
1,20,30,40
2,20,30,40
3,30,30,50
4,20,30,40


In [47]:
# Tally the rows: True = repeats an earlier row, False = first occurrence.
dfA.duplicated(keep="first").value_counts()

False    3
True     2
dtype: int64

In [48]:
# Keep the first occurrence of each distinct row and discard later repeats.
# The result is only displayed; dfA itself is left unchanged.
dfA.loc[~dfA.duplicated()]

Unnamed: 0,0,1,2
0,10,30,40
1,20,30,40
3,30,30,50


# 文字列を数値に直す

In [49]:
# Numbers stored as text; "1,500" additionally carries a thousands separator.
data = dict(
    A=["100", "300"],
    B=["500", "1,500"],
)

In [50]:
# Build a frame from the string-valued dict; the keys become column labels.
df = pd.DataFrame(data=data)

In [51]:
# Display the frame built from the string data.
df

Unnamed: 0,A,B
0,100,500
1,300,1500


In [53]:
# Check the column dtypes before any conversion (string columns show as object).
df.dtypes

A    object
B    object
dtype: object

In [55]:
# Convert column A from strings to integers. Requesting "int64" explicitly
# keeps the resulting dtype identical on every platform, whereas plain `int`
# follows the platform's default integer width.
df["A"] = df["A"].astype("int64")

In [56]:
# Display the frame after converting column A.
df

Unnamed: 0,A,B
0,100,500
1,300,1500


In [57]:
# Re-check the dtypes after converting column A.
df.dtypes

A     int64
B    object
dtype: object

In [58]:
# Convert column B to integers. The thousands separator must be stripped
# first, because "1,500" cannot be parsed as an integer directly.
# - astype(str) keeps the cell re-runnable: once B holds integers the .str
#   accessor is unavailable, so a second run would otherwise raise.
# - regex=False treats "," as a literal, independent of the installed pandas
#   version's default for `regex`.
# - "int64" fixes the result dtype regardless of platform.
df["B"] = df["B"].astype(str).str.replace(",", "", regex=False).astype("int64")

In [59]:
# Display the frame after converting column B.
df

Unnamed: 0,A,B
0,100,500
1,300,1500


In [60]:
# Final dtype check after both columns have been converted.
df.dtypes

A    int64
B    int64
dtype: object