# pandasによるテキストデータの処理

## テキストデータで扱うデータ型

In [1]:
import pandas as pd

# Build a Series from plain Python strings without requesting a dtype, then
# print the dtype pandas chose (`object` in the recorded output below).
object_ser = pd.Series(["spam", "ham"])
print(object_ser.dtype)

object


In [2]:
# Request the dedicated string dtype explicitly through its "string" alias.
string_ser = pd.Series(["spam", "ham"], dtype="string")
# Alternative spelling: pass a StringDtype instance instead of the alias.
# This rebinds string_ser, so only this second Series is printed below.
string_ser = pd.Series(
    ["spam", "ham"],
    dtype=pd.StringDtype(),
)
print(string_ser.dtype)

string


In [3]:
# Convert the existing Series to the string dtype using the "string" alias.
# The result of this first call is discarded; it is shown only as one of two
# alternative spellings.
object_ser.astype("string")
# Same conversion with a StringDtype instance. As the last expression of the
# cell, this is the value the notebook displays (dtype: string).
object_ser.astype(pd.StringDtype())

0    spam
1     ham
dtype: string

## .strアクセサ

In [4]:
# Shared example for the .str accessor cells that follow: three words and one
# missing entry (None), stored with the string dtype. The dtype is given as a
# StringDtype instance, the explicit form of the "string" alias.
s = pd.Series(data=["spam", "ham", None, "egg"], dtype=pd.StringDtype())

In [5]:
# Slicing on the .str accessor is applied to every element: keep each string
# from position 1 onward. The missing entry stays <NA> in the result.
s.str[1:]

0     pam
1      am
2    <NA>
3      gg
dtype: string

In [6]:
# Upper-case every element; the missing entry is passed through as <NA>.
s.str.upper()

0    SPAM
1     HAM
2    <NA>
3     EGG
dtype: string

In [7]:
import re

# Replace each run of one or more "g" with a single "G", treating the pattern
# as a regular expression (regex=True). The result of this first call is
# discarded; it is shown as one of two alternative spellings.
s.str.replace(r"g+", "G", regex=True)
# Alternative spelling: pass a precompiled pattern object instead of a pattern
# string. This last expression is what the notebook displays ("egg" -> "eG").
s.str.replace(re.compile(r"g+"), "G", regex=True)

0    spam
1     ham
2    <NA>
3      eG
dtype: string

In [8]:
# regex=False makes "+" a literal character to search for rather than a regex
# quantifier, so every "+" is replaced by "-". No dtype is requested for this
# Series, and the recorded result reports dtype object.
pd.Series(["1+1", "1+2"]).str.replace("+", "-", regex=False)

0    1-1
1    1-2
dtype: object

In [9]:
# With no arguments, cat() joins all elements of the Series into one Python
# string. The missing entry is skipped, giving 'spamhamegg'.
s.str.cat()

'spamhamegg'

In [10]:
# Join all elements into one string with "," between them. na_rep supplies the
# placeholder text used for the missing entry instead of dropping it.
s.str.cat(sep=",", na_rep="***")

'spam,ham,***,egg'

In [11]:
# Element-wise concatenation of s with a second Series, using "," between the
# paired values. join="right" keeps the index labels of the passed Series, so
# the recorded result has only rows 0 and 1 even though s has four entries.
s.str.cat(
    pd.Series(["a", "b"]),
    sep=",",
    join="right",
)

0    spam,a
1     ham,b
dtype: string

In [12]:
# Example data for the split/extract cells: two comma-separated pairs and one
# hyphen-separated pair.
c = pd.Series(["a,A", "b,B", "c-C"])
# Split every element on ","; each result is a list. "c-C" contains no comma,
# so it becomes a one-element list.
c.str.split(",")

0    [a, A]
1    [b, B]
2     [c-C]
dtype: object

In [13]:
# expand=True spreads the split pieces across columns (0 and 1) instead of
# returning lists. Row 2 ("c-C") has no second piece, so its column 1 is empty
# in the recorded output.
c.str.split(",", expand=True)

Unnamed: 0,0,1
0,a,A
1,b,B
2,c-C,


In [14]:
# Extract the two capture groups on either side of a "," or "-" separator.
# Each unnamed group becomes a numbered column (0 and 1) in the result.
c.str.extract(r"(.*)[,-](.*)", expand=True)

Unnamed: 0,0,1
0,a,A
1,b,B
2,c,C


In [15]:
# Same extraction as with unnamed groups, but the groups are named with
# (?P<name>...): the group names "left" and "right" become the column names
# of the result.
c.str.extract(
    r"(?P<left>.*)[,-](?P<right>.*)",
    expand=True,
)

Unnamed: 0,left,right
0,a,A
1,b,B
2,c,C
