# Python String Formatting Methods

In [2]:
import pandas as pd
import numpy as np

# String Methods

## Count

In [8]:
s = pd.Series(['a', 'b', 'c', 'a', None, 'b', 'aa', 'c'])
s


0       a
1       b
2       c
3       a
4    None
5       b
6      aa
7       c
dtype: object

In [9]:
# Count of string
s.str.count('a')

0    1.0
1    0.0
2    0.0
3    1.0
4    NaN
5    0.0
6    2.0
7    0.0
dtype: float64

In [11]:
# dropping null values
s.dropna().str.count("a")   # Note float changes to int

0    1
1    0
2    0
3    1
5    0
6    2
7    0
dtype: int64

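#### This changes with the dedicated 'string' dtype introduced in pandas 1.0 (dtype="string"); the default dtype for text is still 'object'.
#### With the 'string' dtype both outputs are integers (nullable Int64), and the missing value is shown as <NA>.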

In [12]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
               dtype="object")

In [13]:
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [14]:
#Lower case
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [15]:
#Upper
s.str.upper()

0       A
1       B
2       C
3    AABA
4    BACA
5     NaN
6    CABA
7     DOG
8     CAT
dtype: object

In [16]:
#Length
s.str.len()

0    1.0
1    1.0
2    1.0
3    4.0
4    4.0
5    NaN
6    4.0
7    3.0
8    3.0
dtype: float64

## Working with Dataframes

In [26]:
df = pd.DataFrame(np.random.randn(3, 2),
                   columns=[' Column A ', ' Column B '], index=range(3))
df

Unnamed: 0,Column A,Column B
0,-1.058706,-0.182069
1,-0.311474,1.069932
2,-0.668987,1.423973


In [27]:
# Dataframe Column 
df.columns.str.strip()

Index(['Column A', 'Column B'], dtype='object')

In [28]:
df.columns.str.lower()

Index([' column a ', ' column b '], dtype='object')

In [29]:
#Renaming columns
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
df

Unnamed: 0,column_a,column_b
0,-1.058706,-0.182069
1,-0.311474,1.069932
2,-0.668987,1.423973


### Splitting and replacing strings

In [31]:
s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
s2

0    a_b_c
1    c_d_e
2      NaN
3    f_g_h
dtype: object

In [33]:
#Splitting on '_'
s2.str.split('_')

0    [a, b, c]
1    [c, d, e]
2          NaN
3    [f, g, h]
dtype: object

#### Elements in the split lists can be accessed using get or [] notation:

### Get method

In [36]:
# selecting elements at index 1
s2.str.split('_').str.get(1)

0      b
1      d
2    NaN
3      g
dtype: object

In [38]:
# selecting elements at index 2
s2.str.split('_').str[2]

0      c
1      e
2    NaN
3      h
dtype: object

### Expand 

#### It is easy to expand this to return a DataFrame using expand.

In [39]:
s2.str.split('_', expand = True)

Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


#### It is also possible to limit the number of splits:

In [40]:
s2.str.split('_', expand = True, n = 1)

Unnamed: 0,0,1
0,a,b_c
1,c,d_e
2,,
3,f,g_h


### R split

#### rsplit is similar to split except it works in the reverse direction, i.e., from the end of the string to the beginning of the string:

In [41]:
s2.str.rsplit('_', expand=True, n=1)

Unnamed: 0,0,1
0,a_b,c
1,c_d,e
2,,
3,f_g,h


### Replace

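#### replace treats the pattern as a regular expression when regex=True (this was the default before pandas 2.0; since 2.0 the default is regex=False, i.e. a literal match):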

In [43]:
s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca','', np.nan, 'CABA', 'dog', 'cat'])
s3               

0       A
1       B
2       C
3    Aaba
4    Baca
5        
6     NaN
7    CABA
8     dog
9     cat
dtype: object

In [45]:
# Replace every 'a' or 'dog' (case-insensitively) with 'XX-XX '.
# regex=True is passed explicitly: since pandas 2.0 the default is
# regex=False, which would treat 'a|dog' as a literal string and
# leave the Series unchanged.
s3.str.replace('a|dog', 'XX-XX ', case=False, regex=True)

0                 XX-XX 
1                      B
2                      C
3    XX-XX XX-XX bXX-XX 
4         BXX-XX cXX-XX 
5                       
6                    NaN
7         CXX-XX BXX-XX 
8                 XX-XX 
9               cXX-XX t
dtype: object

#### If you do want literal replacement of a string (equivalent to str.replace()), you can set the optional regex parameter to False. Here the literal text 'a|dog' never occurs in the Series, so it is returned unchanged:

In [46]:
s3.str.replace('a|dog', 'XX-XX ', case=False, regex = False)

0       A
1       B
2       C
3    Aaba
4    Baca
5        
6     NaN
7    CABA
8     dog
9     cat
dtype: object

### Concatenation

#### Concatenating a single Series into a string

In [47]:
s = pd.Series(['a', 'b', 'c', 'd'])
s

0    a
1    b
2    c
3    d
dtype: object

### Cat method

In [49]:
s.str.cat(sep='')

'abcd'

In [51]:
s.str.cat(sep=',')

'a,b,c,d'

#### By default, missing values are ignored. Using na_rep, they can be given a representation:

In [56]:
t = pd.Series(['a', 'b', np.nan, 'd'])
t

0      a
1      b
2    NaN
3      d
dtype: object

In [57]:
t.str.cat(sep=',')


'a,b,d'

In [55]:
t.str.cat(sep = '',na_rep= '-' )

'ab-d'

## Concatenating a Series and something list-like into a Series

In [58]:
s

0    a
1    b
2    c
3    d
dtype: object

In [59]:
 s.str.cat(['A', 'B', 'C', 'D'])

0    aA
1    bB
2    cC
3    dD
dtype: object

## Indexing with .str

#### You can use [] notation to directly index by position locations. If you index past the end of the string, the result will be a NaN.

In [60]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan,
                   'CABA', 'dog', 'cat'])
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [61]:
s.str[0]

0      A
1      B
2      C
3      A
4      B
5    NaN
6      C
7      d
8      c
dtype: object

In [62]:
s.str[1]

0    NaN
1    NaN
2    NaN
3      a
4      a
5    NaN
6      A
7      o
8      a
dtype: object

## Extracting substrings

In [69]:
# A DataFrame is returned when expand=True
pd.Series(['a1', 'b2', 'c3']).str.extract(r'[ab](\d)', expand=True)

Unnamed: 0,0
0,1
1,2
2,


In [70]:
# A Series is returned when expand=False
pd.Series(['a1', 'b2', 'c3']).str.extract(r'[ab](\d)', expand=False)

0      1
1      2
2    NaN
dtype: object

## Testing for Strings that match or contain a pattern

In [71]:
pattern = r'[0-9][a-z]'

In [72]:
pd.Series(['1', '2', '3a', '3b', '03c']).str.contains(pattern)

0    False
1    False
2     True
3     True
4     True
dtype: bool

In [73]:
pd.Series(['1', '2', '3a', '3b', '03c']).str.match(pattern)

0    False
1    False
2     True
3     True
4    False
dtype: bool

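#### The distinction between match and contains is strictness: match relies on strict re.match, which only matches at the beginning of the string, while contains relies on re.search, which matches anywhere in the string. That is why '03c' is True for contains but False for match.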

#### Methods like match, contains, startswith, and endswith take an extra na argument so missing values can be considered True or False:

In [74]:
s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s4

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [75]:
s4.str.contains('A', na=False)

0     True
1    False
2    False
3     True
4    False
5    False
6     True
7    False
8    False
dtype: bool

## Creating indicator variables

#### You can extract dummy variables from string columns. For example if they are separated by a '|':

In [81]:
s = pd.Series(['a', 'a|b', np.nan, 'a|c'])
s

0      a
1    a|b
2    NaN
3    a|c
dtype: object

In [83]:
s.str.get_dummies(sep= '|')

Unnamed: 0,a,b,c
0,1,0,0
1,1,1,0
2,0,0,0
3,1,0,1


## Method summary

| Method | Description |
| --- | --- |
| cat() | Concatenate strings |
| split() | Split strings on delimiter |
| rsplit() | Split strings on delimiter working from the end of the string |
| get() | Index into each element (retrieve i-th element) |
| join() | Join strings in each element of the Series with passed separator |
| get_dummies() | Split strings on the delimiter returning DataFrame of dummy variables |
| contains() | Return boolean array if each string contains pattern/regex |
| replace() | Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence |
| repeat() | Duplicate values (s.str.repeat(3) equivalent to x * 3) |
| pad() | Add whitespace to left, right, or both sides of strings |
| center() | Equivalent to str.center |
| ljust() | Equivalent to str.ljust |
| rjust() | Equivalent to str.rjust |
| zfill() | Equivalent to str.zfill |
| wrap() | Split long strings into lines with length less than a given width |
| slice() | Slice each string in the Series |
| slice_replace() | Replace slice in each string with passed value |
| count() | Count occurrences of pattern |
| startswith() | Equivalent to str.startswith(pat) for each element |
| endswith() | Equivalent to str.endswith(pat) for each element |
| findall() | Compute list of all occurrences of pattern/regex for each string |
| match() | Call re.match on each element, returning a boolean |
| extract() | Call re.search on each element, returning DataFrame with one row for each element and one column for each regex capture group |
| extractall() | Call re.findall on each element, returning DataFrame with one row for each match and one column for each regex capture group |
| len() | Compute string lengths |
| strip() | Equivalent to str.strip |
| rstrip() | Equivalent to str.rstrip |
| lstrip() | Equivalent to str.lstrip |
| partition() | Equivalent to str.partition |
| rpartition() | Equivalent to str.rpartition |
| lower() | Equivalent to str.lower |
| casefold() | Equivalent to str.casefold |
| upper() | Equivalent to str.upper |
| find() | Equivalent to str.find |
| rfind() | Equivalent to str.rfind |
| index() | Equivalent to str.index |
| rindex() | Equivalent to str.rindex |
| capitalize() | Equivalent to str.capitalize |
| swapcase() | Equivalent to str.swapcase |
| normalize() | Return Unicode normal form. Equivalent to unicodedata.normalize |
| translate() | Equivalent to str.translate |
| isalnum() | Equivalent to str.isalnum |
| isalpha() | Equivalent to str.isalpha |
| isdigit() | Equivalent to str.isdigit |
| isspace() | Equivalent to str.isspace |
| islower() | Equivalent to str.islower |
| isupper() | Equivalent to str.isupper |
| istitle() | Equivalent to str.istitle |
| isnumeric() | Equivalent to str.isnumeric |
| isdecimal() | Equivalent to str.isdecimal |