### Pandas-Series String functions

#### In this tutorial we cover
* str.lower(), str.upper(), str.capitalize(), Concatenate(),
* contains(), count(), decode(), str.extractall(), 
* str.startswith(), str.endswith(), str.findall(), str.get_dummies(),
* categories(), cat.reorder_categories(), cat.add_categories()

In [1]:
import numpy as np
import pandas as pd

In [2]:
names = pd.Series(data = ['Sachin', 'Rohit', 'Virat', 'Sunil', 'Kapil'])
names

0    Sachin
1     Rohit
2     Virat
3     Sunil
4     Kapil
dtype: object

In [3]:
# lower case()
names.str.lower()

0    sachin
1     rohit
2     virat
3     sunil
4     kapil
dtype: object

In [4]:
# Upper case()
names.str.upper()

0    SACHIN
1     ROHIT
2     VIRAT
3     SUNIL
4     KAPIL
dtype: object

In [5]:
# Capitalize()
names.str.capitalize()

0    Sachin
1     Rohit
2     Virat
3     Sunil
4     Kapil
dtype: object

In [9]:
surnames = pd.Series(['Tendulkar', 'Sharma', 'Kohli', 'Gavaskar', 'Dev', 'Dhoni'])
surnames

0    Tendulkar
1       Sharma
2        Kohli
3     Gavaskar
4          Dev
5        Dhoni
dtype: object

In [48]:
# Concatenate()
players = surnames.str.cat(names, sep=',', na_rep='_')
players

0    Tendulkar,Sachin
1        Sharma,Rohit
2         Kohli,Virat
3      Gavaskar,Sunil
4           Dev,Kapil
5             Dhoni,_
dtype: object

In [12]:
X = pd.Series(['T20', 'A1', 'Fine', 'Nine', '99'])
X

0     T20
1      A1
2    Fine
3    Nine
4      99
dtype: object

In [14]:
# Contains
X[X.str.contains('\d', regex=True)]

0    T20
1     A1
4     99
dtype: object

In [15]:
# Count
X.str.count('\d')

0    2
1    1
2    0
3    0
4    2
dtype: int64

In [18]:
print(Bharat_Hindi)

b'\xe0\xa4\xad\xe0\xa4\xbe\xe0\xa4\xb0\xe0\xa4\xa4'


In [23]:
print(Jai_Hind)

b'\xe0\xa4\x9c\xe0\xa4\xaf \xe0\xa4\xb9\xe0\xa4\xbf\xe0\xa4\xa8\xe0\xa5\x8d\xe0\xa4\xa6'


In [22]:
# decode()
pd.Series([Bharat_Hindi, Jai_Hind]).str.decode('utf-8')

0        भारत
1    जय हिन्द
dtype: object

In [30]:
X

0     T20
1      A1
2    Fine
3    Nine
4      99
dtype: object

In [34]:
# Extract all digits from each string
X.str.extractall(pat=r'(\d+)').reset_index(drop=True)

Unnamed: 0,0
0,20
1,1
2,99


In [41]:
# Find string which start with pattern A
X[X.str.startswith(pat='A')]

1    A1
dtype: object

In [43]:
# Find sttrings which end with pattern as 0
X[X.str.endswith(pat='0')]

0    T20
dtype: object

In [46]:
# Within each string find all occurances of a digit
X.str.findall(r'\d')

0    [2, 0]
1       [1]
2        []
3        []
4    [9, 9]
dtype: object

In [47]:
# return by position
X.str.get(-1)

0    0
1    1
2    e
3    e
4    9
dtype: object

In [52]:
# get_dummies: splits strings based on separator and returns a dataframe

y = pd.Series(['Tall', 'Small', 'Medium', 'Tall', 'Medium'])

In [53]:
y.str.get_dummies(sep=',')

Unnamed: 0,Medium,Small,Tall
0,0,0,1
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0


### Categories

In [86]:
y

0      Tall
1     Small
2    Medium
3      Tall
4    Medium
dtype: object

In [87]:
# Change elemenets of series as category
y_cat = y.astype('category')
y_cat

0      Tall
1     Small
2    Medium
3      Tall
4    Medium
dtype: category
Categories (3, object): [Medium, Small, Tall]

In [64]:
# Create a new orderrd categorical data type
height_cat = pd.api.types.CategoricalDtype(categories=['Small', 'Medium', 'Tall'], ordered=True)

In [67]:
# create series based on ordered category
y_ordcat = y.astype(height_cat)
y_ordcat

0      Tall
1     Small
2    Medium
3      Tall
4    Medium
dtype: category
Categories (3, object): [Small < Medium < Tall]

In [69]:
# Rename Categories
y_ordcat.cat.rename_categories(['S','M','T'])

0    T
1    S
2    M
3    T
4    M
dtype: category
Categories (3, object): [S < M < T]

In [70]:
# Reorder Categories 
y_ordcat.cat.reorder_categories(['Tall','Medium','Small'])

0      Tall
1     Small
2    Medium
3      Tall
4    Medium
dtype: category
Categories (3, object): [Tall < Medium < Small]

In [77]:
# add_categories

pd.Series(['Tall', 'Small','Medium','V Tall']).astype(height_cat).cat.add_categories(['V Tall'])

0      Tall
1     Small
2    Medium
3       NaN
dtype: category
Categories (4, object): [Small < Medium < Tall < V Tall]

In [79]:
y_newordcat = y_ordcat.cat.add_categories(['V Tall'])
y_newordcat

0      Tall
1     Small
2    Medium
3      Tall
4    Medium
dtype: category
Categories (4, object): [Small < Medium < Tall < V Tall]

In [84]:
y_newordcat[5] = 'V Tall'

In [85]:
y_newordcat

0      Tall
1     Small
2    Medium
3      Tall
4    Medium
5    V Tall
dtype: object