# Vectorized string operations

In [98]:
import numpy as np
import pandas as pd


In [99]:
# What are vectorized operations
a = np.array([1,2,3,4])
a * 4

array([ 4,  8, 12, 16])

In [100]:
# problem in vectorized opertions in python
s = ['cat','mat',None,'rat']

[i.startswith('c') for i in s]

AttributeError: 'NoneType' object has no attribute 'startswith'

In [None]:
# How pandas solves this issue?

s = pd.Series(['cat','mat',None,'rat'])  # fast and optimized

s.str.startswith('c')   # .str - string accessor



0     True
1    False
2     None
3    False
dtype: object

In [None]:
titanic = pd.read_csv(r'E:\Learn_Data_Science\Python_For_Data_Science\Pandas_Folder\Datasets_For_Pandas\titanic.csv')
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
titanic['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

# Common functions

### lower/upper/capitalize/title

In [None]:
### upper
titanic['Name'].str.upper()

0                                BRAUND, MR. OWEN HARRIS
1      CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH...
2                                 HEIKKINEN, MISS. LAINA
3           FUTRELLE, MRS. JACQUES HEATH (LILY MAY PEEL)
4                               ALLEN, MR. WILLIAM HENRY
                             ...                        
886                                MONTVILA, REV. JUOZAS
887                         GRAHAM, MISS. MARGARET EDITH
888             JOHNSTON, MISS. CATHERINE HELEN "CARRIE"
889                                BEHR, MR. KARL HOWELL
890                                  DOOLEY, MR. PATRICK
Name: Name, Length: 891, dtype: object

In [None]:
### lower
titanic['Name'].str.lower()

0                                braund, mr. owen harris
1      cumings, mrs. john bradley (florence briggs th...
2                                 heikkinen, miss. laina
3           futrelle, mrs. jacques heath (lily may peel)
4                               allen, mr. william henry
                             ...                        
886                                montvila, rev. juozas
887                         graham, miss. margaret edith
888             johnston, miss. catherine helen "carrie"
889                                behr, mr. karl howell
890                                  dooley, mr. patrick
Name: Name, Length: 891, dtype: object

In [None]:
### capitalized - first letter upper case
titanic['Name'].str.capitalize
titanic['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [None]:
### title - first letter of each word upper case
titanic['Name'].str.title()

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

### len

In [None]:
### len
titanic['Name'].str.len()


0      23
1      51
2      22
3      44
4      24
       ..
886    21
887    28
888    40
889    21
890    19
Name: Name, Length: 891, dtype: int64

In [None]:
titanic['Name'][titanic['Name'].str.len().sort_values(ascending=False).head(1)] # the longest name

82    McDermott, Miss. Brigdet Delia
Name: Name, dtype: object

In [None]:
titanic['Sex'][titanic['Name'].str.len().sort_values(ascending=False).head(1)] # sex of the person

82    female
Name: Sex, dtype: object

### strip()

In [None]:
### strip - remove the first and last extra spaces
a = 'Saiful Islam    Rupom'
b = '      Saiful     '
a.strip()
b.strip()


'Saiful'

In [None]:
titanic['Name'].str.strip() # clean the column

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

### split -> get

In [None]:
### syntax: .str.split(' ', n = 2) # here n = 2 means to control splitting, means split applying on first two- ' '
### syntax: .str.split(' ', expand= True) # expand= True is for show in a dataframe

In [None]:
### make - first name | surname | title

titanic['Name'].str.split(',')

0                             [Braund,  Mr. Owen Harris]
1      [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                              [Heikkinen,  Miss. Laina]
3        [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                            [Allen,  Mr. William Henry]
                             ...                        
886                             [Montvila,  Rev. Juozas]
887                      [Graham,  Miss. Margaret Edith]
888          [Johnston,  Miss. Catherine Helen "Carrie"]
889                             [Behr,  Mr. Karl Howell]
890                               [Dooley,  Mr. Patrick]
Name: Name, Length: 891, dtype: object

In [None]:
### extract the surname
titanic['Name'].str.split(',').get(0) 
titanic['Name'].str.split(',').str.get(0) # to fetch the first item in the list - extract surname

0         Braund
1        Cumings
2      Heikkinen
3       Futrelle
4          Allen
         ...    
886     Montvila
887       Graham
888     Johnston
889         Behr
890       Dooley
Name: Name, Length: 891, dtype: object

In [None]:
titanic['Last_name'] = titanic['Name'].str.split(',').str.get(0)
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Last_name
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Heikkinen
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Futrelle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Allen
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Montvila
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Graham
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Johnston
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Behr


In [None]:
### split the title name and first name
### extract the title name and first name

titanic['Name'].str.split(',').str.get(1)

0                                  Mr. Owen Harris
1       Mrs. John Bradley (Florence Briggs Thayer)
2                                      Miss. Laina
3               Mrs. Jacques Heath (Lily May Peel)
4                                Mr. William Henry
                          ...                     
886                                    Rev. Juozas
887                           Miss. Margaret Edith
888                 Miss. Catherine Helen "Carrie"
889                                Mr. Karl Howell
890                                    Mr. Patrick
Name: Name, Length: 891, dtype: object

In [None]:
titanic['Name'].str.split(',').str.get(1).str.split('.') # splitting between title name and first name

0                                 [ Mr,  Owen Harris]
1      [ Mrs,  John Bradley (Florence Briggs Thayer)]
2                                     [ Miss,  Laina]
3              [ Mrs,  Jacques Heath (Lily May Peel)]
4                               [ Mr,  William Henry]
                            ...                      
886                                   [ Rev,  Juozas]
887                          [ Miss,  Margaret Edith]
888                [ Miss,  Catherine Helen "Carrie"]
889                               [ Mr,  Karl Howell]
890                                   [ Mr,  Patrick]
Name: Name, Length: 891, dtype: object

In [None]:
titanic['Name'].str.split(',').str.get(1).str.split('.').str.get(0) # extract the title name


0         Mr
1        Mrs
2       Miss
3        Mrs
4         Mr
       ...  
886      Rev
887     Miss
888     Miss
889       Mr
890       Mr
Name: Name, Length: 891, dtype: object

In [None]:
titanic['Name'].str.split(',').str.get(1).str.split('.').str.get(0) + '.'

0         Mr.
1        Mrs.
2       Miss.
3        Mrs.
4         Mr.
        ...  
886      Rev.
887     Miss.
888     Miss.
889       Mr.
890       Mr.
Name: Name, Length: 891, dtype: object

In [None]:
titanic['Name'].str.split(',').str.get(1).str.split('.').str.get(1) # extract the first name

0                                 Owen Harris
1       John Bradley (Florence Briggs Thayer)
2                                       Laina
3               Jacques Heath (Lily May Peel)
4                               William Henry
                        ...                  
886                                    Juozas
887                            Margaret Edith
888                  Catherine Helen "Carrie"
889                               Karl Howell
890                                   Patrick
Name: Name, Length: 891, dtype: object

In [106]:
titanic['Title_name'] = titanic['Name'].str.split(',').str.get(1).str.split('.').str.get(0) + '.'
titanic['First_name'] = titanic['Name'].str.split(',').str.get(1).str.split('.').str.get(1).str.strip()

In [107]:
titanic.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Last_name,Title_name,First_name
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr.,Owen Harris
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs.,John Bradley (Florence Briggs Thayer)
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss.,Laina


In [None]:
titanic['Title_name'].value_counts()

Title_name
Mr.              517
Miss.            182
Mrs.             125
Master.           40
Dr.                7
Rev.               6
Col.               2
Mlle.              2
Major.             2
Ms.                1
Mme.               1
Don.               1
Lady.              1
Sir.               1
Capt.              1
the Countess.      1
Jonkheer.          1
Name: count, dtype: int64

### replace - make permanent change - so becareful

In [None]:
### replace - Ms. and Mlle. as Miss.

# titanic['Title_name'] = titanic['Title_name'].str.replace('Ms.','Miss.')
# titanic['Title_name'] = titanic['Title_name'].str.replace('Mlle.','Miss.')

In [None]:
titanic['Title_name'].value_counts()

Title_name
Mr.              517
Miss.            185
Mrs.             125
Master.           40
Dr.                7
Rev.               6
Major.             2
Col.               2
Don.               1
Lady.              1
Mme.               1
Sir.               1
Capt.              1
the Countess.      1
Jonkheer.          1
Name: count, dtype: int64

# Filtering
startswith / endswith

isdigit / isalpha

In [108]:
### Find first name starting with letter "A"
titanic['First_name'].str.startswith('A')


0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: First_name, Length: 891, dtype: bool

In [110]:
titanic[titanic['First_name'].str.startswith('A')].head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Last_name,Title_name,First_name
13,14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.275,,S,Andersson,Mr.,Anders Johan
22,23,1,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q,McGowan,Miss.,"Anna ""Annie"""
35,36,0,1,"Holverson, Mr. Alexander Oskar",male,42.0,1,0,113789,52.0,,S,Holverson,Mr.,Alexander Oskar


In [111]:
### Find first name ends with letter "A"
titanic[titanic['First_name'].str.endswith('A')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Last_name,Title_name,First_name
64,65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C,Stewart,Mr.,Albert A
303,304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q,Keane,Miss.,Nora A


# Advanced level filtering

### Applying regular expression
ex: contains

In [None]:
### Search 'john' (both case)

titanic['Name'].str.contains('john', case= False) # case= False means no case sensitivity


0      False
1       True
2      False
3      False
4      False
       ...  
886    False
887    False
888     True
889    False
890    False
Name: Name, Length: 891, dtype: bool

In [113]:
titanic['Name'][titanic['Name'].str.contains('john', case= False)]

1      Cumings, Mrs. John Bradley (Florence Briggs Th...
8      Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
41     Turpin, Mrs. William John Robert (Dorothy Ann ...
45                              Rogers, Mr. William John
98                  Doling, Mrs. John T (Ada Julia Bone)
112                               Barton, Mr. David John
117                      Turpin, Mr. William John Robert
160                             Cribb, Mr. John Hatfield
162                           Bengtsson, Mr. John Viktor
165      Goldsmith, Master. Frank John William "Frankie"
168                                  Baumann, Mr. John D
172                         Johnson, Miss. Eleanor Ileen
188                                     Bourke, Mr. John
212                               Perkin, Mr. John Henry
226                            Mellors, Mr. William John
227                      Lovell, Mr. John Hall ("Henry")
302                      Johnson, Mr. William Cahoone Jr
324                            

In [None]:
### Find surname starts and ends with vowels (a,e,i,o,u)

(titanic['Last_name'].str.startswith(('a','e','i','o','u','A','E','I','O','U'))) & (titanic['Last_name'].str.endswith(('a','e','i','o','u','A','E','I','O','U')))


0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Last_name, Length: 891, dtype: bool

In [119]:
titanic[(titanic['Last_name'].str.startswith(('a','e','i','o','u','A','E','I','O','U'))) & (titanic['Last_name'].str.endswith(('a','e','i','o','u','A','E','I','O','U')))]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Last_name,Title_name,First_name
30,31,0,1,"Uruchurtu, Don. Manuel E",male,40.0,0,0,PC 17601,27.7208,,C,Uruchurtu,Don.,Manuel E
49,50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18.0,1,0,349237,17.8,,S,Arnold-Franchi,Mrs.,Josef (Josefine Franchi)
207,208,1,3,"Albimona, Mr. Nassef Cassem",male,26.0,0,0,2699,18.7875,,C,Albimona,Mr.,Nassef Cassem
210,211,0,3,"Ali, Mr. Ahmed",male,24.0,0,0,SOTON/O.Q. 3101311,7.05,,S,Ali,Mr.,Ahmed
353,354,0,3,"Arnold-Franchi, Mr. Josef",male,25.0,1,0,349237,17.8,,S,Arnold-Franchi,Mr.,Josef
493,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,Artagaveytia,Mr.,Ramon
518,519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes H...",female,36.0,1,0,226875,26.0,,S,Angle,Mrs.,"William A (Florence ""Mary"" Agnes Hughes)"
784,785,0,3,"Ali, Mr. William",male,25.0,0,0,SOTON/O.Q. 3101312,7.05,,S,Ali,Mr.,William
840,841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,,S,Alhomaki,Mr.,Ilmari Rudolf


In [120]:
### Alternative way
titanic['Last_name'].str.contains('^[aeiouAEIOU].+[aeiouAEIOU]$')

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Last_name, Length: 891, dtype: bool

In [121]:
titanic[titanic['Last_name'].str.contains('^[aeiouAEIOU].+[aeiouAEIOU]$')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Last_name,Title_name,First_name
30,31,0,1,"Uruchurtu, Don. Manuel E",male,40.0,0,0,PC 17601,27.7208,,C,Uruchurtu,Don.,Manuel E
49,50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18.0,1,0,349237,17.8,,S,Arnold-Franchi,Mrs.,Josef (Josefine Franchi)
207,208,1,3,"Albimona, Mr. Nassef Cassem",male,26.0,0,0,2699,18.7875,,C,Albimona,Mr.,Nassef Cassem
210,211,0,3,"Ali, Mr. Ahmed",male,24.0,0,0,SOTON/O.Q. 3101311,7.05,,S,Ali,Mr.,Ahmed
353,354,0,3,"Arnold-Franchi, Mr. Josef",male,25.0,1,0,349237,17.8,,S,Arnold-Franchi,Mr.,Josef
493,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,Artagaveytia,Mr.,Ramon
518,519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes H...",female,36.0,1,0,226875,26.0,,S,Angle,Mrs.,"William A (Florence ""Mary"" Agnes Hughes)"
784,785,0,3,"Ali, Mr. William",male,25.0,0,0,SOTON/O.Q. 3101312,7.05,,S,Ali,Mr.,William
840,841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,,S,Alhomaki,Mr.,Ilmari Rudolf


### slicing

In [124]:
### Find the first four characters of names

titanic['Name'].str[:4]

0      Brau
1      Cumi
2      Heik
3      Futr
4      Alle
       ... 
886    Mont
887    Grah
888    John
889    Behr
890    Dool
Name: Name, Length: 891, dtype: object

In [None]:
titanic['Name'].str[::2] # alternate character

0                    Ban,M.Oe ars
1      Cmns r.Jh rde Foec rgsTae)
2                     Hiknn is an
3          Ftel,Ms aqe et Ll a el
4                    Aln r ila er
                  ...            
886                   Mnvl,Rv uzs
887                Gaa,Ms.Mrae dt
888          Jhso,Ms.CteieHln"are
889                   Bh,M.Kr oel
890                    Doe,M.Ptik
Name: Name, Length: 891, dtype: object

In [127]:
titanic['Name'].str[::-1] # reverse

0                                sirraH newO .rM ,dnuarB
1      )reyahT sggirB ecnerolF( yeldarB nhoJ .srM ,sg...
2                                 aniaL .ssiM ,nenikkieH
3           )leeP yaM yliL( htaeH seuqcaJ .srM ,ellertuF
4                               yrneH mailliW .rM ,nellA
                             ...                        
886                                sazouJ .veR ,alivtnoM
887                         htidE teragraM .ssiM ,maharG
888             "eirraC" neleH enirehtaC .ssiM ,notsnhoJ
889                                llewoH lraK .rM ,rheB
890                                  kcirtaP .rM ,yelooD
Name: Name, Length: 891, dtype: object