In [1]:
import pandas as pd

# Source: a Google Sheets document published as CSV.
DATA_URL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vReZBM5OC6GLYbacisp_ToNiu3CLWxqPXw7mWBsdRjnYOFLWNufdQ4qd8u5qTzUF2_sBUAMEi5cgy1U/pub?gid=1040198428&single=true&output=csv'

# Read the sheet into a DataFrame and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a saved
# row index read back as data - consider index_col=0 once confirmed nothing later uses it.
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Separating Features: Strings
Pandas also provides string methods that can be applied to entire columns. They are accessed by putting .str before a common string method. For example, say we want separate columns for first and
last names. We can use .str.split(). By default it splits on white space and replaces each string with a list of pieces. By adding some arguments we can split on commas instead and expand the result into separate columns.
The first argument to Series.str.split() is the separator to split on. expand=True returns the pieces as separate columns (one per piece, so two here) rather than one column containing lists of strings.

In [2]:
# Split 'Name' (stored as "Last, First") into two new columns, LastName and FirstName.
# n=1 splits at the first comma only, so a name containing an additional comma still
# produces exactly two columns instead of breaking the two-column assignment.
# Everything after the comma goes into FirstName unchanged, including the space
# that followed the comma.
df[['LastName', 'FirstName']] = df['Name'].str.split(',', n=1, expand=True)
# Drop the original 'Name' column; reassigning the result avoids inplace=True.
df = df.drop(columns='Name')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,LastName,FirstName
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr. Owen Harris
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss. Laina
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,Futrelle,Mrs. Jacques Heath (Lily May Peel)
4,5,0,3,male,35.0,0,0,373450,8.05,,S,Allen,Mr. William Henry


In [None]:
# Clean-up
# Splitting 'Name' on the comma (rather than on white space) left the space that
# followed the comma in each FirstName string.
# Look at the first row's value to see it.
df.loc[0, 'FirstName']

In [3]:
# Series.str.strip() removes leading and trailing whitespace from every string in a column.
# Apply it to FirstName, then re-check the first row.
first_name_stripped = df['FirstName'].str.strip()
df['FirstName'] = first_name_stripped
df.loc[0, 'FirstName']

'Mr. Owen Harris'

In [4]:
# Combining Strings
# String columns can be joined with the '+' operator. Instead of the original
# 'Last Name, First Name' layout, build 'First Name Last Name': put the parts in
# the reverse order, leave out the comma, and add a single space between them.
full_name = df['FirstName'] + ' ' + df['LastName']
df['Name'] = full_name
# The two part columns are no longer needed once 'Name' is rebuilt.
part_columns = ['LastName', 'FirstName']
df.drop(columns=part_columns, inplace=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,Mr. Owen Harris Braund
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs. John Bradley (Florence Briggs Thayer) Cum...
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss. Laina Heikkinen
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,Mrs. Jacques Heath (Lily May Peel) Futrelle
4,5,0,3,male,35.0,0,0,373450,8.05,,S,Mr. William Henry Allen
