---

_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-text-mining/resources/d9pwm) course resource._

---

# Working with Text Data in pandas

In [13]:
import pandas as pd

# Five diary-style sentences, one per weekday, each mentioning at least one
# clock time. They are the working corpus for the string-method demos below.
time_sentences = [
    "Monday: The doctor's appointment is at 2:45pm.",
    "Tuesday: The dentist's appointment is at 11:30 am.",
    "Wednesday: At 7:00pm, there is a basketball game!",
    "Thursday: Be back home by 11:15 pm at the latest.",
    "Friday: Take the train at 08:10 am, arrive at 09:00am.",
]

# Single-column frame; the dict key supplies the column label 'text'.
df = pd.DataFrame({"text": time_sentences})
df

# A second, smaller corpus: one sentence without a clock time and three with one.
# The first sentence ends with the single emoji character U+1F624, which makes
# it 13 characters long in total.
s = [
    "It's Sunday 😤",
    "I am laying in sofa with my sleepy pet at 2:45 pm",
    "I bunked statistics class and walked towards the big tree at 5:25 pm",
    "I had a chicken dinner at 7:45 pm",
]

# One-column frame labelled 'sent'; head(5) shows every row because there are only four.
d = pd.DataFrame(s, columns=['sent'])
d.head(5)

Unnamed: 0,sent
0,It's Sunday 😤
1,I am laying in sofa with my sleepy pet at 2:45 pm
2,I bunked statistics class and walked towards t...
3,I had a chicken dinner at 7:45 pm


In [14]:
# Character count of every sentence in df["text"]. The value is computed but
# not rendered, because only a cell's final expression is displayed.
df["text"].str.len()

# Same measurement for d["sent"]; this is the result the cell shows.
d["sent"].str.len()

0    13
1    49
2    68
3    33
Name: sent, dtype: int64

In [16]:
# Token count per sentence in df["text"]: split, then measure each token list.
(
    df["text"]
    .str.split()   # no separator given -> split on runs of whitespace
    .str.len()     # number of tokens in each resulting list
)

# Same pipeline for d["sent"]; as the last expression, this is what is displayed.
(
    d["sent"]
    .str.split()
    .str.len()
)

0     3
1    12
2    13
3     8
Name: sent, dtype: int64

In [18]:
# Boolean mask over df["text"]: True where the sentence contains 'appointment'.
df["text"].str.contains("appointment")

# Boolean mask over d["sent"]: True where a capital 'I' occurs anywhere in the
# sentence (a character match, not a whole-word match).
d["sent"].str.contains("I")

0    True
1    True
2    True
3    True
Name: sent, dtype: bool

In [19]:
# Per-sentence tally of digit characters (regex \d) in df["text"].
df["text"].str.count(r"\d")

# Same tally for d["sent"]; this final expression is the one rendered.
d["sent"].str.count(r"\d")

0    0
1    3
2    3
3    3
Name: sent, dtype: int64

In [21]:
# Collect every occurrence of a digit in each sentence of df["text"];
# each row becomes a list of one-character strings.
df["text"].str.findall(r"\d")

# Same extraction for d["sent"]; a sentence without digits yields an empty list.
d["sent"].str.findall(r"\d")

0           []
1    [2, 4, 5]
2    [5, 2, 5]
3    [7, 4, 5]
Name: sent, dtype: object

In [23]:
# Capture hours and minutes separately: with two groups in the pattern,
# every match is returned as an (hour, minute) tuple of strings.
df["text"].str.findall(r"(\d?\d):(\d\d)")

# Same grouping applied to d["sent"]; this is the result the cell displays.
d["sent"].str.findall(r"(\d?\d):(\d\d)")

0           []
1    [(2, 45)]
2    [(5, 25)]
3    [(7, 45)]
Name: sent, dtype: object

In [24]:
# Replace every weekday name with '???'.
# The pattern is a regular expression (one or more word characters ending in
# 'day' at a word boundary), so regex=True is passed explicitly: since
# pandas 2.0, Series.str.replace treats the pattern as a literal string by
# default and would otherwise leave the text unchanged.
# The regex keyword requires pandas >= 0.23.
df['text'].str.replace(r'\w+day\b', '???', regex=True)

# Same substitution for d['sent']; as the last expression, this is what is displayed.
d['sent'].str.replace(r'\w+day\b', '???', regex=True)

0                                           It's ??? 😤
1    I am laying in sofa with my sleepy pet at 2:45 pm
2    I bunked statistics class and walked towards t...
3                    I had a chicken dinner at 7:45 pm
Name: sent, dtype: object

In [9]:
# Replace each weekday name with its three-letter abbreviation.
# The callable receives the regex match object and returns the replacement
# text: here, the first three characters of the captured weekday.
# regex=True is required: pandas 2.0+ defaults to literal matching and raises
# ValueError when a callable replacement is combined with regex=False.
# The regex keyword requires pandas >= 0.23.
df['text'].str.replace(r'(\w+day\b)', lambda m: m.group(1)[:3], regex=True)

0          Mon: The doctor's appointment is at 2:45pm.
1       Tue: The dentist's appointment is at 11:30 am.
2          Wed: At 7:00pm, there is a basketball game!
3         Thu: Be back home by 11:15 pm at the latest.
4    Fri: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [25]:
# Create new columns from the FIRST match of the extracted groups:
# column 0 holds the hour, column 1 the minutes; rows without a match are NaN.
# expand=True is spelled out so the call always returns a DataFrame with one
# column per capture group and never depends on the version-specific default
# (older pandas releases emitted a FutureWarning when it was omitted).
df['text'].str.extract(r'(\d?\d):(\d\d)', expand=True)

# Same extraction for d['sent']; as the last expression, this is what is displayed.
d['sent'].str.extract(r'(\d?\d):(\d\d)', expand=True)

  
  after removing the cwd from sys.path.


Unnamed: 0,0,1
0,,
1,2.0,45.0
2,5.0,25.0
3,7.0,45.0


In [27]:
# Pull out every time mention in df["text"], one result row per match.
# Group 0 is the whole time, group 1 the hour, group 2 the minutes and
# group 3 the am/pm period (an optional space may precede the period).
df["text"].str.extractall(
    r"((\d?\d):(\d\d) ?([ap]m))"
)

# Same extraction for d["sent"]; sentences with no match contribute no rows.
d["sent"].str.extractall(
    r"((\d?\d):(\d\d) ?([ap]m))"
)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,2:45 pm,2,45,pm
2,0,5:25 pm,5,25,pm
3,0,7:45 pm,7,45,pm


In [28]:
# Same extraction as with positional groups, but each group is named, so the
# resulting columns are labelled time / hour / minute / period.
df["text"].str.extractall(
    r"(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))"
)

# Named-group extraction for d["sent"]; this final expression is the one rendered.
d["sent"].str.extractall(
    r"(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))"
)

Unnamed: 0_level_0,Unnamed: 1_level_0,time,hour,minute,period
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,2:45 pm,2,45,pm
2,0,5:25 pm,5,25,pm
3,0,7:45 pm,7,45,pm
