In [6]:
tweet = "@nltk Text analysis is awesome! #regex #pandas #python"

words = tweet.split(' ')
hashtags=[word for word in words if word.startswith('#')]
print(hashtags)

['#regex', '#pandas', '#python']


In [7]:
hashtags = [word for word in tweet.split() if word.startswith('#')]

In [8]:
print(hashtags)

['#regex', '#pandas', '#python']


In [11]:
#using RE for the same thing
import re

hashtags_re = [word for word in tweet.split() if re.search('#[A-Za-z0-9]+', word)]
print(hashtags_re)

['#regex', '#pandas', '#python']


# Working with text in Pandas dataframe

In [12]:
import pandas as pd

time_sentences = ["Monday: The doctor's appointment is at 2:45pm.", 
                  "Tuesday: The dentist's appointment is at 11:30 am.",
                  "Wednesday: At 7:00pm, there is a basketball game!",
                  "Thursday: Be back home by 11:15 pm at the latest.",
                  "Friday: Take the train at 08:10 am, arrive at 09:00am."]

df = pd.DataFrame(time_sentences, columns=['text'])
df

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30...
2,"Wednesday: At 7:00pm, there is a basketball game!"
3,Thursday: Be back home by 11:15 pm at the latest.
4,"Friday: Take the train at 08:10 am, arrive at ..."


In [13]:
df['text'].str.len()

0    46
1    50
2    49
3    49
4    54
Name: text, dtype: int64

In [14]:
df['text'].str.split().str.len()

0     7
1     8
2     8
3    10
4    10
Name: text, dtype: int64

In [15]:
df['text'].str.count(r'\d')

0    3
1    4
2    3
3    4
4    8
Name: text, dtype: int64

In [16]:
## Find times mentioned in the text
df['text'].str.findall(r'(\d?\d):(\d\d)')

0               [(2, 45)]
1              [(11, 30)]
2               [(7, 00)]
3              [(11, 15)]
4    [(08, 10), (09, 00)]
Name: text, dtype: object

In [17]:
#Replace names of days of week with ???
df['text'].str.replace(r'\w+day\b', '???')

0          ???: The doctor's appointment is at 2:45pm.
1       ???: The dentist's appointment is at 11:30 am.
2          ???: At 7:00pm, there is a basketball game!
3         ???: Be back home by 11:15 pm at the latest.
4    ???: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [22]:
df['text'].str.findall(r'(\w+day\b)')

0       [Monday]
1      [Tuesday]
2    [Wednesday]
3     [Thursday]
4       [Friday]
Name: text, dtype: object

In [18]:
# Replace name of full day of week with short form

df['text'].str.replace(r'(\w+day\b)', lambda x: x.groups()[0][:3])

0          Mon: The doctor's appointment is at 2:45pm.
1       Tue: The dentist's appointment is at 11:30 am.
2          Wed: At 7:00pm, there is a basketball game!
3         Thu: Be back home by 11:15 pm at the latest.
4    Fri: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [23]:
# extract creates a new columns 
# for ex to extract out hours:
df['text'].str.extract(r'(\d?\d):(\d\d)')

#extract only extracts the first match


Unnamed: 0,0,1
0,2,45
1,11,30
2,7,0
3,11,15
4,8,10


In [25]:
#testing what happens if there is no match
df['text'].str.extract(r'(home)')

Unnamed: 0,0
0,
1,
2,
3,home
4,


In [26]:
#you can extract all occurances and make columns like this:

df['text'].str.extractall(r'((\d?\d):(\d\d) ?([ap]?m))')

#this produces a multiindex data frame: https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


In [30]:
#named grous: use special sequence ?P<> to indicate the name of the group

# for example, recreating the previous one with 
df['text'].str.extractall(r'(?P<time>(?P<hour>\d?\d):(?P<minutes>\d\d) ?(ap)m)')

Unnamed: 0_level_0,Unnamed: 1_level_0,time,hour,minutes,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [28]:
native_name = "प्रसाद"

english_name = "Prasad"

print(native_name, '\n', english_name)


प्रसाद 
 Prasad
