# Python Pandas extract URL or date by regex

In [1]:
import pandas as pd

# Load the log lines from the CSV file without any preprocessing
csv_path = '../csv/url_dates.csv'
result = pd.read_csv(csv_path)

In [2]:
# Display options: show every column and row and never truncate cell text,
# so the long log lines and the extracted URLs stay fully visible.
pd.set_option('display.max_columns', None)  # or a fixed limit such as 1000
pd.set_option('display.max_rows', None)  # or a fixed limit such as 1000
# None means "no limit" (pandas >= 1.0). The legacy value -1 was deprecated in
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)  # or a fixed width such as 199

In [3]:
# Preview the first five rows of the loaded data
result.head(5)

Unnamed: 0,log
0,2019-10-28 19:56:03 DEMO <GET https://www.wikipedia.org/> (The Free Encyclopedia) 2019-10-29 9:06:03
1,"2019-10-29 19:56:03 DEMO <GET https://en.wikipedia.org/wiki/Main_Page> (5,962,233 articles in English) 2019-10-31 11:16:43"
2,2019-10-29 19:56:03 DEMO <GET https://it.wikipedia.org/wiki/Pagina_principale> (1 561 730 voci in italiano) 2019-10-30 21:15:23
3,2019-10-30 19:56:03 DEMO <GET https://pt.wikipedia.org/wiki/Wikip%C3%A9dia:P%C3%A1gina_principal> (1 014 783 artigos em português) 2019-10-30 20:26:35


# URL extraction from a DataFrame

In [4]:
# Extract the URL from each log line: capture from the "https" protocol up to
# (but not including) the closing ">".
# - The lazy ".*?" stops at the first ">", so a later ">" on the same line
#   cannot be pulled into the URL.
# - expand=False returns a Series, which is what a single-column assignment needs.
# - No .head() here: truncating before the assignment would leave every row
#   past the fifth with NaN in the new column.
result['url'] = result.log.str.extract(r'(https.*?)>', expand=False)

In [5]:
# Filter the rows by URL host, if needed.
# regex=False matches the text literally (as a regex, each "." would match any
# character); na=False treats rows without an extracted URL as non-matching
# instead of letting NaN break the boolean mask.
result[result['url'].str.contains('it.wikipedia.org', regex=False, na=False)]

Unnamed: 0,log,url
2,2019-10-29 19:56:03 DEMO <GET https://it.wikipedia.org/wiki/Pagina_principale> (1 561 730 voci in italiano) 2019-10-30 21:15:23,https://it.wikipedia.org/wiki/Pagina_principale


In [6]:
# Extract the URL from each log line: capture from the "https" protocol up to
# (but not including) the closing ">".
# - The lazy ".*?" stops at the first ">", so a later ">" on the same line
#   cannot be pulled into the URL.
# - expand=False returns a Series, which is what a single-column assignment needs.
# - No .head() here: truncating before the assignment would leave every row
#   past the fifth with NaN in the new column.
result['url'] = result.log.str.extract(r'(https.*?)>', expand=False)

In [7]:
# Example: a generic URL pattern that accepts http/https URLs as well as bare
# "www." hosts. Its trailing [^\s]{2,} runs up to the next whitespace, so the
# closing ">" of the log entry ends up inside the captured text.
generic_url_pattern = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
result.log.str.extract(generic_url_pattern).head()


Unnamed: 0,0
0,https://www.wikipedia.org/>
1,https://en.wikipedia.org/wiki/Main_Page>
2,https://it.wikipedia.org/wiki/Pagina_principale>
3,https://pt.wikipedia.org/wiki/Wikip%C3%A9dia:P%C3%A1gina_principal>


In [8]:
# Example: a URL pattern with one capture group per part (scheme, optional
# credentials, host, optional port, optional path). extract() returns one
# column per capture group, six in total here.
grouped_url_pattern = r'(ftp|http|https):\/\/(\w+:{0,1}\w*@)?(\S+)(:[0-9]+)?(\/|\/([\w#!:.?+=&%@!\-\/]))?'
result.log.str.extract(grouped_url_pattern).head()


Unnamed: 0,0,1,2,3,4,5
0,https,,www.wikipedia.org/>,,,
1,https,,en.wikipedia.org/wiki/Main_Page>,,,
2,https,,it.wikipedia.org/wiki/Pagina_principale>,,,
3,https,,pt.wikipedia.org/wiki/Wikip%C3%A9dia:P%C3%A1gina_principal>,,,


# Date extraction from a DataFrame

In [9]:
# Store the first YYYY-MM-DD date found in each log line in a "date" column
date_pattern = r'(\d{4}-\d{2}-\d{2})'
result['date'] = result['log'].str.extract(date_pattern)

In [10]:
# Show the extracted dates
result.loc[:, 'date']

0    2019-10-28
1    2019-10-29
2    2019-10-29
3    2019-10-30
Name: date, dtype: object

In [11]:
# Collect every YYYY-MM-DD date in each log line. extractall() returns one row
# per match, indexed by (original row, match number).
result['log'].str.extractall(r'(\d{4}-\d{2}-\d{2})')

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
0,0,2019-10-28
0,1,2019-10-29
1,0,2019-10-29
1,1,2019-10-31
2,0,2019-10-29
2,1,2019-10-30
3,0,2019-10-30
3,1,2019-10-30


In [12]:
# Extract all dates, then pivot the "match" index level into columns so each
# log line is a single row again.
all_dates = result['log'].str.extractall(r'(\d{4}-\d{2}-\d{2})')
all_dates.unstack()

Unnamed: 0_level_0,0,0
match,0,1
0,2019-10-28,2019-10-29
1,2019-10-29,2019-10-31
2,2019-10-29,2019-10-30
3,2019-10-30,2019-10-30


In [13]:
# Capture the first full "YYYY-MM-DD HH:MM:SS" timestamp of each log line
datetime_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
result['datetime'] = result['log'].str.extract(datetime_pattern)

In [14]:
# Show the extracted timestamps
result.loc[:, 'datetime']

0    2019-10-28 19:56:03
1    2019-10-29 19:56:03
2    2019-10-29 19:56:03
3    2019-10-30 19:56:03
Name: datetime, dtype: object

In [15]:
# match datetime extract only date
result['date'] = result.log.str.extract(r'(\d{4}-\d{2}-\d{2}) (?:\d{2}-\d{2}-\d{2})')

In [16]:
result['date']

0    NaN
1    NaN
2    NaN
3    NaN
Name: date, dtype: object

In [17]:
# Match the first datetime of each log line and keep its two parts separately:
# capture group 1 goes to "date", capture group 2 goes to "time".
date_time_parts = result['log'].str.extract(r'(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2})')
result[['date', 'time']] = date_time_parts

In [18]:
# Show the separated date and time columns
result.loc[:, ['date', 'time']]

Unnamed: 0,date,time
0,2019-10-28,19:56:03
1,2019-10-29,19:56:03
2,2019-10-29,19:56:03
3,2019-10-30,19:56:03


# Split URLs

In [19]:
# Rebuild the URL with plain string splitting instead of a regex: keep what
# follows the first "https", cut it at the first ">", then put the "https"
# prefix back.
# n=1 with the .str[...] accessors (rather than expand=True and column lookups)
# keeps a URL intact when "https" occurs again inside it, and yields NaN
# instead of raising KeyError when no log line contains a separator.
after_scheme = result.log.str.split('https', n=1).str[1]
result['url_split'] = 'https' + after_scheme.str.split('>', n=1).str[0]

In [20]:
# Show the URLs rebuilt by splitting
result.loc[:, 'url_split']

0    https://www.wikipedia.org/                                        
1    https://en.wikipedia.org/wiki/Main_Page                           
2    https://it.wikipedia.org/wiki/Pagina_principale                   
3    https://pt.wikipedia.org/wiki/Wikip%C3%A9dia:P%C3%A1gina_principal
Name: url_split, dtype: object