In [1]:
import itertools # iterators for efficient looping
import pandas as pd # primary data structure library
import re # match or find other strings or sets of strings

In [2]:
# Load the news-details CSV into a pandas DataFrame and preview it.
df = pd.read_csv('THE_HINDU_AA_NEWS_DETAILS.csv')
df.head()  # first five rows

Unnamed: 0,Headline,Authors,Main_Image,Published_Date,Meetings,News,Source_urls
0,"Engagements for Sunday, March 31",['To Run A.M. Until P.M.'],https://www.thehindu.com/static/theme/default/...,2019-03-31 01:07:44,"Meetings, Assemption School, St. Teresa Church...","RELIGION\r\nRamayanam: Mathivannan, Asthika Sa...",https://www.thehindu.com/news/cities/chennai/e...
1,Chennai Today,[],https://www.thehindu.com/static/theme/default/...,2019-03-30 00:00:00,"Meetings, Community Service Centre, Balfour Rd...","Religion\r\nKamba Ramayanam: Mathivannan, Asth...",https://www.thehindu.com/todays-paper/tp-natio...
2,Madurai Today,[],https://www.thehindu.com/static/theme/default/...,2019-03-30 00:00:00,"Meeting; Ahana Hospital, Anna Bus Stand; Quali...",RELIGION\r\nMeenakshi Sundareswarar Temple: Th...,https://www.thehindu.com/todays-paper/tp-natio...
3,engagements,[],https://www.thehindu.com/static/theme/default/...,2019-03-30 00:00:00,TIRUNELVELI\r\nAlcoholics Anonymous and Al-Ano...,TIRUNELVELI\r\nAlcoholics Anonymous and Al-Ano...,https://www.thehindu.com/todays-paper/tp-natio...
4,CHENNAI TODAY,['To Run A.M. Until P.M.'],https://www.thehindu.com/static/theme/default/...,2019-03-29 01:44:55,"Meetings, Kevin School, Royapuram; Singaram Pi...",RELIGION\r\nRamayanam: Harikatha by Kalyanapur...,https://www.thehindu.com/news/cities/chennai/c...


In [3]:
# Dimensions of the DataFrame as a (rows, columns) tuple
df.shape

(11796, 7)

In [4]:
df['Meetings'].head() # first five values of the 'Meetings' column only

0    Meetings, Assemption School, St. Teresa Church...
1    Meetings, Community Service Centre, Balfour Rd...
2    Meeting; Ahana Hospital, Anna Bus Stand; Quali...
3    TIRUNELVELI\r\nAlcoholics Anonymous and Al-Ano...
4    Meetings, Kevin School, Royapuram; Singaram Pi...
Name: Meetings, dtype: object

In [5]:
len(df['Meetings']) # number of entries in the 'Meetings' column (one per DataFrame row)

11796

In [6]:
# Keep only the articles whose 'News' text contains "Meeti" (case-insensitive).
# na=False treats a missing 'News' value as "no match"; the trailing dropna()
# then discards any remaining row that has a missing value in any column.
mentions_meeting = df['News'].str.contains('Meeti', case=False, na=False)
meet_df = df.loc[mentions_meeting].dropna()
len(meet_df)

10590

In [7]:
# Remove the columns that are not needed for the meeting schedule, then save
# the reduced table without the row index.
# drop() is rebound instead of using inplace=True, and errors='ignore' lets the
# cell be re-run after the columns are already gone (the in-place version
# raised KeyError on a second run).
meet_df = meet_df.drop(columns=['Authors', 'Main_Image', 'News', 'Source_urls'], errors='ignore')
meet_df.to_csv('meetings.csv', index=False)

In [8]:
# Reload the reduced table from meetings.csv. `df` is rebound here and no
# longer refers to the full news table loaded at the top of the notebook.
# NOTE(review): the preview recorded for this cell shows Published_Date as
# "Sunday, March 31, 2019", while the raw CSV preview shows
# "2019-03-31 01:07:44" and no visible cell performs that conversion — confirm
# whether meetings.csv was modified outside this notebook.
df = pd.read_csv('meetings.csv')
df.head()

Unnamed: 0,Headline,Published_Date,Meetings
0,"Engagements for Sunday, March 31","Sunday, March 31, 2019","Meetings, Assemption School, St. Teresa Church..."
1,Chennai Today,"Saturday, March 30, 2019","Meetings, Community Service Centre, Balfour Rd..."
2,Madurai Today,"Saturday, March 30, 2019","Meeting; Ahana Hospital, Anna Bus Stand; Quali..."
3,engagements,"Saturday, March 30, 2019",TIRUNELVELI\r\r\nAlcoholics Anonymous and Al-A...
4,CHENNAI TODAY,"Friday, March 29, 2019","Meetings, Kevin School, Royapuram; Singaram Pi..."


In [9]:
# Extract every clock time mentioned in each meeting listing.
#
# A time is 1-2 hour digits, optionally ".minutes", an optional single space,
# and an am/pm marker written as am, am., a.m or a.m. (same for pm).
# Compared with the earlier five-way alternation this pattern:
#   * escapes the dots, so the marker no longer swallows an arbitrary trailing
#     character (e.g. "7 pm," used to be captured with the comma);
#   * uses look-arounds instead of consuming the surrounding characters, so a
#     time at the very start or end of the text is found, and two times
#     separated by a single space are both found;
#   * refuses to match inside a longer word (e.g. "7 ample").
TIME_PATTERN = re.compile(
    r'(?<!\S)'                   # start of text, or preceded by whitespace
    r'(\d{1,2}(?:\.\d{1,2})?'    # hour with optional ".minutes"
    r'\s?[ap]\.?m\.?)'           # optional space + am / a.m. / pm / p.m.
    r'(?!\w)'                    # marker must not continue into a word
)

# Non-string entries (e.g. NaN from an empty CSV field) yield an empty list
# instead of raising TypeError inside the regex call.
meets = [
    TIME_PATTERN.findall(item) if isinstance(item, str) else []
    for item in df['Meetings']
]

df['time'] = meets
df['time'].head()

0    [10.00 a.m., 11 a.m., 11.30 a.m., 6 p.m., 7 pm.]
1                                            [7 p.m.]
2                                            [7 p.m.]
3       [7 p.m., 5 p.m., 10.30 a.m., 5 p.m., 10 a.m.]
4                                            [7 p.m.]
Name: time, dtype: object

In [10]:
# Derive the weekday name of each publication date.
# The name is computed from the parsed date rather than taken as the first
# word of the string, so it is correct both for "Sunday, March 31, 2019" and
# for timestamp-style values such as "2019-03-31 01:07:44" (where the first
# word would be the date itself, not a weekday).
day = pd.to_datetime(df['Published_Date']).dt.day_name().tolist()
df['day'] = day
df['day'].head()

0      Sunday
1    Saturday
2    Saturday
3    Saturday
4      Friday
Name: day, dtype: object

In [11]:
# Parse the publication date into a datetime column.
# normalize() resets any time-of-day component to midnight, so the column stays
# date-only even when the source string is a full timestamp; it is a no-op for
# strings that contain only a date.
df['date'] = pd.to_datetime(df['Published_Date']).dt.normalize()
df['date'].head()

0   2019-03-31
1   2019-03-30
2   2019-03-30
3   2019-03-30
4   2019-03-29
Name: date, dtype: datetime64[ns]

In [12]:
# Build the final schedule table: keep the headline plus the three derived
# columns, and give the derived columns their presentation names.
column_labels = {'day': 'Day', 'date': 'Date', 'time': 'Timings'}
df = df[['Headline', 'day', 'date', 'time']].rename(columns=column_labels)
df.head()

Unnamed: 0,Headline,Day,Date,Timings
0,"Engagements for Sunday, March 31",Sunday,2019-03-31,"[10.00 a.m., 11 a.m., 11.30 a.m., 6 p.m., 7 pm.]"
1,Chennai Today,Saturday,2019-03-30,[7 p.m.]
2,Madurai Today,Saturday,2019-03-30,[7 p.m.]
3,engagements,Saturday,2019-03-30,"[7 p.m., 5 p.m., 10.30 a.m., 5 p.m., 10 a.m.]"
4,CHENNAI TODAY,Friday,2019-03-29,[7 p.m.]


In [13]:
# Write the final schedule to disk; index=True also stores the row index as the
# first CSV column.
# NOTE(review): 'Timings' holds Python lists, which are presumably written as
# their string representation — confirm how downstream readers parse them.
df.to_csv('Meeting_Schedule.csv', index=True)