In [1]:
import itertools # iterators for efficient looping
import pandas as pd # primary data structure library
import re # match or find other strings or sets of strings

In [2]:
df = pd.read_csv('The_Hindu_AA_Meetings.csv') # Data read into a pandas dataframe
df.head()

Unnamed: 0,Headline,Author,Published_Region,Published_Date,Meetings,Main_Article,Source_urls
0,Madurai Today,Special Correspondent,TAMIL NADU,"April 14, 2019",Anonymous: Meeting; U. C. Higher Secondary Sc...,RELIGION\r\nSelva Vinayagar Temple: Panchangam...,https://www.thehindu.com/todays-paper/tp-natio...
1,Madurai Today,Special Correspondent,TAMIL NADU,"April 13, 2019","Anonymous: Meeting; Ahana Hospital, Anna Bus ...",RELIGION\r\nMeenakshi Sundareswarar Temple: Ch...,https://www.thehindu.com/todays-paper/tp-natio...
2,Engagements,,TAMIL NADU,"April 13, 2019",,,https://www.thehindu.com/todays-paper/tp-natio...
3,Madurai Today,,TAMIL NADU,"April 12, 2019","Anonymous: Meeting; Tamilarasi School, Melur,...",RELIGION\r\nMeenakshi Sundareswarar Temple: Ch...,https://www.thehindu.com/todays-paper/tp-natio...
4,engagements,,TAMIL NADU,"April 12, 2019",,,https://www.thehindu.com/todays-paper/tp-natio...


In [3]:
# size of dataframe (rows, columns)
df.shape

(10616, 7)

In [4]:
df['Meetings'].head() # top 5 rows of the dataset 

0     Anonymous: Meeting; U. C. Higher Secondary Sc...
1     Anonymous: Meeting; Ahana Hospital, Anna Bus ...
2                                                  NaN
3     Anonymous: Meeting; Tamilarasi School, Melur,...
4                                                  NaN
Name: Meetings, dtype: object

In [5]:
len(df['Meetings']) # lenght of the dataframe

10616

In [8]:
meet_df = df[df['Meetings'].str.contains('Anonymous', case=False, na=False)].dropna() # filtering Meetings on the list of News  
len(meet_df)

831

In [9]:
# remove a few unnecessary columns
# in pandas axis=0 represents rows (default) and axis=1 represents columns.
meet_df.drop(['Author', 'Main_Article', 'Source_urls'], axis=1, inplace=True) 
meet_df.to_csv('meetings.csv', index=False)

In [10]:
# before running the script change the published date as below format using excel
df = pd.read_csv('meetings.csv')
df.head()

Unnamed: 0,Headline,Published_Region,Published_Date,Meetings
0,Madurai Today,TAMIL NADU,"Sunday, April 14, 2019",Anonymous: Meeting; U. C. Higher Secondary Sc...
1,Madurai Today,TAMIL NADU,"Saturday, April 13, 2019","Anonymous: Meeting; Ahana Hospital, Anna Bus ..."
2,Madurai Today,TAMIL NADU,"Wednesday, April 10, 2019","Anonymous: Meeting; R. C. Church, Sivaganga R..."
3,Madurai today for April 7,TAMIL NADU,"Sunday, April 7, 2019",Anonymous: Meeting; U. C. Higher Secondary Sc...
4,"Madurai Today - March 5, 2019",TAMIL NADU,"Friday, April 5, 2019","Anonymous: Meeting; Tamilarasi School, Melur,..."


In [11]:
# Filtering out the times from the Meetings Schedule
meets = []
for item in df['Meetings']:
    a = re.findall(r'\s(\d{2}\.\d{2}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{1}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{2}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{1}\.\d{2}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{2}\.\d{1}\s?(?:am.|pm.|a.m.|p.m.))', item)
    str_list = list(itertools.chain(*a))
    meets.append(list(filter(None, str_list)))
    
df['time'] = meets
df['time'].head()

0    [11 a.m., 6.30 p.m., 7 p.m.]
1                        [7 p.m.]
2             [6.30 p.m., 7 p.m.]
3    [11 a.m., 6.30 p.m., 7 p.m.]
4             [6.30 p.m., 7 p.m.]
Name: time, dtype: object

In [12]:
# Extract the Day of meetings held
day = [item.split()[0].replace(',', '') for item in df['Published_Date']]
df['day'] = day
df['day'].head()

0       Sunday
1     Saturday
2    Wednesday
3       Sunday
4       Friday
Name: day, dtype: object

In [13]:
# Filtering the Date of meetings held
df['date'] = pd.to_datetime(df['Published_Date'])
df['date'].head()

0   2019-04-14
1   2019-04-13
2   2019-04-10
3   2019-04-07
4   2019-04-05
Name: date, dtype: datetime64[ns]

In [14]:
# Creating new DataFrame from the Filtered_AA_Meeting_Schedules
df = pd.DataFrame({'Headline' : df['Headline'], 
                   'Day' : df['day'], 
                   'Published_Date' : df['date'],
                   'Published_Region' : df['Published_Region'], 
                   'Timings' : df['time'], 
                   'Meetings' : df['Meetings']})
df.head()

Unnamed: 0,Headline,Day,Published_Date,Published_Region,Timings,Meetings
0,Madurai Today,Sunday,2019-04-14,TAMIL NADU,"[11 a.m., 6.30 p.m., 7 p.m.]",Anonymous: Meeting; U. C. Higher Secondary Sc...
1,Madurai Today,Saturday,2019-04-13,TAMIL NADU,[7 p.m.],"Anonymous: Meeting; Ahana Hospital, Anna Bus ..."
2,Madurai Today,Wednesday,2019-04-10,TAMIL NADU,"[6.30 p.m., 7 p.m.]","Anonymous: Meeting; R. C. Church, Sivaganga R..."
3,Madurai today for April 7,Sunday,2019-04-07,TAMIL NADU,"[11 a.m., 6.30 p.m., 7 p.m.]",Anonymous: Meeting; U. C. Higher Secondary Sc...
4,"Madurai Today - March 5, 2019",Friday,2019-04-05,TAMIL NADU,"[6.30 p.m., 7 p.m.]","Anonymous: Meeting; Tamilarasi School, Melur,..."


In [15]:
df.to_csv('Meeting_Schedule.csv', index=False)