### Data Pre-Processing for creating AA_Meetups Dataset

#### The collected data undergoes through a series of steps during preprocessing

- Data Cleaning
- Data Integration
- Data Transformation

#### Importing necessary Libraries

In [1]:
from datetime import datetime # Date customization
import itertools # iterators for efficient looping
import pandas as pd # primary data structure library
import re # match or find other strings or sets of strings
import ast # Abstract Syntax Trees

### Collected data ['The_Hindu.csv'] read into a pandas dataframe

In [2]:
data = pd.read_csv('The_Hindu.csv')

In [3]:
data.head()

Unnamed: 0,Headlines,Descriptions,Authors,Published_Dates,Articles,Source_URLs
0,"Engagements for Thursday, April 18","RELIGION Ramayanam: Akkaraikeni Srinidhi, Sri ...",,18-Apr-19,"RELIGIONRamayanam: Akkaraikeni Srinidhi, Sri R...",https://www.thehindu.com/news/cities/chennai/e...
1,"Madurai Today - April18, 2019",RELIGION Selva Vinayagar Temple: Chitra Pourna...,,18-Apr-19,RELIGIONSelva Vinayagar Temple:Chitra Pournami...,https://www.thehindu.com/todays-paper/tp-natio...
2,engagements,TIRUNELVELI Alcoholics Anonymous and Al-Anon: ...,,18-Apr-19,TIRUNELVELIAlcoholics Anonymous and Al-Anon:Re...,https://www.thehindu.com/todays-paper/tp-natio...
3,"Madurai Today - April18, 2019",RELIGIONSelva Vinayagar Temple: Chitra Pournam...,,17-Apr-19,RELIGIONSelva Vinayagar Temple: Chitra Pournam...,https://www.thehindu.com/events/madurai-today-...
4,Madurai Today,RELIGION Selva Vinayagar Temple: Pradosha puja...,Special Correspondent,17-Apr-19,"RELIGIONSelva Vinayagar Temple:Pradosha puja, ...",https://www.thehindu.com/todays-paper/tp-natio...


In [4]:
data.shape

(11842, 6)

### Creating the Headline Constraints list accordingly to Locations and events

In [5]:
Constraints = ['Engage', 'Chennai', 'Madurai', 'Thiruvananthapuram', 'Tirunelveli', 'Caoimbatore', 'Coimbatore', 'Hyderabad', 
               'Hyderabas', 'New delhi', 'Tirupur', 'Tiruchi', 'Vijayawada', 'Dindugul', 'Kochi', 'Nellai', 'Kozhikode', 
               'Tuticorin', 'Mangalore', 'Bangalore', 'Bengaluru']

### Creating Empty lists for AA_Meetups data to be Filtered

In [6]:
articles, headlines, dates, locations, days, years, timings = [], [], [], [], [], [], []

### Itertate the loop to extract the Alcoholics keyword related articles. Using regex, extracting the Alcoholics Anonymous Meetings Schedule from the Articles column

In [7]:
time = r'\s(\d{2}\.\d{2}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{1}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{2}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{1}\.\d{2}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{2}\.\d{1}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{1}\s?(?:a.|p.))|\s(\d{2}\s?(?:a.|p.))|(\d{1}\s?(?:a.|p.))|(\d{2}\s?(?:a.|p.))|(\s\d{1}\.?(?:a.|p.))|(\s\d{2}\.?(?:a.|p.))'

In [8]:
for i in range(len(data)):
    for title in Constraints:
        if (title.lower() in data['Headlines'][i].lower() and 'Alcoholics' and 'Meeting' in data['Articles'][i]):
            start = data['Articles'][i].find('Alcoholics')
            end = data['Articles'][i].find('Please')
            articles.append(data['Articles'][i][start:end])
            locations.append(title)
            headlines.append(data['Headlines'][i])
            dates.append(data['Published_Dates'][i])
            x = datetime.strptime(data['Published_Dates'][i], '%d-%b-%y')
            days.append(x.strftime('%A'))
            years.append(x.strftime('%Y'))
            try:
                a = re.findall(time, data['Articles'][i])
                str_list = list(itertools.chain(*a))
                timings.append(list(filter(None, str_list)))
            except:
                timings.append(None)
            break

### Replace 'Engage' with 'Engagement' and 'Hyderabas' with 'Hyderabad'

In [9]:
for i, item in enumerate(locations):
    if 'Engage' == item:
        locations[i] = item.replace(item, 'Engagement')
    elif 'Hyderabas' == item:
        locations[i] = item.replace(item, 'Hyderabad')
    else:
        item

### Checking Array Length of each list to create DataFrame

In [10]:
print(len(articles), len(headlines), len(dates), len(locations), len(days), len(years), len(timings))

8526 8526 8526 8526 8526 8526 8526


### Creating a csv file after validating array length

In [11]:
hindu = pd.DataFrame({'Headlines' : headlines,
                      'Locations' : locations,
                      'Day' : days,
                      'Year' : years,
                      'Published_Dates' : dates,
                      'Total Schedules in Region' : timings,
                      'Articles' : articles})

In [12]:
hindu.to_csv('AA_Meets.csv', index=False)

In [13]:
hindu.head()

Unnamed: 0,Headlines,Locations,Day,Year,Published_Dates,Total Schedules in Region,Articles
0,"Engagements for Thursday, April 18",Engagement,Thursday,2019,18-Apr-19,"[6.30 p.mn, 7 p.m., 7 p.m., 7 p.m., 7 p.m., 7 ...","Alcoholics Anonymous: Meetings, Church of Chri..."
1,"Madurai Today - April18, 2019",Madurai,Thursday,2019,18-Apr-19,"[7 p.m., 5.45 a.m., 7 p.m., 6 a.m., 9 a.m., 5 ...","Alcoholics Anonymous:Meeting; Dheep Hospital, ..."
2,"Madurai Today - April18, 2019",Madurai,Wednesday,2019,17-Apr-19,"[7 p.m., 5.45 a.m., 7 p.m., 6 a.m., 9 a.m., 5 ...","Alcoholics Anonymous: Meeting; Dheep Hospital,..."
3,Madurai Today,Madurai,Wednesday,2019,17-Apr-19,"[4.30 p.m., 4 a.m., 7.30 p.m., 9.50 a.m., 11 a...","Alcoholics Anonymous:Meeting; R. C. Church, Si..."
4,Chennai Today,Chennai,Tuesday,2019,16-Apr-19,"[6.30 p.mn, 7 p.m., 7 p.m., 4.30 p.m., 6.30 p....","Alcoholics Anonymous: Meetings, Santhome HSS.,..."


In [14]:
hindu.shape

(8526, 7)