## Data Pre-Processing for creating AA_Meetups Dataset

### The collected data goes through a series of steps during preprocessing

* Data Cleaning
* Data Integration
* Data Transformation

### Importing necessary Libraries

In [1]:
from datetime import datetime # Date customization
import itertools # iterators for efficient looping
import pandas as pd # primary data structure library
import re # match or find other strings or sets of strings
import ast # Abstract Syntax Trees

### Creating Empty lists for AA_Meetups data to be Filtered

In [2]:
# One independent, empty accumulator per output column of the AA_Meetups dataset.
locations = []
days = []
dates = []
years = []
meetings = []
timings = []

### Collected data ['The_Hindu.csv'] read into a pandas dataframe

In [3]:
# Load the collected articles; the path is relative to the notebook's working directory.
df = pd.read_csv('The_Hindu.csv') # Data read into a pandas dataframe
# Preview the first rows to confirm the columns were read as expected.
df.head()

Unnamed: 0,Headlines,Descriptions,Authors,Published_Dates,Articles,Source_URLs
0,"Engagements for Thursday, April 18","RELIGION Ramayanam: Akkaraikeni Srinidhi, Sri ...",,18-Apr-19,"RELIGIONRamayanam: Akkaraikeni Srinidhi, Sri R...",https://www.thehindu.com/news/cities/chennai/e...
1,"Madurai Today - April18, 2019",RELIGION Selva Vinayagar Temple: Chitra Pourna...,,18-Apr-19,RELIGIONSelva Vinayagar Temple:Chitra Pournami...,https://www.thehindu.com/todays-paper/tp-natio...
2,engagements,TIRUNELVELI Alcoholics Anonymous and Al-Anon: ...,,18-Apr-19,TIRUNELVELIAlcoholics Anonymous and Al-Anon:Re...,https://www.thehindu.com/todays-paper/tp-natio...
3,"Madurai Today - April18, 2019",RELIGIONSelva Vinayagar Temple: Chitra Pournam...,,17-Apr-19,RELIGIONSelva Vinayagar Temple: Chitra Pournam...,https://www.thehindu.com/events/madurai-today-...
4,Madurai Today,RELIGION Selva Vinayagar Temple: Pradosha puja...,Special Correspondent,17-Apr-19,"RELIGIONSelva Vinayagar Temple:Pradosha puja, ...",https://www.thehindu.com/todays-paper/tp-natio...


### size of initial dataframe (rows, columns)

In [4]:
# (rows, columns) of the dataframe as loaded, before any filtering.
df.shape

(11842, 6)

### Cleaning the data by dropping the missing values and creating a new dataframe consisting of Alcoholics Anonymous articles

In [5]:
# Boolean mask: True where the article text mentions "Alcoholics" in any casing.
# na=False makes rows with a missing article evaluate to False, so they are dropped.
mentions_aa = df['Articles'].str.contains('Alcoholics', case=False, na=False)

# Keep only the matching rows and preview them.
meet_df = df[mentions_aa]
meet_df.head()

Unnamed: 0,Headlines,Descriptions,Authors,Published_Dates,Articles,Source_URLs
0,"Engagements for Thursday, April 18","RELIGION Ramayanam: Akkaraikeni Srinidhi, Sri ...",,18-Apr-19,"RELIGIONRamayanam: Akkaraikeni Srinidhi, Sri R...",https://www.thehindu.com/news/cities/chennai/e...
1,"Madurai Today - April18, 2019",RELIGION Selva Vinayagar Temple: Chitra Pourna...,,18-Apr-19,RELIGIONSelva Vinayagar Temple:Chitra Pournami...,https://www.thehindu.com/todays-paper/tp-natio...
2,engagements,TIRUNELVELI Alcoholics Anonymous and Al-Anon: ...,,18-Apr-19,TIRUNELVELIAlcoholics Anonymous and Al-Anon:Re...,https://www.thehindu.com/todays-paper/tp-natio...
3,"Madurai Today - April18, 2019",RELIGIONSelva Vinayagar Temple: Chitra Pournam...,,17-Apr-19,RELIGIONSelva Vinayagar Temple: Chitra Pournam...,https://www.thehindu.com/events/madurai-today-...
4,Madurai Today,RELIGION Selva Vinayagar Temple: Pradosha puja...,Special Correspondent,17-Apr-19,"RELIGIONSelva Vinayagar Temple:Pradosha puja, ...",https://www.thehindu.com/todays-paper/tp-natio...


### size of filtered dataframe (rows, columns)

In [6]:
# (rows, columns) after keeping only the articles that mention "Alcoholics".
meet_df.shape

(11191, 6)

### Using regex, extracting the Alcoholics Anonymous Meetings from the Articles column

In [7]:
# Matches from the first "Alcoholics" up to the end of that line (MULTILINE
# makes `$` stop at each newline). The match is case-sensitive.
_AA_NOTICE_RE = re.compile("Alcoholics.*$", re.MULTILINE)


def _first_aa_notice(article):
    """Return the first "Alcoholics..." fragment found in `article`.

    Returns None when `article` is not a string or contains no match,
    instead of hiding every possible error behind a bare `except`.
    """
    if not isinstance(article, str):
        return None
    found = _AA_NOTICE_RE.search(article)
    return found.group(0) if found else None


# Rebuild the list from scratch so that re-running this cell never appends
# duplicates and the list stays aligned row-for-row with meet_df.
meetings = [_first_aa_notice(article) for article in meet_df['Articles']]

### Extracting the times from the Alcoholics Anonymous Meetings Schedule

In [8]:
# Five alternatives covering "dd.dd", "d", "dd", "d.dd" and "dd.d" followed by
# an am/pm marker; each alternative has its own capture group.
# NOTE(review): the dots inside `am.`, `a.m.` etc. are unescaped wildcards, so
# every match includes one extra trailing character (e.g. "6.30 p.mn"). The
# pattern is kept unchanged here so the generated dataset stays the same.
_TIME_RE = re.compile(r'\s(\d{2}\.\d{2}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{1}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{2}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{1}\.\d{2}\s?(?:am.|pm.|a.m.|p.m.))|\s(\d{2}\.\d{1}\s?(?:am.|pm.|a.m.|p.m.))')


def _extract_times(article):
    """Return every time string found in `article`, in order of appearance.

    Returns None when `article` is not a string (instead of relying on a bare
    `except`), and an empty list when the text contains no time.
    """
    if not isinstance(article, str):
        return None
    # findall yields one 5-tuple per match with a single non-empty group;
    # flatten the tuples and drop the empty placeholders.
    per_match_groups = _TIME_RE.findall(article)
    return [text for text in itertools.chain.from_iterable(per_match_groups) if text]


# Rebuild the list from scratch so that re-running this cell never appends
# duplicates and the list stays aligned row-for-row with meet_df.
timings = [_extract_times(article) for article in meet_df['Articles']]

### Deriving the published region from the Headlines column

In [9]:
# Ordered (keywords, label) rules: the first rule with a keyword contained in
# the lower-cased headline wins, so the order below is significant.
_REGION_RULES = (
    (('engagements',), 'engagements'),
    (('chennai',), 'chennai'),
    (('madurai',), 'madurai'),
    (('thiruvananthapuram',), 'thiruvananthapuram'),
    (('tirunelveli',), 'tirunelveli'),
    # 'caoimbatore' is a misspelling the original code also intended to accept.
    (('coimbatore', 'caoimbatore'), 'coimbatore'),
    (('hyderabad',), 'hyderabad'),
    (('new delhi',), 'new delhi'),
    (('tirupur',), 'tirupur'),
    (('tiruchi',), 'tiruchi'),
    (('vijayawada',), 'vijayawada'),
    (('dindugul',), 'dindugul'),
    (('kochi',), 'kochi'),
    (('nellai',), 'nellai'),
    (('kozhikode',), 'kozhikode'),
    # The keyword must be lower-case because it is compared with a lower-cased
    # headline; the emitted label keeps its original spelling.
    (('tuticorin',), 'Tuticorin'),
    (('mangalore',), 'mangalore'),
)


def _region_from_headline(headline):
    """Return the region label for `headline`, or None when no rule matches.

    Non-string headlines (e.g. missing values) also yield None.
    """
    if not isinstance(headline, str):
        return None
    lowered = headline.lower()
    for keywords, label in _REGION_RULES:
        # Each keyword is tested explicitly. The previous
        # `'coimbatore' or 'caoimbatore' in ...` was always true, so every
        # headline reaching that branch was labelled 'coimbatore' and the
        # rules after it could never run.
        if any(keyword in lowered for keyword in keywords):
            return label
    return None


# Rebuild the list from scratch so that re-running this cell never appends
# duplicates and the list stays aligned row-for-row with meet_df.
locations = [_region_from_headline(headline) for headline in meet_df['Headlines']]

### Extracting the day of the week on which the AA_Meetups were held

In [10]:
# Full weekday name (e.g. 'Thursday') for each publication date, parsed from
# the 'dd-Mon-yy' text format.
# The list is rebuilt from scratch so that re-running this cell never appends
# duplicates, which would break the equal-length requirement when the lists
# are combined into a dataframe.
days = [
    datetime.strptime(published, '%d-%b-%y').strftime('%A')
    for published in meet_df['Published_Dates']
]

### Extracting the date on which the AA_Meetups were held

In [11]:
# Start from an empty list, then add each publication date re-formatted from
# 'dd-Mon-yy' (e.g. 18-Apr-19) to 'dd-mm-yy' (e.g. 18-04-19).
dates = []
dates.extend(
    datetime.strptime(published, '%d-%b-%y').strftime('%d-%m-%y')
    for published in meet_df['Published_Dates']
)

### Extracting the year in which the AA_Meetups were held

In [12]:
# Four-digit year, as a string (e.g. '2019'), for each publication date parsed
# from the 'dd-Mon-yy' text format.
# The list is rebuilt from scratch so that re-running this cell never appends
# duplicates, which would break the equal-length requirement when the lists
# are combined into a dataframe.
years = [
    datetime.strptime(published, '%d-%b-%y').strftime('%Y')
    for published in meet_df['Published_Dates']
]

### Checking Array Length of each list to create DataFrame

In [13]:
# Print the six list lengths on one line; they must all be equal before the
# lists can be combined into a dataframe.
print(*map(len, (locations, days, dates, meetings, years, timings)))

11191 11191 11191 11191 11191 11191


### Creating a csv file after validating array length

In [14]:
# Pair each output column name with the list that holds its values; the order
# here is the column order of the resulting dataframe and CSV file.
column_names = ['Published_Region', 'Day', 'Published_Date', 'Year', 'Meetings', 'Timings']
column_values = [locations, days, dates, years, meetings, timings]

# NOTE: this rebinds `df`, which until now referred to the raw articles
# dataframe loaded from 'The_Hindu.csv'.
df = pd.DataFrame(dict(zip(column_names, column_values)))

# Persist the dataset without the index column, then preview the first rows.
df.to_csv('AA_Meets.csv', index=False)
df.head()

Unnamed: 0,Published_Region,Day,Published_Date,Year,Meetings,Timings
0,engagements,Thursday,18-04-19,2019,"Alcoholics Anonymous: Meetings, Church of Chri...","[6.30 p.mn, 7 p.m., 7 p.m., 7 p.m., 7 p.m., 7 ..."
1,madurai,Thursday,18-04-19,2019,"Alcoholics Anonymous:Meeting; Dheep Hospital, ...","[7 p.m., 5.45 a.m., 7 p.m., 6 a.m., 9 a.m., 5 ..."
2,engagements,Thursday,18-04-19,2019,Alcoholics Anonymous and Al-Anon:Regular meeti...,[7 p.m.]
3,madurai,Wednesday,17-04-19,2019,"Alcoholics Anonymous: Meeting; Dheep Hospital,...","[7 p.m., 5.45 a.m., 7 p.m., 6 a.m., 9 a.m., 5 ..."
4,madurai,Wednesday,17-04-19,2019,"Alcoholics Anonymous:Meeting; R. C. Church, Si...","[4.30 p.m., 4 a.m., 7.30 p.m., 9.50 a.m., 11 a..."
