## RESOURCES

#### PACKAGES

In [38]:
# DATA
import pandas as pd
import numpy as np

# READING/WRITING
import json
import csv

# PARSING
from bs4 import BeautifulSoup, NavigableString, Tag
import lxml.html
import requests
from string import capwords
from titlecase import titlecase

# DATES/TIMES
from datetime import datetime
import pytz
from pytz import timezone
import time

# SYSTEM
import os, sys
from pathlib2 import Path
import ConfigParser
import multiprocessing as mp

# ERROR HANDLING
import errno
import traceback

# LOGGING
import logging
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

# MISC
import collections
import re
import copy
import random
from pprint import pprint
import uuid



# print 'ALL MODULES LOADED'

In [190]:
# Reproduction case for a page whose post header uses a relative date:
# fid_110_tid_706047_pg_1.html - ValueError: time data 'Today 12:35 PM' does not match format '%m-%d-%Y %I:%M %p'
# NOTE(review): load_html_file is defined further down in this file, so that
# definition has to be executed before this cell can run.

test_path = '../data/samples/'
test_fid = '110/'
test_file = 'fid_110_tid_706047_pg_1.html'

# Directory, forum sub-directory and file name are concatenated into one path.
html_page = load_html_file(test_path + test_fid + test_file)

## FUNCTIONS

### POSTS

#### GET HTML POSTS

In [39]:
def get_html_posts(html_page):
    """Collect the <tr> rows of every post table on a page.

    A post table is any <table> whose id matches the pattern 'post'.
    Returns one list of rows per table, in the order find_all() yields the
    tables.
    """
    rows_per_post = []

    for table in html_page.find_all('table', {'id': re.compile('post')}):
        # recursive=False restricts the search to the table's own children.
        rows_per_post.append(table.find_all('tr', recursive=False))

    return rows_per_post
    
# html_posts = get_html_posts(html_page)
# len(html_posts)

#### GET POST DATE AND TIME

In [40]:
def get_date_time(el):
    """Return the timestamp shown in a post's header row.

    el is the header row of a post; the text of its first <td class="thead">
    cell starts with the post number, followed by the date and time.

    Absolute dates ('%m-%d-%Y %I:%M %p') are treated as US/Mountain local
    time and returned as whatever convert_to_utc() produces.  Relative dates
    ('Today' / 'Yesterday') cannot be parsed with that format, so for those
    the cleaned header text is returned unchanged.
    """
    top_bar = el.find_all('td', {'class': 'thead'})[0]
    # Drop non-ASCII characters, then the '#' and ',' decoration.
    top_bar_text = top_bar.get_text().encode('ascii', 'ignore').decode('ascii')
    top_bar_text = top_bar_text.replace('#', '').replace(',', '')

    # Each word has to be tested on its own: ('Today' or 'Yesterday')
    # evaluates to just 'Today', so 'Yesterday' was never recognised.
    if any(word in top_bar_text for word in ('Today', 'Yesterday')):
        return top_bar_text

    # Everything after the first space is the date/time portion.
    local_datetime = datetime.strptime(top_bar_text.split(' ', 1)[1], '%m-%d-%Y %I:%M %p')

    return convert_to_utc(local_datetime, 'US/Mountain')

# get_date_time(html_posts[0][0]) # '2005-01-07 11:17:00 UTC+0000'

#### GET POST ID

In [41]:
def get_post_id(el):
    """Return the post id taken from the row's 'postcount...' anchor.

    The id is whatever follows the last 't' in the anchor's id attribute,
    e.g. 'postcount12345' -> '12345'.
    """
    first_anchor = el.find_all('a', {'id': re.compile('postcount')})[0]
    anchor_id = first_anchor['id']

    return anchor_id.rsplit('t', 1)[-1]

# get_post_id(html_posts[0][0])

#### GET POST NUM

In [42]:
def get_post_num(el):
    """Return the post number shown at the start of a post's header row."""
    header_cell = el.find_all('td', {'class': 'thead'})[0]
    ascii_text = header_cell.get_text().encode('ascii', 'ignore').decode('ascii')

    # Strip the '#' and ',' decoration before taking the first
    # space-delimited token.
    for junk in ('#', ','):
        ascii_text = ascii_text.replace(junk, '')

    return ascii_text.split(' ', 1)[0]

# get_post_num(html_posts[0][0]) #41

#### GET POST TEXT

In [43]:
def get_post_text(el):
    """Return a post's message text with its first nested <div> removed.

    el is the row holding the message; the first <div> whose id matches
    'post_message' is used.  Text fragments are joined with single spaces
    and stripped.
    """
    message = el.find_all('div', {'id': re.compile('post_message')})[0]

    try:
        # TO REMOVE QUOTE: the first nested <div> is discarded.
        message.div.decompose()
    except AttributeError:
        # No nested <div> (message.div is None), so nothing to strip.
        # LET'S EVENTUALLY LOG SOMETHING HERE
        pass

    return message.get_text(" ", strip=True)

# reg_trs = posts[0].find_all('tr', recursive=False)
# quote_trs = posts[8].find_all('tr', recursive=False)
# print 'NO QUOTE', '\n', get_post_message(reg_trs[2]), '\n' # "Met a boy, because if he...with a womans heart."
# print 'WITH QUOTE', '\n', get_post_message(quote_trs[2]) # "I am glad this information...which must be horrible"
# get_post_text(html_posts[0][2])

#### GET FORUM INFO

In [44]:
def get_forum_id(file_path):
    """Return the forum id embedded in a page file name.

    Page files are named like 'fid_1506_tid_97875_pg_4.html'; the forum id is
    the second underscore-separated field of the base name.
    """
    base_name = file_path.rsplit('/', 1)[-1]
    fields = base_name.split('_')

    return fields[1]

def get_thread_id(file_path):
    """Return the thread id embedded in a page file name.

    Page files are named like 'fid_1506_tid_97875_pg_4.html'; the thread id
    is the fourth underscore-separated field of the base name.
    """
    base_name = file_path.rsplit('/', 1)[-1]
    fields = base_name.split('_')

    return fields[3]

def get_page_number(file_path):
    """Return the page number embedded in a page file name.

    Page files are named like 'fid_1506_tid_97875_pg_4.html'; the page number
    is the last underscore-separated field, up to its first '.'.
    """
    base_name = file_path.rsplit('/', 1)[-1]
    last_field = base_name.rsplit('_', 1)[-1]

    return last_field.partition('.')[0]

# file_path = '../examples/threads/parsing/1506/fid_1506_tid_97875_pg_4.html'
# print 'FORUM:', get_forum_id(file_path), 'THREAD ID:', get_thread_id(file_path), 'PAGE:', get_page_number(file_path)

#### GET POST INFO

In [45]:
def get_post_info(file_name, post_el):
    """Build the ordered record for one post.

    file_name -- path or name of the page file; forum, thread and page ids
                 are read from it
    post_el   -- rows of one post: [0] header row, [1] user row,
                 [2] message row

    Returns an OrderedDict with the keys date, fid, tid, pg, pid, post_num,
    user, uid and text, in that order.
    """
    return collections.OrderedDict([
        ('date', get_date_time(post_el[0])),

        # Identifiers encoded in the file name.
        ('fid', get_forum_id(file_name)),
        ('tid', get_thread_id(file_name)),
        ('pg', get_page_number(file_name)),

        # Identifiers shown in the header row.
        ('pid', get_post_id(post_el[0])),
        ('post_num', get_post_num(post_el[0])),

        ('user', get_user_name(post_el[1])),
        ('uid', get_user_id(post_el[1])),

        ('text', get_post_text(post_el[2])),
    ])

# dir_path = '../examples/threads/parsing/1506/'
# file_name = 'fid_1506_tid_97875_pg_4.html'
# html_page = load_html_file(dir_path + file_name)
# html_posts = get_html_posts(html_page)

# post = get_post_info(file_name, html_posts[0])

# for key, val in post.items():
#     print key + ':', val

#### CONVERT TO UTC

In [46]:
def convert_to_utc(local_time, local_tz, string=True):
    """Convert a local datetime to UTC.

    local_time -- datetime to be localized to local_tz
    local_tz   -- time zone name understood by pytz, e.g. 'US/Mountain'
    string     -- when true (default) return the result formatted as
                  '%Y-%m-%d %H:%M:%S %Z%z'; otherwise return the datetime

    Example from the notebook: 02-17-2018 07:50 AM in 'US/Mountain' gives
    '2018-02-17 14:50:00 UTC+0000'.
    """
    aware_local = timezone(local_tz).localize(local_time)
    as_utc = aware_local.astimezone(pytz.utc)

    if not string:
        return as_utc

    return as_utc.strftime('%Y-%m-%d %H:%M:%S %Z%z')

# local_datetime = datetime.strptime('02-17-2018 07:50 AM', '%m-%d-%Y %I:%M %p')
# convert_to_utc(local_datetime, 'US/Mountain') # '2018-02-17 14:50:00 UTC+0000'

### USER

#### GET USERNAME

In [47]:
def get_user_name(el):
    """Return the text of the first <a class="bigusername"> link in el."""
    name_links = el.find_all('a', {'class': 'bigusername'})

    return name_links[0].get_text()

# user_name = get_user_name(trs[1])
# print user_name # maytayah

#### GET USER ID

In [48]:
def get_user_id(el):
    """Return the user id from the first <a class="bigusername"> link in el.

    The id is whatever follows the last '=' in the link's href,
    e.g. 'member.php?u=380798' -> '380798'.
    """
    profile_link = el.find_all('a', {'class': 'bigusername'})[0]
    href = profile_link['href']

    return href.rsplit('=', 1)[-1]

# user_id = get_user_id(trs[1])
# print user_id # 380798

#### GET USER ROLE

In [49]:
def get_user_role(el):
    """Return the role implied by the font color of the user's name link.

    Example markup:
    <a class="bigusername" href="member.php?u=380798"><font color="purple"><b>maytayah</b></font></a>

    A link without a <font> child is treated as color 'black' ('user').
    A color that is not in the table raises KeyError.
    """
    role_by_color = {
        'red': 'admin',
        'blue': 'super',
        'purple': 'site',
        'darkgreen': 'mod',
        'magenta': 'leave',
        'MediumTurquoise': 'card',
        'black': 'user',
    }

    try:
        color = el.find_all('a', {'class': 'bigusername'})[0].font['color']
    except TypeError:
        # if user is simply 'registered' then it won't have <font> element,
        # so .font is None and subscripting it raises TypeError.
        color = u'black'

    return role_by_color[color]

# # Administrator: red
# # PTO Super Moderator: blue
# # PTO Site Moderator: purple
# # PTO Moderator: darkgreen
# # Moderator On Leave: magenta
# # PTO Card Swap Host: MediumTurquoise
# # User: black

# reg_trs = posts[0].find_all('tr', recursive=False)
# mod_trs = posts[6].find_all('tr', recursive=False)

# print 'USER ONE:', get_user_role(reg_trs[1]), '\n', 'USER TWO:', get_user_role(mod_trs[1])

#### GET USER ROLE DESCRIPTION

In [50]:
def get_user_role_desc(el):
    """Return the text of the first <div class="smallfont"> inside el."""
    small_divs = el.find_all('div', {'class': 'smallfont'})
    first_div = small_divs[0]

    return first_div.text

# # WILL INCLUDE BANNED AND ACCOUNT CLOSED
# reg_trs = posts[0].find_all('tr', recursive=False)
# mod_trs = posts[6].find_all('tr', recursive=False)

# print 'USER ONE:', get_user_role_desc(reg_trs[1]), '\n', 'USER TWO:', get_user_role_desc(mod_trs[1])

#### GET JOIN DATE

In [51]:
def get_join_date(el):
    """Return the user's join date, or None when el has no 'Join' text.

    The value is the second ':'-separated field of the first text node
    matching 'Join', with surrounding whitespace removed.
    """
    label = el.find(text=re.compile('Join'))

    if not label:
        return None

    return label.split(':')[1].strip()
    
# print 'JOIN:', get_join_date(html_posts[30][1])

#### GET LOCATION

In [52]:
def get_location(el):
    """Return the user's location, or None when el has no 'Location' text.

    The value is everything after the first ':' of the first text node
    matching 'Location', with surrounding whitespace removed.
    """
    text = el.find(text=re.compile('Location'))

    if not text:
        return None

    # Split only on the first ':' so a location that itself contains a
    # colon is kept whole instead of being cut at the second colon.
    return text.split(':', 1)[1].strip()

# print 'LOCATION:', get_location(html_posts[30][1])

#### GET NUM POSTS

In [53]:
def get_num_posts(el):
    """Return the user's post count as a string, or None if not shown.

    The value is the second ':'-separated field of the first text node
    matching 'Posts', with surrounding whitespace removed.
    """
    label = el.find(text=re.compile('Posts'))

    return label.split(':')[1].strip() if label else None

# print 'POSTS:', get_num_posts(html_posts[30][1])

#### GET NUM THANKS

In [54]:
def get_num_thanks(el):
    """Return the 'Thanks' count as a string, or None if not shown.

    The value is the second ':'-separated field of the first text node
    matching 'Thanks', with surrounding whitespace removed.
    """
    label = el.find(text=re.compile('Thanks'))

    if label:
        return label.split(':')[1].strip()

    return None

# print 'THANKS:', get_num_thanks(html_posts[30][1])

#### GET NUM THANKED

In [55]:
def get_num_thanked(el):
    """Return how many times the user was thanked, or None if not shown.

    Takes the second space-separated word of the first text node matching
    'Thanked' (the notebook's example output reads
    'THANKED <n> TIMES IN <m> POSTS').
    """
    label = el.find(text=re.compile('Thanked'))

    if not label:
        return None

    words = label.split(' ')

    return words[1].strip()

def get_num_posts_thanked(el):
    """Return in how many posts the user was thanked, or None if not shown.

    Takes the fifth space-separated word of the first text node matching
    'Thanked' (the notebook's example output reads
    'THANKED <n> TIMES IN <m> POSTS').
    """
    label = el.find(text=re.compile('Thanked'))

    if not label:
        return None

    words = label.split(' ')

    return words[4].strip()

# print 'THANKED', get_num_thanked(html_posts[30][1]), 'TIMES IN', get_num_posts_thanked(html_posts[30][1]), 'POSTS'

#### GET USER INFO

In [56]:
def get_user_info(el):
    """Build the ordered user record for one post.

    el holds the rows of one post; every field is read from el[1], the
    user row.  Returns an OrderedDict with the keys user, uid, role,
    role_desc, join_date, location, posts, thanks, thanked and
    posts_thanked, in that order.
    """
    user_row = el[1]

    # (output key, extractor) pairs, applied in order.
    extractors = [
        ('user', get_user_name),
        ('uid', get_user_id),
        ('role', get_user_role),
        ('role_desc', get_user_role_desc),
        ('join_date', get_join_date),
        ('location', get_location),
        ('posts', get_num_posts),
        ('thanks', get_num_thanks),
        ('thanked', get_num_thanked),
        ('posts_thanked', get_num_posts_thanked),
    ]

    user = collections.OrderedDict()
    for key, extract in extractors:
        user[key] = extract(user_row)

    return user

In [57]:
# html_page = load_html_file(dir_path + '/39/fid_39_tid_81190_pg_10.html')
# html_posts = get_html_posts(html_page)
# post_nums = [29, 30, 31]

# for post_num in post_nums:    
    
#     user = get_user_info(html_posts[post_num])
    
#     for key, val in user.items():
#         print key + ':', val
#     print ''

### THANK YOU'S

#### GET HTML THANK YOUS

In [58]:
def get_html_thank_yous(html_page):
    """Return every element whose id matches the pattern 'post_thanks_box'."""
    box_id = re.compile('post_thanks_box')
    thanks_boxes = html_page.find_all(id=box_id)

    return thanks_boxes
    
# thanks_none = get_html_thank_yous(html_file_none)
# thanks_many = get_html_thank_yous(html_file_many)
# len(thanks_none), len(thanks_many)

#### GET THANKS LINKS

In [59]:
def get_thanks_links(el):
    """Return all <a> elements found inside el."""
    links = el.find_all('a')

    return links

# no_links = [get_thanks_links(thanks) for thanks in thanks_none]
# links = [get_thanks_links(thanks) for thanks in thanks_many]

# len(no_links), len(links)

#### GET THANKED POST ID

In [60]:
def get_thanked_post_id(el):
    """Return the part of el's id attribute after its last underscore."""
    box_id = el['id']

    return box_id.rsplit('_', 1)[-1]

# pid_none = get_thanked_post_id(thanks_none[0])
# pid_many = get_thanked_post_id(thanks_many[0])
# pid_none, pid_many

#### GET DATE OF THANKS

In [61]:
def get_date_of_thanks(el):
    """Return the date text that follows a thanks link.

    el is a thanks link; its next sibling is expected to be text such as
    '(02-27-2015),'.  Non-ASCII characters, parentheses and commas are
    removed.
    """
    # <a>...</a>(02-27-2015),</div>
    trailing_text = el.next_sibling
    # u'\xa0(05-13-2009),' -> '(05-13-2009),'
    ascii_only = trailing_text.encode('ascii', 'ignore').decode('ascii')

    # '(05-13-2009),' -> '05-13-2009'
    return re.sub('[(),]', '', ascii_only)

# get_date_of_thanks(links[0])

#### GET THANKER USER ID

In [62]:
def get_thanker_user_id(el):
    """Return the part of the link's href after its last '='."""
    href = el['href']

    return href.rsplit('=', 1)[-1]

# get_thanker_user_id(links[0])

#### GET THANKER USER NAME

In [63]:
def get_thanker_user_name(el):
    """Return the text of a thanks link."""
    user_name = el.text

    return user_name

# get_thanker_user_name(links[0])

#### GET THANK YOU INFO

In [64]:
def get_thank_you_info(thanks_el, link_el):
    """Build the ordered record for one thank-you.

    thanks_el -- the thanks box of a post; supplies the thanked post id
    link_el   -- one link inside that box; supplies the date and the
                 thanking user's id and name

    Returns an OrderedDict with the keys pid, date, from_uid and from_user,
    or None when link_el is falsy.
    """
    if not link_el:
        return None

    return collections.OrderedDict([
        ('pid', get_thanked_post_id(thanks_el)),
        ('date', get_date_of_thanks(link_el)),
        ('from_uid', get_thanker_user_id(link_el)),
        ('from_user', get_thanker_user_name(link_el)),
    ])

## FILE HANDLING

In [71]:
# forum_dicts = []

# skip = ['logs', 'csv']
# directories = get_forum_directories('../examples/threads/parsing/', skip)

# # forum_file_dicts = []
# # forum_file_dicts = forum_file_dicts + [get_forum_file_dicts(directory, 3) for directory in directories]
# # sigh = [file_dict[0] ]
# # forum_file_dicts = forum_file_dicts + [get_forum_file_dicts(directory, 3) for directory in directories]
    
# # forum_dicts = [forum_dict for forum_dict in forum_file_dicts]
                             
# #                              get_forum_file_dicts(directory, list_size)
# #                              for directory in directories]

# # forum_dicts


# # for forum_file_dict in forum_file_dicts: # for x in non_flat
# # #     print forum_file_dict
# #     for file_dict in forum_file_dict: # for y in x
# #         print len(file_dict['files']) # y

# # forum_dicts = [file_dict for forum_file_dict in forum_file_dicts for file_dict in forum_file_dict]

# forum_dicts = [file_dict for forum_file_dict in 
#               [get_forum_file_dicts(directory, 3) for directory in directories] 
#                for file_dict in forum_file_dict]



# forum_dicts

#### GET FORUM DIRECTORIES

In [65]:
def get_forum_directories(dir_path, skip=None):
    """List the sub-directories of dir_path, leaving out the skipped ones.

    dir_path -- directory to scan (immediate children only)
    skip     -- optional collection of directory stems to exclude

    Returns the matching directory paths as strings, in the order iterdir()
    yields them.
    """
    # None instead of a mutable [] default, which would be shared by every
    # call of the function.
    skip = () if skip is None else skip

    return [str(path) for path in Path(dir_path).iterdir()
            if path.is_dir() and path.stem not in skip]

# skip = ['logs', 'csv']
# get_forum_directories('../examples/threads/parsing/', skip)

In [66]:
def get_forum_files(forum_path):
    """Return the paths of all '*.html' files under forum_path as strings.

    rglob() is used, so sub-directories are searched as well.
    """
    html_paths = []

    for html_file in Path(forum_path).rglob('*.html'):
        html_paths.append(str(html_file))

    return html_paths


# test_path = '../examples/threads/parsing/1506'
# get_forum_files(test_path)

#### GET FORUM FILE DICTS

In [67]:
def get_forum_file_dicts(forum_path, list_size):
    """Split a forum's HTML files into chunks, one dict per chunk.

    forum_path -- directory holding the forum's saved pages; its last path
                  component is used as the forum id
    list_size  -- passed through to split_file_list() to control chunking

    Returns a list of {'fid': <forum id>, 'files': <chunk>} dicts.
    """
    all_files = get_forum_files(forum_path)

    # Honour the caller's list_size; it used to be ignored in favour of a
    # hard-coded 3.
    # NOTE(review): split_file_list is not defined in the visible source
    # (only split_list is) - confirm which helper is meant.
    chunks = split_file_list(all_files, list_size)

    # Use the directory's own name instead of assuming the path is exactly
    # five components deep (the old split('/')[4]).
    forum_id = os.path.basename(os.path.normpath(forum_path))

    return [{'fid': forum_id, 'files': chunk} for chunk in chunks]

# htmlpath = '../examples/threads/parsing/1506'
# size = 3
# forum_file_dicts = get_forum_file_dicts(htmlpath, size)
# for forum_dict in forum_file_dicts:
#     print forum_dict['fid'] + ':', len(forum_dict['files'])

#### GET FORUM FILE DICT

In [68]:
def get_forum_file_dict(dir_path):
    """Describe one forum directory as {'fid': ..., 'files': [...]}.

    'fid' is the stem of dir_path; 'files' lists every '*.html' file found
    under it (sub-directories included) as strings.
    """
    forum_dir = Path(dir_path)
    html_files = [str(html_file) for html_file in forum_dir.rglob('*.html')]

    return {'fid': forum_dir.stem, 'files': html_files}

#### LOAD HTML FILE

In [69]:
def load_html_file(file_path):
    """Read an HTML file and return it parsed, with line breaks and tabs removed.

    The markup is parsed once, serialised back to a string, stripped of
    every newline, tab and carriage-return character, and parsed again
    with 'html.parser'.  Returns the resulting BeautifulSoup object.
    """
    # Context manager so the file handle is closed even if read() raises;
    # the bare open(...).read() never closed it explicitly.
    with open(file_path, 'r') as html_file:
        html_page = html_file.read()

    posts_page = str(BeautifulSoup(html_page, 'html.parser'))
    no_literals = posts_page.replace('\n', '').replace('\t', '').replace('\r', '')

    return BeautifulSoup(no_literals, 'html.parser')
    
# thread_path = '../examples/parsing/threads/'
# forum_id = '412/'
# html_file = 'fid_412_tid_102642_pg_1.html'

# html_page = load_html_file(thread_path + forum_id + html_file)

#### CREATE FORUM LOGGER

In [70]:
def create_forum_logger(file_path):
    """Configure logging to a new timestamped file and return the root logger.

    file_path -- directory in which 'PT_<dd_mm_YYYY_II_MM_AM/PM>.log' is
                 created

    Records are written at INFO level and above as
    '<timestamp> <LEVEL>: <message>'.
    """
    stamp = datetime.today().strftime('%d_%m_%Y_%I_%M_%p')
    log_path = file_path + '/' + 'PT_' + stamp + '.log'

    # NOTE(review): presumably the module is reloaded so basicConfig() is
    # applied again on a re-run of the cell - confirm.
    reload(logging)
    logging.basicConfig(filename=log_path,
                        level=logging.INFO,
                        format='%(asctime)s %(levelname)s: %(message)s',
                        datefmt='%d/%m/%Y %I:%M:%S %p')
    logging.Formatter.converter = time.gmtime # CHANGE TO UTC TIME

    return logging.getLogger()

# logger = create_forum_logger('../examples/threads/parsing/logs')
# html_file = '../examples/threads/parsing/39/fid_39_tid_81190_pg_10.html'.split('/')[-1]

# try:
#     1/0
# except Exception as err:

#     logger.error(html_file + ' - ' + err.__class__.__name__ + ': ' + str(err))
#     print html_file + ' - ' + err.__class__.__name__ + ': ' + str(err)
# # 25/08/2018 01:53:50 AM ERROR: fid_39_tid_81190_pg_10.html - ZeroDivisionError: integer division or modulo by zero

#### PRINT FINAL RESULTS

In [71]:
def print_final_results(thread_dfs, thread_data, headers, rows=5):
    """Print a size summary and a preview of the posts, users and thanks frames.

    thread_dfs  -- data frames indexed as [0] posts, [1] users, [2] thanks
    thread_data -- the row lists the frames were built from (only their
                   lengths are printed)
    headers     -- header dicts; len(header['headers']) is printed as COLS
    rows        -- number of leading rows to preview; a falsy value skips
                   the preview
    """
    
    # One summary line per (frame, data, header) triple.
    for thread_df, data, header in zip(thread_dfs, thread_data, headers):
        print 'ROWS:', len(data), 'COLS:', len(header['headers']), 'SHAPE:', thread_df.shape
    print ''
    
    if rows:

        # Posts preview.
        print thread_dfs[0][['fid', 'tid', 'pg', 'pid']].head(rows)
    #     print thread_dfs[0][['fid', 'tid', 'pg', 'pid']].tail(rows)

        # Users preview.
        print ''
        print thread_dfs[1][['user','join_date', 'location', 'posts']].head(rows)
    #     print thread_dfs[1][['user','join_date', 'location', 'posts']].tail(rows)

        # Thanks preview.
        print ''
        print thread_dfs[2][['date', 'pid', 'from_uid', 'from_user']].head(rows)
    #     print thread_dfs[2][['date', 'pid', 'from_uid', 'from_user']].tail(rows)
        print ''

#### CREATE CSV FILENAME

In [127]:
def create_csv_file_path(html_file_path, out_path, data_type):
    """Return the CSV path for one page and data type.

    The last five characters of the HTML base name (the '.html' suffix) are
    dropped and '_<data_type>.csv' is appended; the file goes under
    '<out_path>pages/<data_type>/'.

    e.g. '.../fid_1506_tid_97875_pg_4.html' with data_type 'posts'
         -> '<out_path>pages/posts/fid_1506_tid_97875_pg_4_posts.csv'
    """
    base_name = html_file_path.rsplit('/', 1)[-1]
    stem = base_name[:-5]
    target_dir = out_path + 'pages/' + data_type + '/'

    return target_dir + stem + '_' + data_type + '.csv'

# hfp = '../data/forums/1506/fid_1506_tid_97875_pg_4.html'
# op = '../data/csv/'

# hfp = '../examples/threads/parsing/forums/39/fid_39_tid_81190_pg_2.html'
# op = '../examples/threads/parsing/csv/'
# create_csv_file_path(hfp, op, 'posts') # '../examples/threads/parsing/csv/pages/posts/fid_39_tid_81190_pg_2_posts.csv'

#### WRITE PAGE DATA TO CSV

In [73]:
# MULTIPROCESSING VERSION
def write_page_data_to_csv(thread_data, headers, html_file_path, out_path):
    """Write each data set of one page to its own CSV file.

    thread_data    -- row lists, paired with headers by position
    headers        -- dicts with 'type' (data type name) and 'headers'
                      (column names), in the same order as thread_data
    html_file_path -- source page; determines the CSV file names
    out_path       -- root directory passed on to create_csv_file_path()
    """
    for rows, spec in zip(thread_data, headers):
        target = create_csv_file_path(html_file_path, out_path, spec['type'])
        frame = pd.DataFrame(rows, columns=spec['headers'])
        frame.to_csv(target, index=False, encoding='utf-8')
                    
#         print file_path, df.shape

# TODO: MAKE TESTS THAT DON'T RELY ON PREVIOUSLY RUN CODE
# posts_headers = {'type': 'posts', 'headers': ['date', 'fid', 'tid', 'pg', 'post_num', 'pid', 'user', 'uid', 'text']}
# users_headers = {'type': 'users', 'headers': ['user', 'uid', 'role', 'role_desc', 'join_date', 'location', 'posts', 'thanks', 'posts_thanked']}
# thanks_headers = {'type': 'thanks', 'headers': ['date', 'pid', 'from_uid', 'from_user']}

# thread_data = [posts, users, thank_yous]
# headers = [posts_headers, users_headers, thanks_headers]
# html_file_path = '../examples/threads/parsing/csv/pages/posts/fid_1506_tid_97875_pg_4_posts.csv'
# out_path = '../examples/threads/parsing/'

# write_page_data_to_csv(thread_data, headers, forum['fid'], out_path)

#### SPLIT LIST

In [125]:
# https://stackoverflow.com/a/43106405/6023530

def split_list(full_list, num_lists):
    """Split full_list into num_lists lists using numpy.array_split."""
    parts = np.array_split(full_list, num_lists)

    return list(map(lambda part: part.tolist(), parts))
      
# first_names = ['Steve', 'Jane', 'Sara', 'Mary','Jack','Bob', 'Bily', 'Boni', 'Chris','Sori', 'Will', 'Won','Li']
# split_list(first_names, 4)

## PARSING

#### PARSE THREADS

In [128]:
def parse_threads(html_file_path):
    """Parse one saved thread page and write its posts, users and thanks to CSV.

    Returns {'posts': n, 'users': n, 'thanks': n} with the number of records
    extracted.  Any exception is caught and reported on stdout as
    '<file name> - <ExceptionClass>: <message>'; the function then returns
    None for that page.
    """
    try:

        out_path = '../examples/threads/parsing/csv/'
#         out_path = '../data/csv/'

        # Column layout of each CSV; the order must match thread_data below.
        # NOTE(review): get_user_info() also produces a 'thanked' value that
        # is not listed for users, so it does not reach the users CSV -
        # confirm whether that is intended.
        headers = [
            {'type': 'posts', 'headers': ['date', 'fid', 'tid', 'pg', 'post_num', 'pid', 'user', 'uid', 'text']},
            {'type': 'users', 'headers': ['user', 'uid', 'role', 'role_desc', 'join_date', 'location', 'posts', 'thanks', 'posts_thanked']},
            {'type': 'thanks', 'headers': ['date', 'pid', 'from_uid', 'from_user']},
        ]

        html_page = load_html_file(html_file_path)
        html_posts = get_html_posts(html_page)

        posts = [get_post_info(html_file_path, html_post) for html_post in html_posts]
        users = [get_user_info(html_post) for html_post in html_posts]

        # Collect every box's links first, then build one record per link.
        html_thank_yous = get_html_thank_yous(html_page)
        links_per_box = [get_thanks_links(html_thank_you) for html_thank_you in html_thank_yous]

        thank_yous = []
        for html_thank_you, box_links in zip(html_thank_yous, links_per_box):
            for thank_you_link in box_links:
                thank_yous.append(get_thank_you_info(html_thank_you, thank_you_link))

        write_page_data_to_csv([posts, users, thank_yous], headers, html_file_path, out_path)

        return {'posts': len(posts), 'users': len(users), 'thanks': len(thank_yous)}

    except Exception as err:

        # Best effort: report the failing page and carry on.
        file_name = html_file_path.split('/')[-1]
        print(file_name + ' - ' + err.__class__.__name__ + ': ' + str(err))
        
# TODO: RIGHT NOW THIS IS DEPENDENT ON THE HEADERS AND THREAD DATA BEING IN THE SAME ORDER.  SHOULD PROBABLY MAKE BOTH INTO DICTIONARIES
# SO THAT A KEY CAN BE USED TO MATCH EACH DATA TYPE TO ITS DATA HEADER.

#### RUN PARSER

In [None]:
# TODO's
# Figure out how to procure the UTC for 'Today' and 'Yesterday'

In [155]:
import parmap
import tqdm
from random import sample
from collections import Counter

# Number of worker processes handed to parmap.
processors = 1

forums_dir = '../examples/threads/parsing/'
skip = ['logs', 'csv', 'error']

forum_paths = get_forum_directories(forums_dir, skip)

# Map each forum directory name to the '*.html' files directly inside it
# (glob, not rglob, so sub-directories are not searched).
pages_by_forum = {forum_path.split('/')[-1] : [str(html_file) for html_file 
                                               in Path(forum_path).glob('*.html')] 
                                               for forum_path in forum_paths}
# Running sum of the Counters returned by report_totals().
# NOTE(review): report_totals is defined in a later cell of this file and
# has to be executed before this loop runs.
totals = Counter()

for forum, html_file_paths in pages_by_forum.items():
    
    print 'FORUM:', forum, '\n'
    
    print len(html_file_paths), 'HTML FILES WILL NOW BE PROCESSED BY', processors, 'PROCESSOR(S)' # SHOULD BE 1679
    
    # One parse_threads() call per page; the result list has one entry per file.
    processed = parmap.map(parse_threads, html_file_paths, pm_processes=processors, pm_pbar=True)
    
    print len(processed), 'HTML FILES WERE PROCESSED BY', processors, 'PROCESSOR(S)', '\n'
    
    totals += report_totals(processed)
    
    print ''
    
print 'TOTAL LENGTH OF CSV\'s:', '\t', 'POSTS:', totals['posts'], '\t', 'USERS', totals['users'], '\t', 'THANKS:', totals['thanks']

In [153]:
def report_totals(results):
    """Summarise the per-page counts returned by parse_threads().

    results -- iterable of {'posts': n, 'users': n, 'thanks': n} dicts;
               None entries (pages parse_threads() failed on) are skipped

    Prints the number of records processed and the number of CSV lines
    expected (records plus one header line per counted page), and returns
    the expected CSV line counts as a Counter keyed by 'posts', 'users' and
    'thanks'.
    """
    # parse_threads() returns None for a page that raised; indexing such an
    # entry would raise TypeError, so failed pages are left out of the count.
    parsed_pages = [parsed for parsed in results if parsed is not None]
    num_files = len(parsed_pages)

    total_posts = sum(parsed['posts'] for parsed in parsed_pages)
    total_users = sum(parsed['users'] for parsed in parsed_pages)
    total_thanks = sum(parsed['thanks'] for parsed in parsed_pages)

    # Each counted page contributes one header line to each of its CSVs.
    csv_posts_lines = total_posts + num_files
    csv_users_lines = total_users + num_files
    csv_thanks_lines = total_thanks + num_files

    total_processed = total_posts + total_users + total_thanks
    total_csv_lines = csv_posts_lines + csv_users_lines + csv_thanks_lines

    # Each line is built as one string so print() emits the same text under
    # Python 2 and Python 3.
    print('%s PROCESSED ---> \tPOSTS: %s \tUSERS %s \tTHANKS: %s'
          % (total_processed, total_posts, total_users, total_thanks))
    print('%s CSV LINES ---> \tPOSTS: %s \tUSERS %s \tTHANKS: %s'
          % (total_csv_lines, csv_posts_lines, csv_users_lines, csv_thanks_lines))

    return Counter({'posts': csv_posts_lines, 'users': csv_users_lines, 'thanks': csv_thanks_lines })
    
# report_totals(processed)

In [None]:
# # if __name__ == '__main__':
import parmap
import tqdm
from random import sample

# Number of worker processes handed to parmap.
processors = 1

forums_path = '../examples/threads/parsing/'
skip = ['logs', 'csv', 'error']

# forums_path = '../data/samples/'
# skip = ['restricted', 'logs'] # LOGS HAS 43 FILES, RESTRICTED 2

dir_paths = get_forum_directories(forums_path, skip)
# Every '*.html' file below any forum directory (rglob searches recursively).
html_file_paths = [str(html_file) for dir_path in dir_paths for html_file in Path(dir_path).rglob('*.html')]
# Random subset of 100 pages; random.sample raises ValueError when fewer
# than 100 files were found.
test_files = sample(html_file_paths, 100)

# TODO: MAKE PROCESSED LIST ACTUALLY CONTAIN SOMETHING INSTEAD OF NONE
print len(test_files), 'HTML FILES WILL NOW BE PROCESSED BY', processors, 'PROCESSOR(S)' # SHOULD BE 1679

processed = parmap.map(parse_threads, test_files, pm_processes=processors, pm_pbar=True)

print len(processed), 'HTML FILES WERE PROCESSED BY', processors, 'PROCESSOR(S)', '\n'
# report_totals(processed)

100 HTML FILES WILL NOW BE PROCESSED BY 1 PROCESSOR(S)


#### COMBINE CSV'S BY DATA TYPE

In [88]:
# Combine the per-page CSVs of each data type into one master CSV.
root_path = '../data/csv/'

pages_dir = root_path + 'pages/'     # input: one sub-directory per data type
master_dir = root_path + 'masters/'  # output: one combined CSV per data type

data_types = ['posts', 'users', 'thanks']

# data type -> paths of every entry in its pages sub-directory
csv_pages = {data_type: [str(csv_page_file) for csv_page_file in Path(pages_dir + data_type).iterdir()] for data_type in data_types }

for data_type, files in csv_pages.items():
    
    print 'DATA TYPE:', data_type
    
    # Read every page CSV, then stack them with a fresh 0..n-1 index.
    data_frames = parmap.map(pd.read_csv, files, pm_processes=1, pm_pbar=True)
    data_frame = pd.concat(data_frames, ignore_index=True)
    
    csv_file_path = master_dir + 'prison_talk_' + data_type + '.csv'
    data_frame.to_csv(csv_file_path, index=False, encoding='utf-8')
    
    # Sanity check: the combined frame should have as many rows as the
    # individual frames together.
    sum_frame_rows = sum([frame.shape[0] for frame in data_frames])
    print 'NUM FRAMES:', len(data_frames), 'SUM OF ROWS IN FRAMES:', sum_frame_rows, 'NUM ROWS IN CONCAT FRAMES:', data_frame.shape[0], '\n'

DATA TYPE: posts
NUM FRAMES: 40 SUM OF ROWS IN FRAMES: 1009 NUM ROWS IN CONCAT FRAMES: 1009 

DATA TYPE: users
NUM FRAMES: 40 SUM OF ROWS IN FRAMES: 1009 NUM ROWS IN CONCAT FRAMES: 1009 

DATA TYPE: thanks
NUM FRAMES: 40 SUM OF ROWS IN FRAMES: 446 NUM ROWS IN CONCAT FRAMES: 446 



#### SPLIT ALL POSTS CSV INTO FORUMS

In [111]:
# Split the combined posts CSV into one CSV per forum id.
# Relies on master_dir and root_path from the combine cell above.
df = pd.read_csv(master_dir + 'prison_talk_posts.csv')
for fid, group in df.groupby(['fid']):
    
    # One output file per distinct value of the 'fid' column.
    forum_csv_file = root_path + 'forums/fid_' + str(fid) + '_posts.csv'
    group.to_csv(forum_csv_file, index=False, encoding='utf-8')

## SCRATCH

In [159]:
# DIFFERENT DATA STRUCTURES FOR LOADING FILES 
# Scratch cell: several statements depend on names set elsewhere in the
# notebook session.

forums_path = '../examples/threads/parsing/'
skip = ['logs', 'csv', 'error']
dir_paths = get_forum_directories(forums_path, skip)
# NOTE(review): this is a dict literal with a single entry, not a dict
# comprehension - the key uses dir_path from outside the literal and the
# value holds the files of every directory. Looks unintended; confirm.
html_file_paths = {dir_path.split('/')[-1] : \
                   [str(html_file) for dir_path in dir_paths for html_file in Path(dir_path).rglob('*.html')]}
html_file_paths.keys()

# NOTE(review): post_pages_dir and forums are not defined in the visible
# source - presumably left over from an earlier session.
post_pages = [{'forum': forum, 'files': [str(csv_page_file) for csv_page_file in Path(post_pages_dir).glob('fid_' + forum + '*')]} for forum in forums]

i = 0

print 'FORUM:', post_pages[i]['forum']
print 'FILES:'
post_pages[i]['files']

# Relies on pages_dir and data_types from the combine cell.
csv_pages = [{'data_type': data_type, 
              'files': [str(csv_page_file) for csv_page_file in Path(pages_dir + data_type).iterdir()]} for data_type in data_types]

# csv_pages.keys()
# NOTE(review): post_pages is built as a list above, so .keys() raises
# AttributeError.
post_pages.keys()

In [107]:
# IF YOU WANT TO MAKE A LARGE DICT {'1506': [file_one, file_two...file_n], ...'33': [file_one...] }
forums_dir = '../examples/threads/parsing/'
skip = ['logs', 'csv', 'error']

forum_paths = get_forum_directories(forums_dir, skip)

# forum directory name -> '*.html' files directly inside that directory
pages_by_forum = {forum_path.split('/')[-1] : [str(html_file) for html_file 
                                               in Path(forum_path).glob('*.html')] 
                                               for forum_path in forum_paths}

# Print every forum followed by its page files.
for forum, pages in pages_by_forum.items():
    print 'FORUM:', forum, '\n'
    for page in pages:
        print page   
    print ''

In [105]:
# IF YOU WANT TO MAKE A LIST OF DICTS [{'forum': '1506', 'files': [file_one, file_two...file_n]}]

forums_dir = '../examples/threads/parsing/'
skip = ['logs', 'csv', 'error']

forum_paths = get_forum_directories(forums_dir, skip)

# One dict per forum directory: its name and the '*.html' files directly
# inside it.
pages_by_forum = [{'forum': forum_path.split('/')[-1], 
                   'files': [str(html_file) for html_file 
                             in Path(forum_path).glob('*.html')]} 
                             for forum_path in forum_paths]

# Print every forum followed by its page files.
for forum_dict in pages_by_forum:
    print 'FORUM:', forum_dict['forum'], '\n'
    for html_file in forum_dict['files']:
        print html_file
    print ''

In [200]:
# WERE ONE TO COMBINE FILES BY ITERATING OVER FORUMS
def combine_csv_files(csv_files):
    """Read every CSV in csv_files and return them stacked as one DataFrame.

    The frames are concatenated in the given order and the result gets a
    fresh 0..n-1 index.
    """
    frames = [pd.read_csv(csv_file) for csv_file in csv_files]

    return pd.concat(frames, ignore_index=True)

# csv_post_files = ['../examples/threads/parsing/csv/pages/posts/fid_1506_tid_93600_pg_1_posts.csv',
#                   '../examples/threads/parsing/csv/pages/posts/fid_1506_tid_93600_pg_2_posts.csv',
#                   '../examples/threads/parsing/csv/pages/posts/fid_1506_tid_93600_pg_3_posts.csv']

# concat_df = combine_csv_files(csv_post_files)
# concat_df

In [None]:
#OLD VERSION BEFORE MULTIPROCESSING
def write_thread_data_to_csv(thread_data, headers, fid, out_path):
    """Write one CSV per data type for a whole forum and return the frames.

    thread_data -- row lists, paired with headers by position
    headers     -- dicts with 'type' (data type name) and 'headers'
                   (column names)
    fid         -- forum id (string) used in the file name
    out_path    -- root directory; files are written to
                   '<out_path>csv/<type>/<type>_fid_<fid>.csv'

    Returns the DataFrames that were written, in input order.
    """
    def write_one(rows, spec):
        # Build the frame for one data type and write it to its CSV.
        kind = spec['type']
        target = out_path + 'csv/' + kind + '/' + kind + '_fid_' + fid + '.csv'
        frame = pd.DataFrame(rows, columns=spec['headers'])
        frame.to_csv(target, index=False, encoding='utf-8')
        return frame

    return [write_one(rows, spec) for rows, spec in zip(thread_data, headers)]
    

In [None]:
# OKAY, SO WHAT'S GOING ON HERE IS THAT WHATEVER I PASS TO THE MAP FUNCTION MUST BE AN ITERABLE,
# AND THE MAP FUNCTION WILL CALL THE TARGET FUNCTION ONCE FOR EACH ITEM OF THAT ITERABLE.  FOR EXAMPLE:

# NUMS = [NUM_ZERO, NUM_ONE, NUM_TWO...NUM_N]
# NUM_FUNC(NUM)

# WOULD BE: POOL.MAP(NUM_FUNC, NUMS)

# SO WHAT I NEED TO DO IS PASS IT AN ITERABLE OF HTML FILES BUT MAKE THE FUNCTION SUCH THAT IT TAKES ONE FILE

# FORUMS = [{FID: 0, FILES: ['FILE_ONE.TXT', 'FILE_TW0.TXT', 'FILE_THREE.TXT']}]
# PARSE_THREADS(HTML_FILE):
#     DO STUFF WITH THE HTML_FILE

# FOR FORUM IN FORUMS:
#     PARSED_THREADS[FORUM['FID']] = POOL.MAP(PARSE_THREADS, FORUM['FILES'])

    
# HERE THE FORUM['FILES'] IS EQUIVALENT TO THE NUMS
# PARSE_THREADS(HTML_FILE) IS EQUIVALENT TO NUM_FUNC(NUM)
# AND A SINGLE HTML_FILE IS EQUIVALENT TO ONE NUM

In [48]:
# EXCEPTION TESTING
# Checks the '<file name> - <ExceptionClass>: <message>' error line format
# that parse_threads() prints.
some_file = '../this/is/my.file'
try:
    # Deliberately raise ZeroDivisionError.
    1/0

except Exception as err:
    
    # Only the base name of the path is reported.
    file_name = some_file.split('/')[-1]
    print file_name + ' - ' + err.__class__.__name__ + ': ' + str(err)
#     tb = sys.exc_info()[-1]
#     stk = traceback.extract_tb(tb, 1)
#     function_name = stk[0][3]
#     print 'The failing function was', function_name

#     log_msg = file_name + ' - ' + function_name + ' - ' + err.__class__.__name__ + ': ' + str(err)

#     logger.error(log_msg)

my.file - ZeroDivisionError: integer division or modulo by zero


In [79]:
#OLD VERSION BEFORE MULTIPROCESSING
def write_thread_data_to_csv(thread_data, headers, fid, out_path):
    """Write one CSV per data type for a whole forum and return the frames.

    thread_data -- row lists, paired with headers by position
    headers     -- dicts with 'type' (data type name) and 'headers'
                   (column names)
    fid         -- forum id (string) used in the file name
    out_path    -- root directory; files are written to
                   '<out_path>csv/<type>/<type>_fid_<fid>.csv'

    Returns the DataFrames that were written, in input order.
    """
    written = []

    for rows, spec in zip(thread_data, headers):

        kind = spec['type']
        target = out_path + 'csv/' + kind + '/' + kind + '_fid_' + fid + '.csv'

        frame = pd.DataFrame(rows, columns=spec['headers'])
        frame.to_csv(target, index=False, encoding='utf-8')

        written.append(frame)

    return written
            
#         print file_path, df.shape
        
# posts_headers = {'type': 'posts', 'headers': ['date', 'fid', 'tid', 'pg', 'post_num', 'pid', 'user', 'uid', 'text']}
# users_headers = {'type': 'users', 'headers': ['user', 'uid', 'role', 'role_desc', 'join_date', 'location', 'posts', 'thanks', 'posts_thanked']}
# thanks_headers = {'type': 'thanks', 'headers': ['date', 'pid', 'from_uid', 'from_user']}

# thread_data = [posts, users, thank_yous]
# headers = [posts_headers, users_headers, thanks_headers]
# out_path = '../examples/threads/parsing/'

# write_thread_data_to_csv(thread_data, headers, forum['fid'], out_path)

In [None]:
# if __name__ == '__main__':
import parmap
import tqdm

# Parse every saved page below the forum directories in a single process.
forums_path = '../examples/threads/parsing/'
skip = ['logs', 'csv', 'error']



# directory_paths = get_forum_directories(forums_path, skip)
# forums = [get_forum_file_dict(directory_path) for directory_path in directory_paths]

dir_paths = get_forum_directories(forums_path, skip)
# Flat list of every '*.html' file below any forum directory (recursive).
html_file_paths = [str(html_file) for dir_path in dir_paths for html_file in Path(dir_path).rglob('*.html')]

# One parse_threads() call per page; the returned list is not kept here.
parmap.map(parse_threads, html_file_paths, pm_processes=1, pm_pbar=True)


# parsed_threads = {}

# pool = mp.Pool(processes=1)
# pool.map(target=parse_threads, args(html_files)

# for forum in forums:
#     fid = forum['fid']
#     parsed_threads[forum['fid']] = pool.map(parse_threads, forum['files'])

# pool.close()
# pool.join()