In [2]:
import pandas as pd
import os, sys
import csv
from pathlib2 import Path

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [3]:
def get_forum_files(forum_path, suffix):
    """Recursively collect every path under a directory that has a given extension.

    Parameters
    ----------
    forum_path : str or Path
        Directory to search; all sub-directories are searched as well.
    suffix : str
        Extension to match, with or without the leading dot
        (``'csv'`` and ``'.csv'`` are treated the same).

    Returns
    -------
    list of str
        Matching paths converted to strings, sorted so the result does not
        depend on the order in which the filesystem lists directory entries.
        The list is empty when nothing matches.
    """
    # Tolerate '.csv' as well as 'csv'; the glob pattern supplies the dot itself.
    pattern = '*.' + suffix.lstrip('.')

    # rglob matches on name only, so a directory named e.g. 'x.csv' would also be returned.
    return sorted(str(match) for match in Path(forum_path).rglob(pattern))

In [4]:
def get_forum_id(forum_path):
    """Pull the forum id out of a post-file path.

    The id is the second underscore-separated field of the file name,
    e.g. ``'.../fid_583_posts.csv'`` gives ``'583'``.

    Returns the id as a string; no numeric conversion is done here.
    """
    file_name = forum_path.rpartition('/')[2]  # everything after the last '/'
    name_fields = file_name.split('_')

    return name_fields[1]

# forum_path = '/l/nx/data/deepspace/prisontalk/data/fid_583_posts.csv'
# fid = get_forum_id(forum_path)
# fid

In [5]:
def get_forum_ids(forum_paths):
    """Return the integer forum id for each path in ``forum_paths``.

    Each path is handed to ``get_forum_id`` and the string it returns is
    converted with ``int``. The output follows the order of the input.
    """
    forum_ids = []
    for path in forum_paths:
        forum_ids.append(int(get_forum_id(path)))

    return forum_ids


# data_path = '/l/nx/data/deepspace/prisontalk/data/'
# forum_posts_path = 'forums/parsed/posts/'
# forum_files = get_forum_files(data_path + forum_posts_path, 'csv')
# file_fids = get_forum_ids(forum_files)
# print len(file_fids)

### FID TO FORUM NAME MAPPING

The forum files only have a column for the forum id (fid), not the name of the forum. We therefore need a mapping from one to the other so that the plots can have proper titles. To build it, we create a dataframe with the fid and name and save it to CSV, so that it can be imported into any program we wish. While we could simply use forum_list.csv, it is important to build the mapping from the files themselves, so that we can be sure each forum file has exactly one mapping and nothing is missing. Forums 957 and 1627 (the ones that require a login) will be removed manually.

In [8]:
data_path = '/l/nx/data/deepspace/prisontalk/data/'
forum_posts_path = 'forums/parsed/posts/'

forum_list = pd.read_csv(data_path + 'forums/forum_ids.csv')
forum_files = get_forum_files(data_path + forum_posts_path, 'csv')
print 'FORUM LIST:', forum_list.shape[0], 'FORUM FILES', len(forum_files)

FORUM LIST: 43 FORUM FILES 41


#### REMOVE FORUMS THAT REQUIRE LOGINS (957, 1627)

In [25]:
not_fid_957 = forum_list['fid'] != 957 # CONDITION A
not_fid_1627 = forum_list['fid'] != 1627 # CONDITION B

no_logins = forum_list[not_fid_957 & not_fid_1627] # NEW DF MINUS FID'S 957 & 1627
scraped_ids = no_logins[['fid', 'name']] # NEW DF WITH FID AND NAME ONLY

scraped_ids.to_csv(data_path + 'forums/scraped_ids.csv', index=False)

print 'FORUM LIST:', forum_list.shape[0], 'FORUM FILES:', len(forum_files), 'SCRAPED IDS:', scraped_ids.shape[0]

FORUM LIST: 43 FORUM FILES: 41 SCRAPED IDS: 41


#### SANITY CHECK  
Confirm lists are identical

In [26]:
data_path = '/l/nx/data/deepspace/prisontalk/data/'
forum_posts_path = 'forums/parsed/posts/'

scraped_forums = pd.read_csv(data_path + 'forums/scraped_ids.csv')
forum_files = get_forum_files(data_path + forum_posts_path, 'csv')

file_fids = get_forum_ids(forum_files) # ID's FROM PARSING THE FILES
scraped_fids = list(scraped_forums['fid']) # ID'S FROM THE forum_list DATAFRAME

is_identical = set(scraped_fids) == set(file_fids) #

print 'SCRAPED FIDS:', len(scraped_fids), 'FILE FIDS', len(file_fids), 'LISTS IDENTICAL:', is_identical

SCRAPED FIDS: 41 FILE FIDS 41 LISTS IDENTICAL: True
