<font color='orange' size=6>Authorship, Manning Live Project, Section 2</font>  
Mar 12, 2021  
<hr>

# Imports

In [1]:
import os
import re
from chardet.universaldetector import UniversalDetector
import pickle
import glob
import chardet
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
from typing import NamedTuple, List

In [2]:
# from project 1; for application of that code below
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn import metrics

# Config including paths, globals, constants

In [3]:
# check current working dir and create dir for files
cur_dir = os.getcwd()
cur_dir

'/Users/bradgreenwald/projects/manning/Author_ID_Live_Project'

In [4]:
# path to directory containing xml files with blogs
xml_path = os.path.join(os.getcwd(), 'data', 'blogs')
xml_path

'/Users/bradgreenwald/projects/manning/Author_ID_Live_Project/data/blogs'

In [5]:
# Create the list of blog xml files. macOS adds its own files to the folder, so keep only
# names with a real ".xml" extension; sort because os.listdir returns entries in arbitrary order.
xml_filenames = sorted(fname for fname in os.listdir(xml_path) if fname.endswith('.xml'))
len(xml_filenames)

19320

# Helper Classes, Functions; also Hoisted Functions

In [6]:
def dirp(x):
    """Return the public attribute names of x, i.e. the entries of dir(x) not starting with '_'."""
    return [d for d in dir(x) if not d.startswith('_')]
from pprint import pprint as pp

In [None]:
"""Use xml parsing from Beatiful Soup to get the content of the posts tag. Later this can be expanded to get dates"""
def get_posts(txt1):
    """Return the text of every <post> element found in the XML document txt1.

    txt1 is handed directly to BeautifulSoup with the 'lxml-xml' parser. The result
    is a list with one string per <post> element, in the order find_all yields them.
    """
    soup = bs(txt1, 'lxml-xml')
    return [post_tag.text for post_tag in soup.find_all('post')]

# Pickled data

In [7]:
# xml files byte content in dict keyed by filename
# (cache written by the pickle.dump cell in the "Get all files as bytes ..." section below)
# NOTE: pickle.load can execute arbitrary code - only load pickles this notebook produced.
with open(os.path.join(os.getcwd(), 'data', 'xml_files.pickle'), 'rb') as fh:
    xml_files = pickle.load(fh)

In [8]:
# decoded text dataframe with filenames
# (cache written by the cell in the "Pickle the dataframe" section below)
# NOTE: pickle.load can execute arbitrary code - only load pickles this notebook produced.
with open(os.path.join(cur_dir, 'data', 'df_txt.pickle'), 'rb') as fh:
    df_txt = pickle.load(fh)

In [9]:
df_txt.sample()

Unnamed: 0,txt,fnames
8290,"<Blog>\r\n\r\n<date>18,October,2003</date>\r\n...",2129306.female.23.Student.Sagittarius.xml


In [10]:
# cleaned post processed posts with metadata
# (cache of df_allposts, written to df_allclean.pickle by the last cell of the post-processing section)
# NOTE: pickle.load can execute arbitrary code - only load pickles this notebook produced.
with open(os.path.join(cur_dir, 'data', 'df_allclean.pickle'), 'rb') as fh:
    df_allposts = pickle.load(fh)

# Write a function that reads in the contents from one of the blogs files as binary data. Don’t try to parse the contents yet. 

In [None]:
# get first file
file1 = os.listdir(xml_path)[0]
file1

In [None]:
def get_bytes(xml_filename):
    """Return the raw bytes of xml_filename, a file located under the notebook-level xml_path."""
    full_path = os.path.join(xml_path, xml_filename)
    with open(full_path, 'rb') as fh:
        return fh.read()

In [None]:
content = get_bytes(file1)
len(content)

## Get all files as bytes into a pickled dictionary for easy access, keyed by filename

In [None]:
# Raw bytes of every blog file, keyed by filename. Reuses get_bytes rather than repeating
# the open/read logic (get_bytes already resolves the name against xml_path).
xml_files = {fname: get_bytes(fname) for fname in xml_filenames}

In [None]:
with open(os.path.join(os.getcwd(), 'data', 'xml_files.pickle'), 'wb') as fh:
    pickle.dump(xml_files, fh)

# Different Approach:  using the most common encodings, try to apply them.  Not using chardet

## Get Encodings by Guess and Check

It turns out that at least one of the files for which chardet returned None as the encoding decoded fine when I used ascii. I wonder how often that would happen if we just tried ascii, utf-8, windows-1252, and ISO-8859-1 (latin1)?

In [None]:
def encodings_guess_and_check(encs_to_try, xml_files_dict):
    """Decode each file's bytes with the first candidate encoding that works.

    For every file the encodings in encs_to_try are attempted in order; the first one
    that decodes without a UnicodeDecodeError is recorded as that file's encoding.
    Order matters: ISO-8859-1 accepts every byte sequence, so list it last.

    Args:
        encs_to_try: ordered list of encoding names to attempt. If empty or None, falls
            back to ['utf-8', 'Windows-1252', 'ISO-8859-1'].
        xml_files_dict: dict keyed by filename, vals are the raw bytes of each file
    Return:
        fnames: list of filenames that could be decoded
        dlist: list of decoded strings, parallel to fnames
        file_enc_dict: dict keyed by filename, vals are the encoding that worked
        decoded_dict: dict keyed by filename, vals are the strings resulting from that encoding
    Raises:
        LookupError: if an attempted entry of encs_to_try is not a known codec name.
    """
    if not encs_to_try:
        encs_to_try = ['utf-8', 'Windows-1252', 'ISO-8859-1']

    file_enc_dict = dict()
    decoded_dict = dict()
    fnames = []
    dlist = []
    count = 0  # number of files that none of the candidate encodings could decode
    for fname, content in xml_files_dict.items():
        for enc in encs_to_try:
            try:
                txt = content.decode(encoding=enc)
            except UnicodeDecodeError:
                continue  # this candidate failed; try the next one
            file_enc_dict[fname] = enc
            decoded_dict[fname] = txt
            dlist.append(txt)
            fnames.append(fname)
            break
        else:
            # no candidate worked: the file is left out of every returned collection
            count += 1
            print(f'{count = }')
    return fnames, dlist, file_enc_dict, decoded_dict

In [None]:
# Run the guess-and-check decoder so that dlist / fnames exist at notebook level
fnames, dlist, file_enc_dict, decoded_dict = encodings_guess_and_check(
    ['utf-8', 'Windows-1252', 'ISO-8859-1'], xml_files
)
x_dict = {
    'txt': dlist,
    'fnames': fnames
}

In [None]:
df_txt = pd.DataFrame(x_dict)
df_txt.head()

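Of the 19,320 files, only 607 fail to decode as UTF-8.
- Falling back to Windows-1252 as a second tier leaves only 33 undecoded.  
- Falling back to ISO-8859-1 as a third tier decodes all of the rest (ISO-8859-1 maps every byte to a character, so it can never fail — a clean decode does not prove the guess is right).  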

## Pickle the dataframe

In [None]:
with open(os.path.join(cur_dir, 'data', 'df_txt.pickle'), 'wb') as fh:
    pickle.dump(df_txt, fh)

## Explore and Parse the Decoded Strings

In [None]:
txt1 = df_txt.loc[0, 'txt']

In [None]:
txt1[:500]

In [None]:
"""Use xml parsing from Beatiful Soup to get the content of the posts tag. Later this can be expanded to get dates"""
# NOTE(review): same helper as the get_posts defined in the "Helper Classes, Functions"
# section near the top; this later definition shadows that one when the notebook runs in order.
def get_posts(txt1):
    """Return the text of every <post> element found in the XML document txt1.

    txt1 is handed directly to BeautifulSoup with the 'lxml-xml' parser. The result
    is a list with one string per <post> element, in the order find_all yields them.
    """
    soup = bs(txt1, 'lxml-xml')
    return [post_tag.text for post_tag in soup.find_all('post')]

In [None]:
# Parse the decoded text in row 3 of df_txt into its list of posts
p3 = get_posts(df_txt.loc[3, 'txt'])
len(p3)

In [None]:
# Check posts
for p in p3:
    print(p)

**Post processing**  
- When the blogs are printed, they seem much better than the raw string, thus one of the issues will be **formatting chars**.  

## Make dataframe with the parsed posts

In [None]:
# change index to filename for easier access
# (guarded so the cell can be re-run: once 'fnames' is the index it is no longer a column)
if 'fnames' in df_txt.columns:
    df_txt = df_txt.set_index('fnames')
df_txt.sample()

In [None]:
# Class for posts to use for creating dataframe
class BlogPost(NamedTuple):
    """One blog post together with the author metadata taken from its filename.

    Instances are built in posts_to_df (one per post) and the resulting list is
    passed to pd.DataFrame, giving one row per post.
    """
    auth_id: int  # NOTE(review): posts_to_df passes the filename token as a str; the int cast happens later
    gender: str
    age: int  # NOTE(review): also arrives as a str from the filename split
    industry: str
    star_sign: str
    post: str

In [None]:
# Build the per-post records (with filename metadata) that later become the posts dataframe
def posts_to_df(filenames):
    """Return a list of BlogPost records, one per post, for the given filenames.

    Each filename is split on '.'; everything but the last piece must unpack into
    auth_id, gender, age, industry and star_sign. The file's decoded text is looked
    up in the notebook-level df_txt (indexed by filename) and parsed with get_posts.
    Despite the name, the result is a plain list - wrap it in pd.DataFrame to get a frame.
    """
    blogposts = []
    for fname in filenames:
        auth_id, gender, age, industry, star_sign = fname.split('.')[:-1]
        blogposts.extend(
            BlogPost(auth_id, gender, age, industry, star_sign, post)
            for post in get_posts(df_txt.loc[fname, 'txt'])
        )
    return blogposts

In [None]:
fname_all = xml_filenames

In [None]:
bp_all = posts_to_df(fname_all)
len(bp_all)

In [None]:
df_allposts = pd.DataFrame(bp_all)

In [None]:
df_allposts.shape

- Learned this last lesson, but worth repeating:
NamedTuples convert easily to pandas DataFrames

# Review 100 randomly sampled text documents from the set. Are there any post-processing steps needed? Are there aspects to the text that should be removed from the data before using it in machine learning?

In [None]:
df_allposts.sample(5)

In [None]:
sample_100 = df_allposts.sample(100)
sample_100.shape

In [None]:
g = (row for row in sample_100.iterrows())

In [None]:
ii = iter(g)

What makes a difference to authors?  Probably not the regular normalization that we do for content analysis, like lowercasing.  Uppercase could be important. 
- suggestion to remove numbers
- suggestion to remove urllink
- urls
- formatting
- punctuation may be important for authors
- maybe things like trademark or copyright symbols
- maybe proper names of certain sorts, like months and places, because they are the same for everyone and don't really distinguish an author's style, especially if we are working char-wise.  Maybe they matter more word-wise. 
- maybe NER
- quotations
- we could remove very common words?
- maybe parens? 
- there's other stuff that you start to notice that maybe won't help to distinguish authors.  

In [None]:
# Patterns are compiled once, not on every call.
_RE_URLLINK = re.compile(r'urlLink')          # literal 'urlLink' marker found in the posts
_RE_URL = re.compile(r'https?://\S+')         # a URL up to the next whitespace, not the rest of the line
_RE_NUM_TOKEN = re.compile(r'\b\w*\d\w*')     # any whole word-token containing a digit
_RE_WHITESPACE = re.compile(r'\s+')           # any run of whitespace, including \n \t \r


def post_process(post):
    """Clean one post for authorship features.

    Steps, in order:
      1. replace the literal 'urlLink' marker with a space
      2. replace http/https URLs with a space (before numbers, so URLs containing
         digits are removed whole and not chopped into leftovers)
      3. replace every token that contains a digit with a space
      4. collapse all whitespace runs to one space and strip both ends

    Case and punctuation are left untouched.

    Args:
        post: the raw post text (str)
    Returns:
        the cleaned text (str)
    """
    post = _RE_URLLINK.sub(' ', post)
    post = _RE_URL.sub(' ', post)
    post = _RE_NUM_TOKEN.sub(' ', post)
    post = _RE_WHITESPACE.sub(' ', post)
    # strip last: the substitutions above can leave a space at either end
    return post.strip()

In [None]:
h = next(ii)
print(len(h[1]['post']))
print(post_process(h[1]['post']))

In [None]:
new = post_process(df_allposts.loc[0, 'post'])
new

In [None]:
re.sub(r'[\s]+',' ', new)

### Try to apply to the post feature by row

In [None]:
# test
clean_posts = []
for txt in df_allposts[:10]['post']:
    print(post_process(txt))

In [None]:
df_allposts['clean'] = df_allposts['post'].apply(post_process)

In [None]:
# looks like it worked.  gonna pickle it
df_allposts.head()

In [None]:
### pickle df_cleanposts
with open(os.path.join(cur_dir, 'data', 'df_allclean.pickle'), 'wb') as fh:
    pickle.dump(df_allposts, fh)

# Update the dataset loading code from milestone 1 to use this process of BeautifulSoup to load the data, and then your post-processing script afterwards.

- I already did that as part of the process. Actually I'm not sure how we could have done the above other than if we did this, unless, again, I misunderstood the instructions

# Finalize the workflow by ensuring you can read your documents in, clean them up, and classify them using the rest of the procedure we used in Milestone 1.

- Using the list of authors from part 1, get all their posts
- sample 10 for each author (from the solution template)
- create train-test split in portion of dataset (some columns)
- create pipeline
- apply pipeline

In [None]:
# cast auth_id and age as ints (both come out of the filename split as strings)
df_allposts = df_allposts.astype({'auth_id': int, 'age': int})

In [None]:
authors = [3574878, 2845196, 3444474, 3445677, 828046, 
                       4284264, 3498812, 4137740, 3662461, 3363271]

In [None]:
auth10_posts = df_allposts[df_allposts['auth_id'].isin(authors)]

In [None]:
auth10_posts.shape

In [None]:
auth10_posts.sample()

In [None]:
all_data = auth10_posts['clean']
all_labels = auth10_posts['auth_id']

In [None]:
# Create the template - we may not need the transformer because we are using tfidf to vectorize as well as transform.
# use_idf must be the boolean False: the string 'false' is truthy, so IDF weighting would stay
# enabled (and scikit-learn releases that validate parameter types reject a str here).
auth_clf = Pipeline([
    ('vect', TfidfVectorizer(ngram_range=(1, 6), analyzer='char', use_idf=False)),
    ('clf', SGDClassifier()),
])

In [None]:
def pipeline_to_f1_scores(diff_samples, clfs, text_col='post', label_col='auth_id',
                          test_size=0.3, random_state=None):
    """Fit clfs on a stratified train split of diff_samples and report test-set F1 scores.

    Args:
        diff_samples: DataFrame with one row per document, holding the text and label columns.
        clfs: estimator or Pipeline exposing fit(X, y) and predict(X); it is fitted in place.
        text_col: column holding the document text (default 'post', the raw post text).
            NOTE(review): the notebook stores its cleaned text in the 'clean' column;
            pass text_col='clean' to classify on that instead.
        label_col: column holding the class labels (default 'auth_id').
        test_size: fraction of rows held out for testing (default 0.3).
        random_state: seed forwarded to train_test_split; None leaves the split unseeded.
    Returns:
        dict with the test-set 'accuracy' and the 'weighted', 'macro' and 'micro' F1 scores.
        The F1 scores are also printed.
    """
    data = diff_samples[text_col]
    labels = diff_samples[label_col]
    # stratify so every author appears in both splits in proportion
    data_train, data_test, labels_train, labels_test = train_test_split(
        data, labels, test_size=test_size, stratify=labels, random_state=random_state)
    clfs.fit(data_train, labels_train)
    preds = clfs.predict(data_test)

    scores = {'accuracy': float(np.mean(preds == labels_test))}

    print('f-scores')
    for avg in ['weighted', 'macro', 'micro']:
        f1 = metrics.f1_score(labels_test, preds, average=avg, zero_division=0)
        scores[avg] = f1
        print(f'\t{avg:8} {f1:.4}')
    return scores

In [None]:
pipeline_to_f1_scores(auth10_posts, auth_clf)

# END