# Deliverable for Section 1:  Implementing a Standard Text Mining Workflow

<font size=5 color='orange'>**Authorship ID with Text Mining and ML - a LiveProject by Manning**</font>
<hr>

# Imports

In [1]:
import os
import sys
import pickle
import pandas as pd
import numpy as np
from typing import NamedTuple, List

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn import metrics

# Config and some Globals

In [2]:
%config Completer.use_jedi=False
%config InlineBackend.figure_format = 'retina'

In [3]:
# spell out path where xml files are
xml_path = '/Users/bradgreenwald/projects/manning/Author_ID_Live_Project/data/blogs'

# Helper classes and functions

In [4]:
class BlogPost(NamedTuple):
    """To implement a blog post"""
    auth_id: int
    gender: str
    age: int
    industry: str
    astro: str
    posts: List

In [5]:
from pprint import pprint as pp

# Get pickled data

In [6]:
# retrieve pickled xml files - no need to reload
with open('./data/xml_files.pickle', 'rb') as fh:
    xml_files = pickle.load(fh)

In [7]:
# get the posts
with open('./data/posts.pickle', 'rb') as fh:
    blog_posts = pickle.load(fh)

In [8]:
# get df_posts
with open('./data/df_posts.pickle', 'rb') as fh:
    df_posts = pickle.load(fh)

# Coding

## Load Data

In [9]:
# Create list of xml files (needed because Macs create hidden files in a directory)
xml_filenames = [fn for fn in os.listdir(xml_path) if fn.endswith('xml')]
len(xml_filenames)

19320

In [10]:
def load_xml_files(filenames, path):
    """Create and return dict of files, keyed by filename"""
    xml_files = dict()
    for xml_file in filenames:
        with open(os.path.join(path, xml_file), 'rt', errors='ignore') as fh:
            contents = fh.read()
            xml_files[xml_file] = contents
    return xml_files

In [11]:
# apply function
xml_files = load_xml_files(xml_filenames, xml_path)

In [12]:
len(xml_files)

19320

### Pickle  xml_files

In [13]:
# Takes a while to load, so pickle this
with open('./data/xml_files.pickle', 'wb') as fh:
    pickle.dump(xml_files, fh)

### Test on file with auth_id including 5114

In [14]:
# find the file requested in the assignment
[k for k in xml_files if '5114' in k]

['2511455.female.16.indUnk.Pisces.xml',
 '3511421.female.17.Student.Taurus.xml',
 '4151144.female.14.Student.Sagittarius.xml',
 '4251140.male.26.indUnk.Scorpio.xml',
 '4151142.female.16.Student.Taurus.xml',
 '3451143.female.23.indUnk.Pisces.xml',
 '5114.male.25.indUnk.Scorpio.xml',
 '2925114.female.16.Student.Gemini.xml']

In [15]:
# checking file starting with 5114 for first line per assignment
print(xml_files['5114.male.25.indUnk.Scorpio.xml'])

<Blog>

<date>28,February,2001</date>
<post>


       
      Slashdot raises lots of  urlLink interesting thoughts about banner ads .  The idea is to let users control the ad delivery, and even to allow users to comment on ads.
     

    
</post>

<date>27,February,2001</date>
<post>


       
       urlLink  The Merchants of Cool  , a Frontline documentary featuring Mindjack advisory board member Douglas Rushkoff, is on PBS tonight.  Check your local listings for the time.
     

    
</post>

<date>26,February,2001</date>
<post>


       
       urlLink ATMs dispensing music?   I don't quite see the logic in that.  I'm not entirely against paying a nominal fee for music, or any other media, but if I do have to pay for it I'd be much more likely to buy stuff from my own PC.
     

    
</post>

<date>22,February,2001</date>
<post>


       
      My chair started squeaking a few days ago and it's driving me nuts!
     

    
</post>

<date>20,February,2001</date>
<post>


       
   

## Parse raw data into pandas DataFrame

### Get all posts per xml_file and create BlogPost named tuples with proper metadata attrs

In [16]:
def get_posts(filename):
    """Captures a list of posts, where a post is the text between open (<post>) and close(</post>) tags as a post"""
    # use encoding per instructions
    with open(filename, encoding='Windows-1252', errors='ignore') as fh:
        num_blanks = 0
        posts = []
        next_post = ''
        get_post = False
        for i, line in enumerate(fh.readlines()):
            line = line.lstrip()

            # capture line if get_post flag is set
            if get_post:
                next_post += line
            # set flag to start capturing post when there is a open <post> tag
            if '<post>' in line:
                get_post = True

            # toggle get_post flag if reach close </post> tag
            elif '</post>' in line:
                get_post = False
                # post ended, capture then reset
                posts.append(next_post.replace('</post>', ''))
                next_post = ''
    return posts


def get_all_posts(filenames):
    """Create and return a list of all BlogPost NamedTuple instances with correct metadata from filename"""
    blog_posts = []
    for filename in filenames:
        temp_posts = get_posts(os.path.join(xml_path, filename))
        # get metadata
        md = filename.split('.')[:-1]
        for post in temp_posts:
            bp = BlogPost(int(md[0]), md[1], int(md[2]), md[3], md[4], post)
            blog_posts.append(bp)
    return blog_posts

In [17]:
blog_posts = get_all_posts(xml_filenames)

#### Pickle loaded posts 

In [18]:
with open(os.path.join(os.getcwd(), 'data', 'blog_posts.pickle'), 'wb') as fh:
    pickle.dump(blog_posts, fh)

### Create the DataFrame with posts and metadata

In [19]:
# Create a dataframe from blogposts
df_posts = pd.DataFrame(blog_posts)
df_posts.head()

Unnamed: 0,auth_id,gender,age,industry,astro,posts
0,4162441,male,16,Student,Sagittarius,DESTINY... you might not say anything ...
1,4162441,male,16,Student,Sagittarius,"DEAR ANGEL.. you say it or you don't, ..."
2,4162441,male,16,Student,Sagittarius,MAIN AUR MERI TANHAI (jagjeet singh) awara ha...
3,4162441,male,16,Student,Sagittarius,mail addressrs(s) urlLink http://rediff.com ...
4,4162441,male,16,Student,Sagittarius,RAP- ALLRISE so stand back cause u don't notic...


In [23]:
df_posts.describe()

Unnamed: 0,auth_id,age
count,681288.0,681288.0
mean,2397798.0,23.932321
std,1247720.0,7.785986
min,5114.0,13.0
25%,1239610.0,17.0
50%,2607256.0,24.0
75%,3525660.0,26.0
max,4337650.0,48.0


#### Pickle the new Large BlogPosts dataframe, which includes a row for every blog post along with metadata

In [24]:
# pickle new dataframe
with open(os.path.join(os.getcwd(), 'data', 'df_posts.pickle'), 'wb') as fh:
    pickle.dump(df_posts, fh)

## Check that we can extract data by author_id or list of author ids

In [25]:
sample_auths = [3444474, 2845196]

In [26]:
# We can simply select all rows with a given author id.  If needed we can grab the posts selecting that column
sample = df_posts[df_posts['auth_id']==sample_auths[0]]
print(len(sample))
sample.sample()

82


Unnamed: 0,auth_id,gender,age,industry,astro,posts
93157,3444474,female,17,Student,Sagittarius,"Yeah ok, so I was late leaving my house, and s..."


In [27]:
# We can simply select all rows with a given author id.  If needed we can grab the posts selecting that column
sample = df_posts[df_posts['auth_id'].apply(lambda item: item in sample_auths)]
print(len(sample))
sample.sample(4)

95


Unnamed: 0,auth_id,gender,age,industry,astro,posts
93132,3444474,female,17,Student,Sagittarius,//rocks out in my head Another boring day. I...
93101,3444474,female,17,Student,Sagittarius,"I need to stop posting, but I feel like bitchi..."
93144,3444474,female,17,Student,Sagittarius,"Aaah, my boss is not in today, I feel so relax..."
93081,3444474,female,17,Student,Sagittarius,So I'm listening to really sugary heartache po...


## Get all posts for a provided list of author ids. 

In [28]:
# provided in instructions
auth_ids = [3574878, 2845196, 3444474, 3445677, 828046, 4284264, 3498812, 4137740, 3662461, 3363271]

Pandas question: we can use a filter to select rows which have a particular value for some feature, e.g. all posts with a given auth id.  The boolean expression is a conditional id to the auth_id.  What if have a set of auth_ids, and you want to get all those rows.  Obviously you could just iterate through the list with a for loop, but seems like a common enough operation that there should be a pandas way to do it.  or numpy. 

Another way may be using apply with a function that the auth id is in that list, not sure of relative performance. ENDED UP WITH THIS.  This is another way to create a boolean filter that we can apply to the dataframe.

In [29]:
# create boolean filter
sel_auths = df_posts['auth_id'].apply(lambda x: x in auth_ids)

In [30]:
# This is the subset dataframe
df_posts_auth_sample = df_posts[sel_auths]
df_posts_auth_sample.shape

(153, 6)

In [31]:
df_posts.head(3)

Unnamed: 0,auth_id,gender,age,industry,astro,posts
0,4162441,male,16,Student,Sagittarius,DESTINY... you might not say anything ...
1,4162441,male,16,Student,Sagittarius,"DEAR ANGEL.. you say it or you don't, ..."
2,4162441,male,16,Student,Sagittarius,MAIN AUR MERI TANHAI (jagjeet singh) awara ha...


In [32]:
# Next, get posts (the column)
posts_sample = df_posts_auth_sample['posts']

## Run Standard Text Mining Pipeline

### Prepare data re train-test split and labels; consider stratification because of class imbalance

In [33]:
# first remove unnec columns
df_10auth_sel_posts = df_posts_auth_sample[['auth_id', 'posts']]

In [34]:
df_10auth_sel_posts.shape

(153, 2)

In [35]:
all_data = df_10auth_sel_posts['posts']
all_labels = df_10auth_sel_posts['auth_id']

In [36]:
# check that we have all auths and look at the distribution of the number of posts
df_10auth_sel_posts['auth_id'].value_counts()

3444474    82
3574878    14
2845196    13
828046     12
3498812    10
4284264     5
3662461     5
3445677     5
3363271     5
4137740     2
Name: auth_id, dtype: int64

**Very imbalanced dataset**

In [37]:
# There is generic statement to set up train test split, then you can add options to do it diff ways
df_auth_train, df_auth_test, labels_train, labels_test = train_test_split(all_data, all_labels, 
                                                                          test_size=0.3, stratify=all_labels)

### Create Pipelines for a few diff classifiers

In [38]:
# Create the template - we may not need the transformer because we are using tfidf to vectorize as well as transform.
auth_clf_naive_bayes = Pipeline([
    ('vect', TfidfVectorizer(ngram_range=(1, 6), analyzer='char', use_idf='false')),
    ('clf', MultinomialNB(alpha=1)),
])

In [39]:
# Create the template - we may not need the transformer because we are using tfidf to vectorize as well as transform.
auth_clf_perceptron = Pipeline([
    ('vect', TfidfVectorizer(ngram_range=(1, 6), analyzer='char', use_idf='false')),
    ('clf', Perceptron()),
])

In [40]:
# Create the template - we may not need the transformer because we are using tfidf to vectorize as well as transform.
auth_clf_svm = Pipeline([
    ('vect', TfidfVectorizer(ngram_range=(1, 6), analyzer='char', use_idf='false')),
    ('clf', SGDClassifier()),
])

In [41]:
clfs = {
    'naive_bayes':auth_clf_naive_bayes, 
    'perceptron': auth_clf_perceptron, 
    'svm': auth_clf_svm
}

### Train, predict, evaluate

In [42]:
# Train by using a method on each classifier 
clfs_preds = dict()
for clf_name, clf in clfs.items():
    clf.fit(df_auth_train, labels_train)
    preds = clf.predict(df_auth_test)
    clfs_preds[clf_name] = preds
    acc = np.mean(preds==labels_test)
    print(f'{clf_name:11}  Accuracy: {acc:.4}')

naive_bayes  Accuracy: 0.5435
perceptron   Accuracy: 0.7391
svm          Accuracy: 0.7609


In [43]:
# calculate the f1-scores; labels_test is global
for clf_name, preds in clfs_preds.items():
    print(f'{clf_name} f-scores')
    for avg in ['weighted', 'macro', 'micro']:
        f1 = metrics.f1_score(labels_test, preds, average=avg, zero_division=0)
        print(f'\t{avg:8} {f1:.4}')

naive_bayes f-scores
	weighted 0.3827
	macro    0.07042
	micro    0.5435
perceptron f-scores
	weighted 0.7076
	macro    0.4814
	micro    0.7391
svm f-scores
	weighted 0.7206
	macro    0.3747
	micro    0.7609


## Extra Challenge

In [44]:
### Use the last steps from above to make a function; using the same set of Pipelines defined as clfs

def pipeline_to_f1_scores(diff_samples, clfs):
    """Apply the methodology used above on different sets of sampled data"""
    data = diff_samples['posts']
    labels = diff_samples['auth_id']
    df_auth_train, df_auth_test, labels_train, labels_test = train_test_split(data, labels, 
                                                                          test_size=0.3, stratify=labels)
    clfs.fit(df_auth_train, labels_train)
    preds = clfs.predict(df_auth_test)
    acc = np.mean(preds==labels_test)

    # calculate the f1-scores; labels_test is global
    
    print(f'{clf_name} f-scores')
    for avg in ['weighted', 'macro', 'micro']:
        f1 = metrics.f1_score(labels_test, preds, average=avg, zero_division=0)
        print(f'\t{avg:8} {f1:.4}')

### Create different samples

In [45]:
# take a look at the number of posts per author.  lets try some more balanced samples
auths = df_posts['auth_id'].value_counts()
auths

449628     4221
734562     2301
589736     2294
1975546    2261
958176     2244
           ... 
3570196       1
3649483       1
4056423       1
4182724       1
3944260       1
Name: auth_id, Length: 19320, dtype: int64

In [46]:
auths_25 = auths[(df_posts['auth_id'].value_counts()==25)].index.to_list()
auths25 = auths_25[:10]

In [47]:
auths_15 = auths[(df_posts['auth_id'].value_counts()==15)].index.to_list()
auths15 = auths_15[:10]

In [48]:
df_auth25 = df_posts[df_posts['auth_id'].isin(auths25)][['auth_id', 'posts']]
df_auth25.shape

(250, 2)

In [49]:
df_auth15 = df_posts[df_posts['auth_id'].isin(auths15)][['auth_id', 'posts']]
df_auth15.shape

(150, 2)

In [50]:
pipeline_to_f1_scores(df_auth25)

naive_bayes f-scores
	weighted 0.3223
	macro    0.3232
	micro    0.36
perceptron f-scores
	weighted 0.7347
	macro    0.7367
	micro    0.7467
svm f-scores
	weighted 0.6722
	macro    0.6757
	micro    0.7067


In [51]:
pipeline_to_f1_scores(df_auth15)

naive_bayes f-scores
	weighted 0.2835
	macro    0.303
	micro    0.3778
perceptron f-scores
	weighted 0.6648
	macro    0.6616
	micro    0.6889
svm f-scores
	weighted 0.6314
	macro    0.6244
	micro    0.6889


- One concluding note is that the above approach is slow, and took a long time when I tried larger data sets; I will need to tune it to do that