In [3]:
import pandas as pd
import numpy as np

import json
import re
import os
import unicodedata
import itertools
import string
from nltk.corpus import stopwords
import nltk
import csv
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


In [4]:
import pickle
from nltk import pos_tag
from nltk.tag.mapping import tagset_mapping
from nltk.util import ngrams

from collections import Counter

# Processing Function

In [5]:
def process_corpus(path, replace_dict, min_length=10):
    """
    Removes source-specific artifacts from documents.
    
    :param path: filepath containing JSON files for each document
    :param replace_dict: dictionary containing regex matching strings and strings to replace them with
    :param min_length: minimum document length
    
    :returns: list of strings, each containing document content
    """
    
    docs = []
    #files = os.listdir(path)
    json_files = [pos_json for pos_json in os.listdir(path) if pos_json.endswith('.json')]

    for file in json_files:
        #file = '\\' + file
        content = json.load(open(path + file))['content']
        
        # replace regex strings
        for key, value in replace_dict.items():
            content = re.sub(key, value, content)
        
        # remove small documents
        if len(content) >= min_length:
            docs.append(content)
        
    return(docs)

# Economist

In [6]:
economist_path = 'data_updated/economist/'

In [7]:
economist_dict = {}

# artifacts on accented letters
economist_dict['AaAaAeA '] = 'i'
economist_dict['AaAaAeAo'] = 'c'
economist_dict['AaAaAeAc'] = 'a'
economist_dict['AaAaAeA~'] = 'n'
economist_dict['AaAaAeA@|AaAaAeA\?|AaAaAeA{|AaAaAe'] = 'e'



# numbers
economist_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'


# Go online artifacts
# end paragraph without punctuation (probably headers or titles)
economist_dict['<p>Go online ([^<]*)</p>|<p>([^<]*)([^.?!"]){1}</p>'] = ''

# end of paragraph tags
economist_dict['</p>'] = ''


In [8]:
economist_docs = process_corpus(economist_path, economist_dict)

In [9]:
print(len(economist_docs))

1104


In [10]:
economist_paragraphs = []
for doc in economist_docs:
    economist_paragraphs += doc.strip().split('<p>')
    
economist_paragraphs = [doc.strip() for doc in economist_paragraphs if len(doc) != 0]

In [11]:
len(economist_paragraphs)

11198

# Wired

In [12]:
wired_path = 'data_updated/wired/'

In [13]:
wired_dict = {}

# numbers
wired_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'

# end paragraph without punctuation (probably headers or titles)
# author/subject descriptions at end of article
# paragraph symbols
wired_dict['<p>([^<]*)([^.?!"]){1}</p>|<p>([^<]*)([A-Z]+) \(@(.*)|¶'] = ''

#email addresses
wired_dict['[\w\.-]+@[\w\.-]+']=''

# end of paragraph tags
wired_dict['</p>'] = ''

In [14]:
wired_docs = process_corpus(wired_path, wired_dict)

In [15]:
print(len(wired_docs))

1296


In [16]:
wired_paragraphs = []

for doc in wired_docs:
    wired_paragraphs += doc.strip().split('<p>')
    
wired_paragraphs = [doc.strip() for doc in wired_paragraphs if len(doc) != 0]

In [17]:
len(wired_paragraphs)

17142

# New Yorker

In [18]:
newyorker_path = 'data_updated/newyorker/'

In [19]:
newyorker_dict = {}

# artifacts on accented letters
newyorker_dict['AaAaAeA '] = 'i'
newyorker_dict['AaAaAeAo'] = 'c'
newyorker_dict['AaAaAeAc'] = 'a'
newyorker_dict['AaAaAeA~'] = 'n'
newyorker_dict['AaAaAeA@|AaAaAeA\?|AaAaAeA{|AaAaAe'] = 'e'

# numbers
newyorker_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'

# end paragraph without punctuation (probably headers or titles)
# bylines
newyorker_dict['<p>([^<]*)([^.?!"]){1}</p>|<p>Byline([^<]*)</p>'] = ''

# end of paragraph tags
newyorker_dict['</p>'] = ''

In [20]:
newyorker_docs = process_corpus(newyorker_path, newyorker_dict)

In [21]:
print(len(newyorker_docs))

807


In [22]:
newyorker_paragraphs = []
for doc in newyorker_docs:
    newyorker_paragraphs += doc.strip().split('<p>')
    
newyorker_paragraphs = [doc.strip() for doc in newyorker_paragraphs if len(doc) != 0]

In [23]:
len(newyorker_paragraphs)

19030

# EW

In [24]:
ew_path = 'data_updated/ew/'

In [25]:
ew_dict = {}

# artifacts on accented letters
ew_dict['AaAaAeA '] = 'i'
ew_dict['AaAaAeAo'] = 'c'
ew_dict['AaAaAeAc'] = 'a'
ew_dict['AaAaAeA~'] = 'n'
ew_dict['AaAaAeA@|AaAaAeA\?|AaAaAeA{|AaAaAe'] = 'e'

# numbers
ew_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'

#email addresses
ew_dict['[\w\.-]+@[\w\.-]+']=''

# Go online artifacts
# end paragraph without punctuation (probably headers or titles)
#ew_dict['<p>Go online ([^<]*)</p>|<p>([^<]*)([^.?!+"]){1}</p>'] = ''

#bullet points
ew_dict['\xc2\xb7']=''

# end of paragraph tags
ew_dict['</p>'] = ''

In [26]:
ew_docs = process_corpus(ew_path, ew_dict)

In [27]:
print(len(ew_docs))

2190


In [28]:
ew_paragraphs = []

for doc in ew_docs:
    ew_paragraphs += doc.strip().split('<p>')
    
ew_paragraphs = [doc.strip() for doc in ew_paragraphs if len(doc) != 0]

In [29]:
def remove_ratings(graf):
    
    """Removes ratings such as "B+", "A", "C-" from the end of a paragraph."""
    
    # Check if the last character in the last word is a sentence-ending character ('.', '!' and so on)
    if not any(x == graf.split()[-1][-1] for x in ['.', '!', '?', '\"', ')', '”', '…']):
        
        # If not, return paragraph with the last word removed
        return graf[0:-len(graf.split()[-1])].strip()
    
    else:
        return graf

In [30]:
ew_paragraphs = [remove_ratings(p) for p in ew_paragraphs]

In [31]:
print(len(ew_paragraphs))

17536


# Make Dataframe

In [32]:
paragraphs = economist_paragraphs + wired_paragraphs + newyorker_paragraphs+ew_paragraphs

In [33]:
sources = list(itertools.repeat('economist', len(economist_paragraphs)))
sources += list(itertools.repeat('wired', len(wired_paragraphs)))
sources += list(itertools.repeat('newyorker', len(newyorker_paragraphs)))
sources +=list(itertools.repeat('ew',len(ew_paragraphs)))

In [34]:
d = {'content':paragraphs, 'source':sources}
final_df = pd.DataFrame(data=d)

In [35]:
len(final_df)

64906

In [36]:
final_df.head()

Unnamed: 0,content,source
0,"DOWN the Euphrates river, halfway between Deir...",economist
1,Never have America and its allies had such a h...,economist
2,"But like their Parthian forebears, Iran and it...",economist
3,Iran's gains are even more striking elsewhere....,economist
4,"Farther south, America's hopes of stemming Ira...",economist


In [37]:
#final_df.to_csv('pre_pre_processed_data.csv')

# More Preprocessing

In [38]:
def removeHTMLTags(x):
    content = re.sub("(?i)<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>", '', x)
    #content1 = re.sub("\\b\\x94\\b", ' ', content)
    
    return(content)

In [39]:
final_df['Content_Preprocessed']=final_df['content'].apply(lambda x:removeHTMLTags(x))

In [40]:
final_df['Content_Preprocessed'].head()

0    DOWN the Euphrates river, halfway between Deir...
1    Never have America and its allies had such a h...
2    But like their Parthian forebears, Iran and it...
3    Iran's gains are even more striking elsewhere....
4    Farther south, America's hopes of stemming Ira...
Name: Content_Preprocessed, dtype: object

In [41]:
# Create a version of the content where all text inside quotes is replacde by 'Qx'

def replace_quotes(X):
    
    """
    Replaces all text inside quotes with 'Qx'.
    """
    
    X = re.sub(r'\"([^\"]+?)[^!?\'\.,]\" ', "\'Qx\' ", X) # "text"+space
    
    X = re.sub(r'\"([^\"]+?)\"\.', "\'Qx\'.", X) # "text". -- applies to economist (british)
    X = re.sub(r'\"([^\"]+?)\",', "\'Qx\',", X) # "text", -- applies to economist (british)
    
    X = re.sub(r'\"([^\"]+?),\"', "\'Qx,\'", X) # "text,"   
    X = re.sub(r'\"([^\"]+?)[!?\'\.]\"', "\'Qx.\'", X) # "text!" or "text?" or "text." 
    
    X = re.sub(r'“(.+?),”', "\'Qx,\'", X) # “text,” (CURLY QUOTES)
    X = re.sub(r'“(.+?)[! ?\'\.]”', "\'Qx.\'", X) # “text!” or “text?” or “text.”
    X = re.sub(r'“(.+?)”', "\'Qx\'", X) # “text”

    X = re.sub(r'\x93(.+?),\x94', "\'Qx,\'", X) # "text," -- applies to wired (weird chars)
    X = re.sub(r'\x93(.+?)[! ?\'\.]\x94', "\'Qx.\'", X) # "text!" or "text " or "text?" or "text." -- wired
    X = re.sub(r'\x93(.+?)\x94', "\'Qx\'", X) # "text" -- applies to wired (weird chars)
    
    return X

replaced_text = [replace_quotes(s) for s in final_df['Content_Preprocessed'].values]

final_df['Content_NoQuotes'] = replaced_text

In [42]:
# Filter out paragraphs with 20 non-quote words or less

# final_df_filtered = final_df[final_df['Content_NoQuotes'].apply(lambda x: len([w for w in x.split() if 'Qx' not in w])) > 20]


In [43]:
filter_bool = final_df['Content_NoQuotes'].apply(lambda x: len([w for w in x.split() if 'Qx' not in w])) > 20

In [44]:
final_df_filtered = final_df[['source', 'Content_Preprocessed', 'Content_NoQuotes']][filter_bool.values]

In [45]:
final_df_filtered.shape

(52812, 3)

In [46]:
# Check class balance

final_df_filtered.groupby('source').count()

Unnamed: 0_level_0,Content_Preprocessed,Content_NoQuotes
source,Unnamed: 1_level_1,Unnamed: 2_level_1
economist,10982,10982
ew,12414,12414
newyorker,15780,15780
wired,13636,13636


# Feature Creation

In [47]:
from textstat.textstat import textstat

In [48]:
# Readability scores
# final_df_filtered['readability_f'] = final_df_filtered['Content_Preprocessed'].apply(textstat.flesch_reading_ease)
final_df_filtered['readability_SMOG']=final_df_filtered['Content_Preprocessed'].apply(textstat.smog_index)

In [49]:
# Tokenize by sentences
final_df_filtered['Sentence_Tokens'] = final_df_filtered['Content_Preprocessed'].apply(nltk.sent_tokenize)

# Number of sentences = length of sentence tokens list
final_df_filtered['Num_Sentences'] = final_df_filtered['Sentence_Tokens'].apply(len)

# Average sentence length = number of words/number of sentences
word_counts = final_df_filtered['Content_Preprocessed'].apply(lambda x: len(x.split()))
final_df_filtered['avg_sent_len'] = word_counts/final_df_filtered['Num_Sentences']

In [50]:
final_df_filtered.reset_index(inplace = True)

In [51]:
final_df_filtered.sort_values(by = 'Num_Sentences', ascending=False).head()

Unnamed: 0,index,source,Content_Preprocessed,Content_NoQuotes,readability_SMOG,Sentence_Tokens,Num_Sentences,avg_sent_len
19400,21553,wired,"WIRED: Peter Thiel, expressing his dissatisfac...","WIRED: Peter Thiel, expressing his dissatisfac...",10.6,"[WIRED: Peter Thiel, expressing his dissatisfa...",38,13.421053
18741,20796,wired,"WIRED: You founded Metafilter, a communal webl...","WIRED: You founded Metafilter, a communal webl...",9.0,"[WIRED: You founded Metafilter, a communal web...",37,13.027027
38377,44973,newyorker,In a conversation that took place in a buildin...,In a conversation that took place in a buildin...,10.7,[In a conversation that took place in a buildi...,36,3.416667
40780,47907,ew,YOU KNOW IT'S BEEN A GOOD PARTY WHEN JOSS WHED...,YOU KNOW IT'S BEEN A GOOD PARTY WHEN JOSS WHED...,10.6,[YOU KNOW IT'S BEEN A GOOD PARTY WHEN JOSS WHE...,33,16.515152
15399,16615,wired,GEORGE CARLIN. Lenny Bruce. That dog puppet. W...,GEORGE CARLIN. Lenny Bruce. That dog puppet. W...,10.3,"[GEORGE CARLIN., Lenny Bruce., That dog puppet...",31,12.451613


In [52]:
final_df_filtered.Num_Sentences.describe()

count    52812.000000
mean         4.953912
std          2.728608
min          1.000000
25%          3.000000
50%          5.000000
75%          6.000000
max         38.000000
Name: Num_Sentences, dtype: float64

In [53]:
final_df_filtered.avg_sent_len.describe()

count    52812.000000
mean        20.059174
std          7.261715
min          3.000000
25%         15.300000
50%         19.000000
75%         23.500000
max        180.000000
Name: avg_sent_len, dtype: float64

In [54]:
# Get standard deviation sentence length for each paragraph

def getSDSentLen(p):
    sent_tokens = p[0]
    avg_sent_len = p[1]
    
    # Get length of each sentence as np array
    lengths = np.array([len(s.split()) for s in sent_tokens])
    
    # Calculate SD
    return (np.sqrt(np.sum((lengths - avg_sent_len)**2)/len(sent_tokens)))

final_df_filtered['sd_sent_len'] = final_df_filtered[['Sentence_Tokens',
                                                      'avg_sent_len']].apply(lambda x: getSDSentLen(x), axis=1)

In [55]:
# If have time: These functions could be rewritten so they run faster...
# decide what's not a word based on presence of alphabetic characters, not list of non-words

def normStopWordFrequency(para):
    stopwords1=set(stopwords.words('english'))
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``", "'Qx", "'"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    word_tok_stop=[i for i in word_tok if i.lower() in stopwords1]
    n_stop=len(word_tok_stop)
    return(n_stop/nw)

def normFunctWordFrequency(functional,para):
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``", "'Qx", "'"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    word_tok_funct=[i for i in word_tok if i.lower() in functional]
    n_funct=len(word_tok_funct)
    return(n_funct/nw)

def normPunctFrequency(para):
    count = lambda l1, l2: len(list(filter(lambda c: c in l2, l1)))
                               
    no_punct = count(para, string.punctuation)
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``", "'Qx", "'"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    return(no_punct/nw)

In [56]:
final_df_filtered['norm_stop_freq'] = final_df_filtered.apply(lambda row: normStopWordFrequency(row['Content_NoQuotes']), axis=1)


In [57]:
final_df_filtered['norm_punct_freq'] = final_df_filtered.apply(lambda row: normPunctFrequency(row['Content_NoQuotes']), axis=1)

In [58]:
# Get functional words

functional_file = open("functional.txt", "r")
f_words = [word.strip() for line in functional_file.readlines() for word in line.split(',') if word.strip()]
functional_file.close()

functional = list(set(f_words))

In [59]:
final_df_filtered['norm_funct_freq'] = final_df_filtered.apply(lambda row: normFunctWordFrequency(functional,
                                                                                                  row['Content_NoQuotes']),axis=1)

# POS Tag/Punctuation-Based Features

In [60]:
# IF MOVING THIS UP, remember token is now "Qx" not "'Qx"

def replace_qx(graf):
    
    """
    Replaces all instances of 'Qx' with "Qx"
    """

    graf = re.sub("'Qx'", '"Qx"', graf)
    graf = re.sub("'Qx,'", '"Qx,"', graf)
    graf = re.sub("'Qx.'", '"Qx."', graf)
    
    return graf

In [61]:
# Replace 'Qx' with "Qx" so as to not confuse tokenizer/tagger 
replaced_qx = [replace_qx(p) for p in final_df_filtered['Content_NoQuotes'].values]

final_df_filtered['Content_NoQuotes'] = replaced_qx

In [62]:
# Tokenize paragraphs (with quotes rendered as "Qx")
tokens = [nltk.word_tokenize(p) for p in final_df_filtered['Content_NoQuotes']]

# get (word, tag) tuples
pos_tags_raw = [pos_tag(token_list) for token_list in tokens]

In [63]:
universal_map = tagset_mapping('en-ptb', 'universal')

def map_taglist(tag_list):
    """
    Converts a list of PTB tags to a list of universal tags.
    """
    return([universal_map[tag] for tag in tag_list])

In [65]:
pos_tags = []

# only keep actual tags, as well as 'Qx', '!' and '?', etc.
for i in range(len(pos_tags_raw)):
    pos_tags.append([p if t not in ("Qx",'!', '?', ';', ':') else t for t, p in pos_tags_raw[i]])

In [66]:
# get universal POS tags

universal_tags = [map_taglist(taglist) for taglist in pos_tags]

In [67]:
# Adding tags back in to create custom tagset

# ':' includes dashes and colons and semi-colons

# list of accepted tags that we want to put back in
# UH = interjection
# VBG = gerund of present participle
# VBD = Verb, past tense
# PRP = Personal pronoun
# VBN = Verb, past participle

accepted_tags = ("Qx", ',', 'UH', 'VBG', 'VBD', 'PRP', 'NNS', 'VBP',
                 '!', '?', '``', "''", ':', ';')

def replace_tags(pos, univ):
    """Replaces tag in in universal list wherever it appears in POS tag list and is acceptable."""
    
    for i in range(len(pos)):
        if pos[i] in accepted_tags:
            univ[i] = pos[i]
    return univ

In [68]:
# Replace tags
universal_modified = [replace_tags(p, u) for p, u in list(zip(pos_tags,
                                                              universal_tags))]

In [69]:
# Put POS tags column in DF
final_df_filtered['pos_tags'] = universal_modified

In [70]:
# See final tag set
flat_tags = list(itertools.chain.from_iterable(final_df_filtered['pos_tags']))
universal_tagset = set(flat_tags)
print(universal_tagset)

{'ADJ', 'PRON', 'DET', 'VBG', 'VBD', 'PRP', 'X', 'VERB', ':', 'NUM', '.', 'NNS', 'CONJ', 'VBP', 'PRT', '?', "''", 'UH', ';', '!', 'ADP', 'NOUN', 'ADV', 'Qx', ',', '``'}


# Create Tag N-Grams

In [71]:
# Make bigrams and trigrams out of custom tagset

final_df_filtered['tag_bigrams'] = final_df_filtered['pos_tags'].apply(lambda x: list(ngrams(x, 2)))
final_df_filtered['tag_trigrams'] = final_df_filtered['pos_tags'].apply(lambda x: list(ngrams(x, 3)))

In [72]:
# Get set of all n-grams

set_bigrams = set(list(itertools.chain.from_iterable(final_df_filtered['tag_bigrams'])))
set_trigrams = set(list(itertools.chain.from_iterable(final_df_filtered['tag_trigrams'])))

set_grams = list(set_bigrams) + list(set_trigrams) + list(universal_tagset)

In [73]:
len(set_grams)

9844

In [74]:
def get_gram_set(p):
    
    """Gets the set of unigrams, bigrams and trigrams in a paragraph."""
    
    return set(p[0] + p[1] + p[2])


grafs_set_grams = final_df_filtered[['pos_tags',
                                     'tag_bigrams',
                                     'tag_trigrams']].apply(lambda x: get_gram_set(x), axis=1)

In [75]:
# Get the document frequency of each n-gram:

# Flatten list so we can count the document frequency for each gram
all_set_grams = [gram for p in grafs_set_grams for gram in p]

# Create document frequency counter
doc_freq = Counter(all_set_grams)

# rank by count
doc_freq_ranked = doc_freq.most_common()

In [76]:
# See top 30 most common grams
doc_freq_ranked[0:30]

[('NOUN', 52804),
 ('.', 52742),
 ('ADP', 52651),
 ('DET', 52531),
 ('VERB', 51666),
 ('ADJ', 51656),
 (',', 50897),
 (('DET', 'NOUN'), 50679),
 ('NNS', 48873),
 (('NOUN', 'ADP'), 48473),
 ('ADV', 48346),
 ('PRT', 48179),
 (('NOUN', '.'), 48130),
 (('ADJ', 'NOUN'), 47951),
 (('ADP', 'DET'), 47895),
 (('NOUN', 'NOUN'), 47119),
 ('CONJ', 46459),
 (('NOUN', ','), 46120),
 (('ADP', 'NOUN'), 46061),
 ('PRP', 44069),
 (('DET', 'ADJ'), 43619),
 (('ADP', 'DET', 'NOUN'), 41903),
 ('VBD', 39682),
 ('VBG', 39664),
 ('PRON', 39382),
 (('DET', 'ADJ', 'NOUN'), 38863),
 (('NOUN', 'VERB'), 35954),
 (('NOUN', 'PRT'), 35615),
 (('VERB', 'ADP'), 35411),
 (('DET', 'NOUN', 'ADP'), 35140)]

In [77]:
# Get the document frequency of each n-gram grouped by publication

# Get set of n-grams for every paragraph for every publication
ec_set_grams = [grams for grams, in_ec in list(zip(grafs_set_grams,
                                                   final_df_filtered['source']=='economist')) if in_ec]

ny_set_grams = [grams for grams, in_ny in list(zip(grafs_set_grams,
                                                   final_df_filtered['source']=='newyorker')) if in_ny]

wd_set_grams = [grams for grams, in_wd in list(zip(grafs_set_grams,
                                                   final_df_filtered['source']=='wired')) if in_wd]

ew_set_grams = [grams for grams, in_ew in list(zip(grafs_set_grams,
                                                   final_df_filtered['source']=='ew')) if in_ew]

# Flatten lists so we can count grams and make counters
ec_counter = Counter(list(itertools.chain.from_iterable(ec_set_grams)))
ny_counter = Counter(list(itertools.chain.from_iterable(ny_set_grams)))
wd_counter = Counter(list(itertools.chain.from_iterable(wd_set_grams)))
ew_counter = Counter(list(itertools.chain.from_iterable(ew_set_grams)))

In [78]:
# Get probability of each source

source_counts = Counter(final_df_filtered['source'])
total_grafs = len(final_df_filtered['source'])


# Probability of seeing a WIRED paragraph
p_wd = source_counts['wired']/total_grafs

# Probability of seeing an Economist paragraph
p_ec = source_counts['economist']/total_grafs

# Probability of seeing a New Yorker paragraph
p_ny = source_counts['newyorker']/total_grafs

# Probability of seeing a WIRED paragraph
p_ew = source_counts['ew']/total_grafs

print(p_wd, p_ec, p_ny, p_ew)

0.25819889419071423 0.20794516397788382 0.298795728243581 0.23506021358782095


In [79]:
# Create dictionary of source probabilities

source_prob = dict(zip(['wd', 'ec', 'ny', 'ew'], [p_wd, p_ec, p_ny, p_ew]))

source_prob

{'wd': 0.25819889419071423,
 'ec': 0.20794516397788382,
 'ny': 0.298795728243581,
 'ew': 0.23506021358782095}

In [80]:
# Get source_counter[gram] for each source as a dictionary, for each gram

def get_gram_dict(gram):
    return dict(zip(['wd', 'ec', 'ny', 'ew'], [wd_counter[gram], ec_counter[gram], ny_counter[gram], ew_counter[gram]]))

all_gram_dicts = dict(zip(set_grams, [get_gram_dict(gram) for gram in set_grams]))

In [81]:
get_gram_dict('NOUN')

{'wd': 13633, 'ec': 10981, 'ny': 15780, 'ew': 12410}

In [82]:
all_gram_dicts['NOUN']

{'wd': 13633, 'ec': 10981, 'ny': 15780, 'ew': 12410}

In [83]:
# Update source_count keys
source_counts['wd'] = source_counts.pop('wired')
source_counts['ny'] = source_counts.pop('newyorker')
source_counts['ec'] = source_counts.pop('economist')

source_counts

Counter({'ew': 12414, 'wd': 13636, 'ny': 15780, 'ec': 10982})

In [84]:
def calc_ig(gram_df, gram_dict, source_prob, source_counts, total_grafs):
    
    """ 
    Calculates Information Gain for a given n-gram.
    """
    
    # p_gram is probability of seeing this gram across all paragraphs
    # p_not_gram is 1 - p_gram
    p_gram = gram_df/total_grafs
    p_not_gram = 1 - p_gram
    
    # p(s|gram) is the probability of the source s given that we observe the gram
    p_source_given_gram = [gram_dict[s]/gram_df for s in ['wd', 'ec', 'ny', 'ew']]
    # If any of the probabilities are 0, change them to 1 so np.log(x) == 0
    p_source_given_gram = [p if p!=0 else 1 for p in p_source_given_gram]
    
    # Σ(p(s|g) * log(p(s|g)))
    s_given_g_entropy = sum([p * np.log(p) for p in p_source_given_gram])
    
    # p(s|not gram)
    p_s_given_not_gram = [(source_counts[s] - gram_dict[s])/(total_grafs - gram_df) for s in ['wd', 'ec', 'ny', 'ew']]
    # Change 0's to 1's so np.log(x) == 0
    p_s_given_not_gram = [p if p!=0 else 1 for p in p_s_given_not_gram]
    
    # Σ(p(s|not g) * log(p(s|not g)))
    p_s_given_not_g_entropy = sum([p * np.log(p) for p in p_s_given_not_gram])
    
    # Σ(p(s) * log(p(s))) 
    source_entropy = sum([(source_prob[s] * np.log(source_prob[s])) for s in ['wd', 'ec', 'ny', 'ew']])
    
    return (-source_entropy + (p_gram * s_given_g_entropy) + (p_not_gram * p_s_given_not_g_entropy))

In [85]:
# Information Gain

gram_ig = {gram: calc_ig(doc_freq[gram], all_gram_dicts[gram],
                         source_prob, source_counts, total_grafs) for gram in set_grams}

In [86]:
# Sort by IG value
sorted_ig = [(term, gram_ig[term]) for term in sorted(gram_ig,
                                                      key=gram_ig.get,
                                                      reverse=True)]

In [87]:
# Create dataframe of the proportional frequency of each n-gram in each publication, among
# the top 20 n-grams by information gain

top_ig_grams = sorted_ig[0:25]

gram = [g for g, ig in top_ig_grams]
gain = [ig for g, ig in top_ig_grams]
ec_prop = np.round([ec_counter[g]/source_counts['ec']*100 for g, ig in top_ig_grams], 2)
ny_prop = np.round([ny_counter[g]/source_counts['ny']*100 for g, ig in top_ig_grams], 2)
wd_prop = np.round([wd_counter[g]/source_counts['wd']*100 for g, ig in top_ig_grams], 2)
ew_prop = np.round([ew_counter[g]/source_counts['ew']*100 for g, ig in top_ig_grams], 2)

top_grams_df = pd.DataFrame(list(zip(gram, gain, ec_prop, ny_prop, wd_prop, ew_prop)),
                            columns=['gram', 'gain', 'ec_prop', 'ny_prop', 'wd_prop', 'ew_prop'])

In [88]:
top_grams_df

Unnamed: 0,gram,gain,ec_prop,ny_prop,wd_prop,ew_prop
0,"(., '')",0.081072,7.86,54.28,17.97,37.31
1,"(Qx, .)",0.078511,7.85,53.45,18.0,37.05
2,"(Qx, ., '')",0.07828,7.85,53.33,17.92,36.99
3,"(``, Qx, .)",0.07746,7.83,53.02,17.92,36.97
4,"(PRP, VBD)",0.064996,26.5,68.61,40.19,27.2
5,``,0.062612,37.78,74.82,31.74,47.65
6,'',0.06239,37.89,75.03,32.12,48.02
7,Qx,0.0609,36.73,73.78,31.52,46.62
8,"(``, Qx)",0.060564,36.6,73.47,31.26,46.5
9,"(,, '')",0.059261,9.51,49.67,19.09,33.06


In [89]:
# Manually select tags based on above table so we don't get repeats/multicollinearity

selected_tags = ((".", "''"), # quote that ends a sentence (American English)
                 ("PRP", "VBD"), # personal pronoun + past tense verb
                 '``', # any quote start
                 (",", "''"), # quote that ends with a comma (American English)
                 ("''", "PRP", "VBD"), # end of a quote followed by a personal pronoun + past tense verb
                 (",", "ADP"), # comma followed by a preposition or subordinating conjunction
                 ("NOUN", "VBD"), # noun + past tense verb
                 "VBD", # past tense verb
                 ("VBD", "."), # past tense verb that ends sentence????
                 ("VBD", ","), # past tense verb followed by comma?????
                 ("VBD", ".", "``"), # past tense verb that ends sentence, followed by beginning of quote???
                 ("VBD", "VERB"), # past tense verb + another verb
                 ("''", "NOUN", "VBD"), # end of a quote followed by noun + past tense verb
                 (".", "``"), # quote starts at beginning of sentence
                 ("''", "PRP"), # end quote + personal pronoun
                 (",", "VBD") # past tense verb following a comma
                )

In [118]:
len(selected_tags)

16

# Add POS Tag/Punctuation Features In

In [110]:
# Create new features based on selected_grams
# Each feature is the count of the gram, normalized by the # of unigrams/bigrams/trigrams

for gram in selected_tags:
    
    if isinstance(gram, str): # Unigrams
        col = 'pos_tags'
    elif isinstance(gram, tuple) and len(gram)==2:
        col = 'tag_bigrams'
    elif isinstance(gram, tuple) and len(gram)==3:
        col = 'tag_trigrams'

    final_df_filtered[gram] = final_df_filtered[col].apply(lambda tags: tags.count(gram)/len(tags))

In [130]:
final_df_filtered[list(selected_tags)].head()

Unnamed: 0,"(., '')","(PRP, VBD)",``,"(,, '')","('', PRP, VBD)","(,, ADP)","(NOUN, VBD)",VBD,"(VBD, .)","(VBD, ,)","(VBD, ., ``)","(VBD, VERB)","('', NOUN, VBD)","(., ``)","('', PRP)","(,, VBD)"
0,0.0,0.0,0.0,0.0,0.0,0.013333,0.013333,0.013158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.013514,0.0,0.013333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.009346,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.013889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Features Dataframe for Modeling

In [120]:
final_df_filtered.head()

Unnamed: 0,index,source,Content_Preprocessed,Content_NoQuotes,readability_SMOG,Sentence_Tokens,Num_Sentences,avg_sent_len,sd_sent_len,norm_stop_freq,...,"(,, ADP)","(NOUN, VBD)",VBD,"(VBD, .)","(VBD, ,)","(VBD, ., ``)","(VBD, VERB)","('', NOUN, VBD)","(., ``)","('', PRP)"
0,0,economist,"DOWN the Euphrates river, halfway between Deir...","DOWN the Euphrates river, halfway between Deir...",11.2,"[DOWN the Euphrates river, halfway between Dei...",3,20.666667,11.841546,0.421875,...,0.013333,0.013333,0.013158,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,economist,Never have America and its allies had such a h...,Never have America and its allies had such a h...,10.7,[Never have America and its allies had such a ...,4,17.0,3.391165,0.5,...,0.013514,0.0,0.013333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,economist,"But like their Parthian forebears, Iran and it...","But like their Parthian forebears, Iran and it...",5.7,"[But like their Parthian forebears, Iran and i...",5,13.8,5.844656,0.422535,...,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,economist,Iran's gains are even more striking elsewhere....,Iran's gains are even more striking elsewhere....,11.6,[Iran's gains are even more striking elsewhere...,5,17.8,7.62627,0.422222,...,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,economist,"Farther south, America's hopes of stemming Ira...","Farther south, America's hopes of stemming Ira...",13.6,"[Farther south, America's hopes of stemming Ir...",3,18.666667,7.408704,0.362069,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
final_df_filtered.columns.values

array(['index', 'source', 'Content_Preprocessed', 'Content_NoQuotes',
       'readability_SMOG', 'Sentence_Tokens', 'Num_Sentences',
       'avg_sent_len', 'sd_sent_len', 'norm_stop_freq', 'norm_punct_freq',
       'norm_funct_freq', 'pos_tags', 'tag_bigrams', 'tag_trigrams',
       (',', 'VBD'), ('.', "''"), ('PRP', 'VBD'), '``', (',', "''"),
       ("''", 'PRP', 'VBD'), (',', 'ADP'), ('NOUN', 'VBD'), 'VBD',
       ('VBD', '.'), ('VBD', ','), ('VBD', '.', '``'), ('VBD', 'VERB'),
       ("''", 'NOUN', 'VBD'), ('.', '``'), ("''", 'PRP')], dtype=object)

In [140]:
og_features = ['source','readability_SMOG','avg_sent_len', 'Num_Sentences',
                                 'sd_sent_len','norm_stop_freq','norm_punct_freq','norm_funct_freq']

df_features = final_df_filtered[list(selected_tags)+og_features]

# NOTE: Descriptive statistics for SMOG show more variation than Flesch, so dropped Flesch

In [137]:
df_features.head()

Unnamed: 0,"(., '')","(PRP, VBD)",``,"(,, '')","('', PRP, VBD)","(,, ADP)","(NOUN, VBD)",VBD,"(VBD, .)","(VBD, ,)",...,"('', PRP)","(,, VBD)",source,readability_SMOG,avg_sent_len,Num_Sentences,sd_sent_len,norm_stop_freq,norm_punct_freq,norm_funct_freq
0,0.0,0.0,0.0,0.0,0.0,0.013333,0.013333,0.013158,0.0,0.0,...,0.0,0.0,economist,11.2,20.666667,3,11.841546,0.421875,0.234375,0.390625
1,0.0,0.0,0.0,0.0,0.0,0.013514,0.0,0.013333,0.0,0.0,...,0.0,0.0,economist,10.7,17.0,4,3.391165,0.5,0.132353,0.470588
2,0.0,0.0,0.0,0.0,0.0,0.012346,0.0,0.0,0.0,0.0,...,0.0,0.0,economist,5.7,13.8,5,5.844656,0.422535,0.197183,0.43662
3,0.0,0.009346,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,...,0.0,0.0,economist,11.6,17.8,5,7.62627,0.422222,0.255556,0.411111
4,0.0,0.0,0.013889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,economist,13.6,18.666667,3,7.408704,0.362069,0.327586,0.344828


In [141]:
# Pickle for modeling

with open('pickles/features_04_30.pkl', 'wb') as f:
    pickle.dump(df_features, f)

## Test SVM ... (move)

In [76]:
# X -> features, y -> label
X=df_features[['readability_f','readability_SMOG','avg_sent_len','sd_sent_len','norm_stop_freq','norm_punct_freq','norm_funct_freq']]
y=df_features.source

In [77]:
# dividing X, y into train and test data 
#We use stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.3, random_state = 0)

In [78]:
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
svm_predictions = svm_model_linear.predict(X_test)

In [79]:
accuracy = svm_model_linear.score(X_test, y_test)
print(accuracy)

0.49770107703


In [80]:
# creating a confusion matrix
cm = confusion_matrix(y_test, svm_predictions)
print(cm)

[[1475  258  760  802]
 [ 236 1437 1245  806]
 [ 517  436 3357  424]
 [ 946  696  849 1633]]


## Radial Kernel

In [None]:
svm_model_radial = SVC(kernel = 'rbf', C = 1).fit(X_train, y_train)
svm_predictions = svm_model_radial.predict(X_test)

In [None]:
accuracy = svm_model_radial.score(X_test, y_test)
print(accuracy)