In [1]:
import pandas as pd
import numpy as np

import json
import re
import os
import unicodedata
import itertools
import string
from nltk.corpus import stopwords
import nltk
import csv
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


In [156]:
import pickle

# Processing Function

In [2]:
def process_corpus(path, replace_dict, min_length=10):
    """
    Removes source-specific artifacts from documents.
    
    :param path: filepath containing JSON files for each document
    :param replace_dict: dictionary containing regex matching strings and strings to replace them with
    :param min_length: minimum document length
    
    :returns: list of strings, each containing document content
    """
    
    docs = []
    #files = os.listdir(path)
    json_files = [pos_json for pos_json in os.listdir(path) if pos_json.endswith('.json')]

    for file in json_files:
        #file = '\\' + file
        content = json.load(open(path + file))['content']
        
        # replace regex strings
        for key, value in replace_dict.items():
            content = re.sub(key, value, content)
        
        # remove small documents
        if len(content) >= min_length:
            docs.append(content)
        
    return(docs)

# Economist

In [3]:
economist_path = 'data_updated/economist/'

In [4]:
economist_dict = {}

# artifacts on accented letters
economist_dict['AaAaAeA '] = 'i'
economist_dict['AaAaAeAo'] = 'c'
economist_dict['AaAaAeAc'] = 'a'
economist_dict['AaAaAeA~'] = 'n'
economist_dict['AaAaAeA@|AaAaAeA\?|AaAaAeA{|AaAaAe'] = 'e'



# numbers
economist_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'


# Go online artifacts
# end paragraph without punctuation (probably headers or titles)
economist_dict['<p>Go online ([^<]*)</p>|<p>([^<]*)([^.?!"]){1}</p>'] = ''

# end of paragraph tags
economist_dict['</p>'] = ''


In [5]:
economist_docs = process_corpus(economist_path, economist_dict)

In [6]:
print(len(economist_docs))

1104


In [7]:
economist_paragraphs = []
for doc in economist_docs:
    economist_paragraphs += doc.strip().split('<p>')
    
economist_paragraphs = [doc.strip() for doc in economist_paragraphs if len(doc) != 0]

In [8]:
len(economist_paragraphs)

11198

# Wired

In [9]:
wired_path = 'data_updated/wired/'

In [10]:
wired_dict = {}

# numbers
wired_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'

# end paragraph without punctuation (probably headers or titles)
# author/subject descriptions at end of article
# paragraph symbols
wired_dict['<p>([^<]*)([^.?!"]){1}</p>|<p>([^<]*)([A-Z]+) \(@(.*)|¶'] = ''

#email addresses
wired_dict['[\w\.-]+@[\w\.-]+']=''

# end of paragraph tags
wired_dict['</p>'] = ''

In [11]:
wired_docs = process_corpus(wired_path, wired_dict)

In [12]:
print(len(wired_docs))

1296


In [13]:
wired_paragraphs = []

for doc in wired_docs:
    wired_paragraphs += doc.strip().split('<p>')
    
wired_paragraphs = [doc.strip() for doc in wired_paragraphs if len(doc) != 0]

In [14]:
len(wired_paragraphs)

17142

# New Yorker

In [15]:
newyorker_path = 'data_updated/newyorker/'

In [16]:
newyorker_dict = {}

# artifacts on accented letters
newyorker_dict['AaAaAeA '] = 'i'
newyorker_dict['AaAaAeAo'] = 'c'
newyorker_dict['AaAaAeAc'] = 'a'
newyorker_dict['AaAaAeA~'] = 'n'
newyorker_dict['AaAaAeA@|AaAaAeA\?|AaAaAeA{|AaAaAe'] = 'e'

# numbers
newyorker_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'

# end paragraph without punctuation (probably headers or titles)
# bylines
newyorker_dict['<p>([^<]*)([^.?!"]){1}</p>|<p>Byline([^<]*)</p>'] = ''

# end of paragraph tags
newyorker_dict['</p>'] = ''

In [17]:
newyorker_docs = process_corpus(newyorker_path, newyorker_dict)

In [18]:
print(len(newyorker_docs))

807


In [19]:
newyorker_paragraphs = []
for doc in newyorker_docs:
    newyorker_paragraphs += doc.strip().split('<p>')
    
newyorker_paragraphs = [doc.strip() for doc in newyorker_paragraphs if len(doc) != 0]

In [20]:
len(newyorker_paragraphs)

19030

# EW

In [21]:
ew_path = 'data_updated/ew/'

In [22]:
ew_dict = {}

# artifacts on accented letters
ew_dict['AaAaAeA '] = 'i'
ew_dict['AaAaAeAo'] = 'c'
ew_dict['AaAaAeAc'] = 'a'
ew_dict['AaAaAeA~'] = 'n'
ew_dict['AaAaAeA@|AaAaAeA\?|AaAaAeA{|AaAaAe'] = 'e'

# numbers
ew_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'

#email addresses
ew_dict['[\w\.-]+@[\w\.-]+']=''

# Go online artifacts
# end paragraph without punctuation (probably headers or titles)
#ew_dict['<p>Go online ([^<]*)</p>|<p>([^<]*)([^.?!+"]){1}</p>'] = ''

#bullet points
ew_dict['\xc2\xb7']=''

# end of paragraph tags
ew_dict['</p>'] = ''

In [23]:
ew_docs = process_corpus(ew_path, ew_dict)

In [24]:
print(len(ew_docs))

2190


In [25]:
ew_paragraphs = []

for doc in ew_docs:
    ew_paragraphs += doc.strip().split('<p>')
    
ew_paragraphs = [doc.strip() for doc in ew_paragraphs if len(doc) != 0]

In [26]:
def remove_ratings(graf):
    
    """Removes ratings such as "B+", "A", "C-" from the end of a paragraph."""
    
    # Check if the last character in the last word is a sentence-ending character ('.', '!' and so on)
    if not any(x == graf.split()[-1][-1] for x in ['.', '!', '?', '\"', ')', '”', '…']):
        
        # If not, return paragraph with the last word removed
        return graf[0:-len(graf.split()[-1])].strip()
    
    else:
        return graf

In [27]:
ew_paragraphs = [remove_ratings(p) for p in ew_paragraphs]

In [28]:
print(len(ew_paragraphs))

17536


# Make Dataframe

In [29]:
paragraphs = economist_paragraphs + wired_paragraphs + newyorker_paragraphs+ew_paragraphs

In [30]:
sources = list(itertools.repeat('economist', len(economist_paragraphs)))
sources += list(itertools.repeat('wired', len(wired_paragraphs)))
sources += list(itertools.repeat('newyorker', len(newyorker_paragraphs)))
sources +=list(itertools.repeat('ew',len(ew_paragraphs)))

In [31]:
d = {'content':paragraphs, 'source':sources}
final_df = pd.DataFrame(data=d)

In [32]:
len(final_df)

64906

In [33]:
final_df.head()

Unnamed: 0,content,source
0,"DOWN the Euphrates river, halfway between Deir...",economist
1,Never have America and its allies had such a h...,economist
2,"But like their Parthian forebears, Iran and it...",economist
3,Iran's gains are even more striking elsewhere....,economist
4,"Farther south, America's hopes of stemming Ira...",economist


In [34]:
#final_df.to_csv('pre_pre_processed_data.csv')

# More Preprocessing

In [35]:
def removeHTMLTags(x):
    content = re.sub("(?i)<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>", '', x)
    #content1 = re.sub("\\b\\x94\\b", ' ', content)
    
    return(content)

In [36]:
final_df['Content_Preprocessed']=final_df['content'].apply(lambda x:removeHTMLTags(x))

In [37]:
final_df['Content_Preprocessed'].head()

0    DOWN the Euphrates river, halfway between Deir...
1    Never have America and its allies had such a h...
2    But like their Parthian forebears, Iran and it...
3    Iran's gains are even more striking elsewhere....
4    Farther south, America's hopes of stemming Ira...
Name: Content_Preprocessed, dtype: object

In [38]:
# Create a version of the content where all text inside quotes is replacde by 'Qx'

def replace_quotes(X):
    
    """
    Replaces all text inside quotes with 'Qx'.
    """
    
    X = re.sub(r'\"([^\"]+?)[^!?\'\.,]\" ', "\'Qx\' ", X) # "text"+space
    
    X = re.sub(r'\"([^\"]+?)\"\.', "\'Qx\'.", X) # "text". -- applies to economist (british)
    X = re.sub(r'\"([^\"]+?)\",', "\'Qx\',", X) # "text", -- applies to economist (british)
    
    X = re.sub(r'\"([^\"]+?),\"', "\'Qx,\'", X) # "text,"   
    X = re.sub(r'\"([^\"]+?)[!?\'\.]\"', "\'Qx.\'", X) # "text!" or "text?" or "text." 

    X = re.sub(r'\x93(.+?),\x94', "\'Qx,\'", X) # "text," -- applies to wired (weird chars)
    X = re.sub(r'\x93(.+?)[! ?\'\.]\x94', "\'Qx.\'", X) # "text!" or "text " or "text?" or "text." -- wired
    X = re.sub(r'\x93(.+?)\x94', "\'Qx\'", X) # "text" -- applies to wired (weird chars)
    
    return X

replaced_text = [replace_quotes(s) for s in final_df['Content_Preprocessed'].values]

final_df['Content_NoQuotes'] = replaced_text

In [39]:
# Filter out paragraphs with 20 non-quote words or less

final_df_filtered = final_df[final_df['Content_NoQuotes'].apply(lambda x: len([w for w in x.split() if 'Qx' not in w])) > 20]


In [40]:
filter_bool = final_df['Content_NoQuotes'].apply(lambda x: len([w for w in x.split() if 'Qx' not in w])) > 20

In [41]:
final_df_filtered = final_df[['source', 'Content_Preprocessed', 'Content_NoQuotes']][filter_bool.values]

In [42]:
final_df_filtered.shape

(52922, 3)

In [43]:
# Check class balance

final_df_filtered.groupby('source').count()

Unnamed: 0_level_0,Content_Preprocessed,Content_NoQuotes
source,Unnamed: 1_level_1,Unnamed: 2_level_1
economist,10982,10982
ew,12414,12414
newyorker,15780,15780
wired,13746,13746


# Feature Creation

In [44]:
from textstat.textstat import textstat

In [45]:
# Readability scores
# final_df_filtered['readability_f'] = final_df_filtered['Content_Preprocessed'].apply(textstat.flesch_reading_ease)
final_df_filtered['readability_SMOG']=final_df_filtered['Content_Preprocessed'].apply(textstat.smog_index)

In [48]:
# Tokenize by sentences
final_df_filtered['Sentence_Tokens'] = final_df_filtered['Content_Preprocessed'].apply(nltk.sent_tokenize)

# Number of sentences = length of sentence tokens list
final_df_filtered['Num_Sentences'] = final_df_filtered['Sentence_Tokens'].apply(len)

# Average sentence length = number of words/number of sentences
word_counts = final_df_filtered['Content_Preprocessed'].apply(lambda x: len(x.split()))
final_df_filtered['avg_sent_len'] = word_counts/final_df_filtered['Num_Sentences']

In [63]:
final_df_filtered.reset_index(inplace = True)

In [104]:
final_df_filtered.sort_values(by = 'Num_Sentences', ascending=False).head()

Unnamed: 0,index,source,Content_Preprocessed,Content_NoQuotes,readability_f,readability_SMOG,Sentence_Tokens,Num_Sentences,avg_sent_len
19466,21553,wired,"WIRED: Peter Thiel, expressing his dissatisfac...","WIRED: Peter Thiel, expressing his dissatisfac...",65.93,10.6,"[WIRED: Peter Thiel, expressing his dissatisfa...",38,13.421053
18804,20796,wired,"WIRED: You founded Metafilter, a communal webl...","WIRED: You founded Metafilter, a communal webl...",74.39,9.0,"[WIRED: You founded Metafilter, a communal web...",37,13.027027
38487,44973,newyorker,In a conversation that took place in a buildin...,In a conversation that took place in a buildin...,35.64,10.7,[In a conversation that took place in a buildi...,36,3.416667
40890,47907,ew,YOU KNOW IT'S BEEN A GOOD PARTY WHEN JOSS WHED...,YOU KNOW IT'S BEEN A GOOD PARTY WHEN JOSS WHED...,69.31,10.6,[YOU KNOW IT'S BEEN A GOOD PARTY WHEN JOSS WHE...,33,16.515152
15423,16615,wired,GEORGE CARLIN. Lenny Bruce. That dog puppet. W...,GEORGE CARLIN. Lenny Bruce. That dog puppet. W...,65.93,10.3,"[GEORGE CARLIN., Lenny Bruce., That dog puppet...",31,12.451613


In [97]:
final_df_filtered.Num_Sentences.describe()

count    52922.000000
mean         4.949851
std          2.728297
min          1.000000
25%          3.000000
50%          5.000000
75%          6.000000
max         38.000000
Name: Num_Sentences, dtype: float64

In [98]:
final_df_filtered.avg_sent_len.describe()

count    52922.000000
mean        20.051542
std          7.261307
min          3.000000
25%         15.285714
50%         19.000000
75%         23.500000
max        180.000000
Name: avg_sent_len, dtype: float64

In [131]:
# Get standard deviation sentence length for each paragraph

def getSDSentLen(p):
    sent_tokens = p[0]
    avg_sent_len = p[1]
    
    # Get length of each sentence as np array
    lengths = np.array([len(s.split()) for s in sent_tokens])
    
    # Calculate SD
    return (np.sqrt(np.sum((lengths - avg_sent_len)**2)/len(sent_tokens)))

final_df_filtered['sd_sent_len'] = final_df_filtered[['Sentence_Tokens',
                                                      'avg_sent_len']].apply(lambda x: getSDSentLen(x), axis=1)

In [135]:
# If have time: These functions could be rewritten so they run faster...

def normStopWordFrequency(para):
    stopwords1=set(stopwords.words('english'))
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``", 'Qx']
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    word_tok_stop=[i for i in word_tok if i.lower() in stopwords1]
    n_stop=len(word_tok_stop)
    return(n_stop/nw)

def normFunctWordFrequency(functional,para):
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``", 'Qx']
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    word_tok_funct=[i for i in word_tok if i.lower() in functional]
    n_funct=len(word_tok_funct)
    return(n_funct/nw)

def normPunctFrequency(para):
    count = lambda l1, l2: len(list(filter(lambda c: c in l2, l1)))
                               
    no_punct = count(para, string.punctuation)
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``", 'Qx']
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    return(no_punct/nw)

In [136]:
final_df_filtered['norm_stop_freq'] = final_df_filtered.apply(lambda row: normStopWordFrequency(row['Content_NoQuotes']), axis=1)


In [137]:
final_df_filtered['norm_punct_freq'] = final_df_filtered.apply(lambda row: normPunctFrequency(row['Content_NoQuotes']), axis=1)

In [138]:
# Get functional words

functional_file = open("functional.txt", "r")
f_words = [word.strip() for line in functional_file.readlines() for word in line.split(',') if word.strip()]
functional_file.close()

functional = list(set(f_words))

In [139]:
final_df_filtered['norm_funct_freq'] = final_df_filtered.apply(lambda row: normFunctWordFrequency(functional,
                                                                                                  row['Content_NoQuotes']),axis=1)

# Features Dataframe for Modeling

In [140]:
final_df_filtered.head()

Unnamed: 0,index,source,Content_Preprocessed,Content_NoQuotes,readability_f,readability_SMOG,Sentence_Tokens,Num_Sentences,avg_sent_len,sd_sent_len,norm_stop_freq,norm_punct_freq,norm_funct_freq
0,0,economist,"DOWN the Euphrates river, halfway between Deir...","DOWN the Euphrates river, halfway between Deir...",50.46,11.2,"[DOWN the Euphrates river, halfway between Dei...",3,20.666667,11.841546,0.421875,0.234375,0.390625
1,1,economist,Never have America and its allies had such a h...,Never have America and its allies had such a h...,62.68,10.7,[Never have America and its allies had such a ...,4,17.0,3.391165,0.5,0.132353,0.470588
2,2,economist,"But like their Parthian forebears, Iran and it...","But like their Parthian forebears, Iran and it...",91.31,5.7,"[But like their Parthian forebears, Iran and i...",5,13.8,5.844656,0.422535,0.197183,0.43662
3,3,economist,Iran's gains are even more striking elsewhere....,Iran's gains are even more striking elsewhere....,61.87,11.6,[Iran's gains are even more striking elsewhere...,5,17.8,7.62627,0.422222,0.255556,0.411111
4,4,economist,"Farther south, America's hopes of stemming Ira...","Farther south, America's hopes of stemming Ira...",52.49,13.6,"[Farther south, America's hopes of stemming Ir...",3,18.666667,7.408704,0.35,0.316667,0.333333


In [154]:
df_features = final_df_filtered[['source','readability_SMOG','avg_sent_len', 'Num_Sentences',
                                 'sd_sent_len','norm_stop_freq','norm_punct_freq','norm_funct_freq']]

# NOTE: Descriptive statistics for SMOG show more variation than Flesch, so dropped Flesch

In [155]:
df_features.head()

Unnamed: 0,source,readability_SMOG,avg_sent_len,Num_Sentences,sd_sent_len,norm_stop_freq,norm_punct_freq,norm_funct_freq
0,economist,11.2,20.666667,3,11.841546,0.421875,0.234375,0.390625
1,economist,10.7,17.0,4,3.391165,0.5,0.132353,0.470588
2,economist,5.7,13.8,5,5.844656,0.422535,0.197183,0.43662
3,economist,11.6,17.8,5,7.62627,0.422222,0.255556,0.411111
4,economist,13.6,18.666667,3,7.408704,0.35,0.316667,0.333333


In [158]:
# Pickle for modeling

with open('pickles/features_04_29.pkl', 'wb') as f:
    pickle.dump(df_features, f)

## Test SVM ... (move)

In [76]:
# X -> features, y -> label
X=df_features[['readability_f','readability_SMOG','avg_sent_len','sd_sent_len','norm_stop_freq','norm_punct_freq','norm_funct_freq']]
y=df_features.source

In [77]:
# dividing X, y into train and test data 
#We use stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.3, random_state = 0)

In [78]:
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
svm_predictions = svm_model_linear.predict(X_test)

In [79]:
accuracy = svm_model_linear.score(X_test, y_test)
print(accuracy)

0.49770107703


In [80]:
# creating a confusion matrix
cm = confusion_matrix(y_test, svm_predictions)
print(cm)

[[1475  258  760  802]
 [ 236 1437 1245  806]
 [ 517  436 3357  424]
 [ 946  696  849 1633]]


## Radial Kernel

In [None]:
svm_model_radial = SVC(kernel = 'rbf', C = 1).fit(X_train, y_train)
svm_predictions = svm_model_radial.predict(X_test)

In [None]:
accuracy = svm_model_radial.score(X_test, y_test)
print(accuracy)