In [1]:
import pandas as pd
import numpy as np

import json
import re
import os
import unicodedata
import itertools

# Processing Function

In [2]:
def process_corpus(path, replace_dict, min_length=10):
    """
    Removes source-specific artifacts from documents.
    
    :param path: filepath containing JSON files for each document
    :param replace_dict: dictionary containing regex matching strings and strings to replace them with
    :param min_length: minimum document length
    
    :returns: list of strings, each containing document content
    """
    
    docs = []
    #files = os.listdir(path)
    json_files = [pos_json for pos_json in os.listdir(path) if pos_json.endswith('.json')]

    for file in json_files:
        #file = '\\' + file
        content = json.load(open(path + file))['content']
        
        # replace regex strings
        for key, value in replace_dict.items():
            content = re.sub(key, value, content)
        
        # remove small documents
        if len(content) >= min_length:
            docs.append(content)
        
    return(docs)

# Economist

In [11]:
economist_path = 'data_updated/economist/'

In [12]:
economist_dict = {}

# artifacts on accented letters
economist_dict['AaAaAeA '] = 'i'
economist_dict['AaAaAeAo'] = 'c'
economist_dict['AaAaAeAc'] = 'a'
economist_dict['AaAaAeA~'] = 'n'
economist_dict['AaAaAeA@|AaAaAeA\?|AaAaAeA{|AaAaAe'] = 'e'



# numbers
economist_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'


# Go online artifacts
# end paragraph without punctuation (probably headers or titles)
economist_dict['<p>Go online ([^<]*)</p>|<p>([^<]*)([^.?!"]){1}</p>'] = ''

# end of paragraph tags
economist_dict['</p>'] = ''


In [13]:
economist_docs = process_corpus(economist_path, economist_dict)

In [14]:
print(len(economist_docs))

1104


In [15]:
economist_paragraphs = []
for doc in economist_docs:
    economist_paragraphs += doc.strip().split('<p>')
    
economist_paragraphs = [doc.strip() for doc in economist_paragraphs if len(doc) != 0]

In [16]:
len(economist_paragraphs)

11198

# Wired

In [17]:
wired_path = 'data_updated/wired/'

In [18]:
wired_dict = {}

# numbers
wired_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'

# end paragraph without punctuation (probably headers or titles)
# author/subject descriptions at end of article
# paragraph symbols
wired_dict['<p>([^<]*)([^.?!"]){1}</p>|<p>([^<]*)([A-Z]+) \(@(.*)|¶'] = ''

#email addresses
wired_dict['[\w\.-]+@[\w\.-]+']=''

# end of paragraph tags
wired_dict['</p>'] = ''

In [19]:
wired_docs = process_corpus(wired_path, wired_dict)

In [20]:
print(len(wired_docs))

1296


In [21]:
wired_paragraphs = []
for doc in wired_docs:
    wired_paragraphs += doc.strip().split('<p>')
    
wired_paragraphs = [doc.strip() for doc in wired_paragraphs if len(doc) != 0]

In [22]:
len(wired_paragraphs)

17142

# New Yorker

In [23]:
newyorker_path = 'data_updated/newyorker/'

In [24]:
newyorker_dict = {}

# artifacts on accented letters
newyorker_dict['AaAaAeA '] = 'i'
newyorker_dict['AaAaAeAo'] = 'c'
newyorker_dict['AaAaAeAc'] = 'a'
newyorker_dict['AaAaAeA~'] = 'n'
newyorker_dict['AaAaAeA@|AaAaAeA\?|AaAaAeA{|AaAaAe'] = 'e'

# numbers
newyorker_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'

# end paragraph without punctuation (probably headers or titles)
# bylines
newyorker_dict['<p>([^<]*)([^.?!"]){1}</p>|<p>Byline([^<]*)</p>'] = ''

# end of paragraph tags
newyorker_dict['</p>'] = ''

In [25]:
newyorker_docs = process_corpus(newyorker_path, newyorker_dict)

In [26]:
print(len(newyorker_docs))

807


In [27]:
newyorker_paragraphs = []
for doc in newyorker_docs:
    newyorker_paragraphs += doc.strip().split('<p>')
    
newyorker_paragraphs = [doc.strip() for doc in newyorker_paragraphs if len(doc) != 0]

In [28]:
len(newyorker_paragraphs)

19030

# EW

In [29]:
ew_path = 'data_updated/ew/'

In [30]:
ew_dict = {}

# artifacts on accented letters
ew_dict['AaAaAeA '] = 'i'
ew_dict['AaAaAeAo'] = 'c'
ew_dict['AaAaAeAc'] = 'a'
ew_dict['AaAaAeA~'] = 'n'
ew_dict['AaAaAeA@|AaAaAeA\?|AaAaAeA{|AaAaAe'] = 'e'

# numbers
ew_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'

#email addresses
ew_dict['[\w\.-]+@[\w\.-]+']=''

# Go online artifacts
# end paragraph without punctuation (probably headers or titles)
#ew_dict['<p>Go online ([^<]*)</p>|<p>([^<]*)([^.?!+"]){1}</p>'] = ''

#bullet points
ew_dict['\xc2\xb7']=''

# end of paragraph tags
ew_dict['</p>'] = ''

In [31]:
ew_docs = process_corpus(ew_path, ew_dict)

In [32]:
print(len(ew_docs))

2190


In [33]:
ew_paragraphs = []
for doc in ew_docs:
    ew_paragraphs += doc.strip().split('<p>')
    
ew_paragraphs = [doc.strip() for doc in ew_paragraphs if len(doc) != 0]

In [34]:
print(len(ew_paragraphs))

17536


# Write-out

In [35]:
paragraphs = economist_paragraphs + wired_paragraphs + newyorker_paragraphs+ew_paragraphs

In [36]:
sources = list(itertools.repeat('economist', len(economist_paragraphs)))
sources += list(itertools.repeat('wired', len(wired_paragraphs)))
sources += list(itertools.repeat('newyorker', len(newyorker_paragraphs)))
sources +=list(itertools.repeat('ew',len(ew_paragraphs)))

In [37]:
d = {'content':paragraphs, 'source':sources}
final_df = pd.DataFrame(data=d)

In [38]:
len(final_df)

64906

In [39]:
final_df.head()

Unnamed: 0,content,source
0,"DOWN the Euphrates river, halfway between Deir...",economist
1,Never have America and its allies had such a h...,economist
2,"But like their Parthian forebears, Iran and it...",economist
3,Iran's gains are even more striking elsewhere....,economist
4,"Farther south, America's hopes of stemming Ira...",economist


In [79]:
#final_df.to_csv('pre_pre_processed_data.csv')

# More Preprocessing

In [40]:
def removeHTMLTags(x):
    content = re.sub("(?i)<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>", '', x)
    #content1 = re.sub("\\b\\x94\\b", ' ', content)
    
    return(content)

In [41]:
final_df['Content_Preprocessed']=final_df['content'].apply(lambda x:removeHTMLTags(x))

In [42]:
final_df['Content_Preprocessed'].head()

0    DOWN the Euphrates river, halfway between Deir...
1    Never have America and its allies had such a h...
2    But like their Parthian forebears, Iran and it...
3    Iran's gains are even more striking elsewhere....
4    Farther south, America's hopes of stemming Ira...
Name: Content_Preprocessed, dtype: object

In [43]:
final_df1 = final_df[final_df['Content_Preprocessed'].apply(lambda x:len(x.split()))>20]

In [44]:
len(final_df1)

55136

# Sentence-Level Preprocessing Begins

In [143]:
from textstat.textstat import textstat

In [None]:
final_df1['readability_f'] = final_df1['Content_Preprocessed'].apply(textstat.flesch_reading_ease)

In [None]:
final_df1['readability_SMOG']=final_df1['Content_Preprocessed'].apply(textstat.smog_index)

In [215]:
final_df1['readability_f'].describe()

count    49339.000000
mean        62.872928
std         14.706272
min        -46.270000
25%         53.440000
50%         63.190000
75%         72.760000
max        116.350000
Name: readability_f, dtype: float64

In [217]:
#Need to explore this futher. Brought it down to 230 from over 2.5k
final_df1[(final_df1['readability_f']<0)|(final_df1['readability_f']>100) ]

Unnamed: 0,content,source,readability_f,Content_Preprocessed
2102,Vladimir frowns and thinks. And then he clicks...,economist,116.35,Vladimir frowns and thinks. And then he clicks...
6436,The exhibition proceeds broadly chronologicall...,economist,-46.27,The exhibition proceeds broadly chronologicall...
10091,"Westron wynde, when wilt thou blow, the small ...",economist,100.58,"Westron wynde, when wilt thou blow, the small ..."
11424,[1] A tiny suction hose pulls out and holds th...,wired,101.09,[1] A tiny suction hose pulls out and holds th...
11508,They were in luck--the attendant gassed up the...,wired,101.60,They were in luck--the attendant gassed up the...
11668,"[5] ""The water jug helps me track how much wat...",wired,101.60,"[5] ""The water jug helps me track how much wat..."
11819,Land quarries and riverbanks were once the mai...,wired,102.10,Land quarries and riverbanks were once the mai...
11878,DREAD PIRATE ROBERTS 3/NUM/NUM 8:NUM Don't wan...,wired,100.88,DREAD PIRATE ROBERTS 3/NUM/NUM 8:NUM Don't wan...
11998,Lucas: We had NUM shots to get through. They'd...,wired,104.44,Lucas: We had NUM shots to get through. They'd...
12044,Dippé: We made so much noise that they put us ...,wired,102.10,Dippé: We made so much noise that they put us ...


In [228]:
import nltk
def getMeanSentLen(para):
    #Count number of sentences
    sent_tok = nltk.sent_tokenize(para)
    ns = len(sent_tok)
    
    #Count number of words
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    
    ##Average Sentence length are words divided by sentences
    avg=nw/ns
    
    return avg

In [243]:
import nltk
def getSDSentLen(para):
    #Count number of sentences
    sent_tok = nltk.sent_tokenize(para)
    ns = len(sent_tok)
    
    #Count number of words
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    
    ##Average Sentence length are words divided by sentences
    avg=nw/ns
    #print(avg)
    sum1=0
    #Standard Deviation 
    for sent in sent_tok:
        sum1=sum1+(len(sent.split())-avg)**2
        #print(len(sent.split()))
        #print(sum1)
    sd=(sum1/ns)**0.5
    return sd

In [259]:
from nltk.corpus import stopwords
def normStopWordFrequency(para):
    stopwords1=set(stopwords.words('english'))
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    word_tok_stop=[i for i in word_tok if i.lower() in stopwords1]
    n_stop=len(word_tok_stop)
    return(n_stop/nw)
    
    

In [286]:
from nltk.corpus import stopwords
def normFunctWordFrequency(functional,para):
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    word_tok_funct=[i for i in word_tok if i.lower() in functional]
    n_funct=len(word_tok_funct)
    return(n_funct/nw)

In [271]:
from nltk.corpus import stopwords
import string
def normPunctFrequency(para):
    count = lambda l1, l2: len(list(filter(lambda c: c in l2, l1)))
                               
    no_punct = count(para, string.punctuation)
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    return(no_punct/nw)

In [256]:
normStopWordFrequency(final_df1.loc[0,'Content_Preprocessed'])

0.38461538461538464

In [241]:
getSDSentLen(final_df1.loc[0,'Content_Preprocessed'])

16.25
9
52.5625
26
147.625
19
155.1875
11
182.75


6.7592529172978875

In [None]:
#final_df1['avg_sent_len'],final_df1['sd_sent_len'] = final_df1['Content_Preprocessed'].apply(lambda x:getMeanSDSentLen(x))

final_df1['avg_sent_len'] = final_df1.apply(lambda row: getMeanSentLen(row['Content_Preprocessed']), axis=1)



In [None]:
final_df1['sd_sent_len'] = final_df1.apply(lambda row: getSDSentLen(row['Content_Preprocessed']), axis=1)

In [None]:
final_df1['norm_stop_freq'] = final_df1.apply(lambda row: normStopWordFrequency(row['Content_Preprocessed']), axis=1)


In [None]:
final_df1['norm_punct_freq'] = final_df1.apply(lambda row: normPunctFrequency(row['Content_Preprocessed']), axis=1)b

In [303]:
import csv
functional_file = open("functional.txt", "r")
words= [word.strip() for line in functional_file.readlines() for word in line.split(',') if word.strip()]
functional=list(set(words))
print((functional))

['who', 'via', 'whether', 'myself', 'that', 'either', 'your', 'I', 'of', 'be', 'about', 'consequently', 'been', 'without', 'by', 'besides', 'next', 'meanwhile', 'his', 'each', 'till', 'sure', 'itself', 'rather', 'he', 'will', 'ourselves', 'how', 'likewise', 'themselves', 'before', 'whatever', 'yeah', 'because', 'shall', 'to', 'every', 'him', 'during', 'our', 'quite', 'beyond', 'following', 'towards', 'instead', 'are', 'oh', 'nowhere', 'as', 'when', 'eight', 'many', 'against', 'whoever', 'than', 'my', 'opposite', 'me', 'whereas', 'excluding', 'one', 'them', 'last', 'five', 'neither', 'from', 'between', 'somewhere', 'whose', 'am', 'ok', 'through', 'okay', 'over', 'all', 'ought', 'ten', 'onto', 'thirty', 'toward', 'underneath', 'up', 'not', 'hence', 'other', 'yourself', 'regarding', 'except', 'little', 'unless', 'versus', 'was', 'whenever', 'any', 'could', 'those', 'lest', 'has', 'second', 'considering', 'its', 'were', 'course', 'then', 'inside', 'it', 'first', 'had', 'did', 'further', 'c

In [None]:
final_df1['norm_funct_freq'] = final_df1.apply(lambda row: normFunctWordFrequency(functional,row['Content_Preprocessed']), axis=1)

In [291]:
final_df1.head()

Unnamed: 0,content,source,readability_f,Content_Preprocessed,avg_sent_len,sd_sent_len,norm_stop_freq,norm_punct_freq,readability_SMOG,norm_funct_freq
0,"THE first bicycles were freed on July NUMth, N...",economist,71.85,"THE first bicycles were freed on July NUMth, N...",16.25,6.759253,0.446154,0.184615,10.1,0.507692
1,Roel van Duijn and Luud Schimmelpennink starte...,economist,83.25,Roel van Duijn and Luud Schimmelpennink starte...,13.428571,7.326218,0.457447,0.223404,9.6,0.457447
2,"A few days later, at a street meeting where Mr...",economist,83.46,"A few days later, at a street meeting where Mr...",13.2,8.885944,0.424242,0.136364,7.6,0.393939
3,Half a century later the streets of Beijing ar...,economist,79.3,Half a century later the streets of Beijing ar...,17.666667,6.952218,0.509434,0.207547,6.4,0.509434
4,"The yellow bikes are from Ofo, so named becaus...",economist,72.16,"The yellow bikes are from Ofo, so named becaus...",16.166667,3.11359,0.371134,0.175258,9.3,0.402062


In [293]:
df_features= final_df1[['source','readability_f','readability_SMOG','avg_sent_len','sd_sent_len','norm_stop_freq','norm_punct_freq','norm_funct_freq']]

In [298]:
df_features.head()

Unnamed: 0,source,readability_f,readability_SMOG,avg_sent_len,sd_sent_len,norm_stop_freq,norm_punct_freq,norm_funct_freq
0,economist,71.85,10.1,16.25,6.759253,0.446154,0.184615,0.507692
1,economist,83.25,9.6,13.428571,7.326218,0.457447,0.223404,0.457447
2,economist,83.46,7.6,13.2,8.885944,0.424242,0.136364,0.393939
3,economist,79.3,6.4,17.666667,6.952218,0.509434,0.207547,0.509434
4,economist,72.16,9.3,16.166667,3.11359,0.371134,0.175258,0.402062


In [294]:
# X -> features, y -> label
X=df_features[['readability_f','readability_SMOG','avg_sent_len','sd_sent_len','norm_stop_freq','norm_punct_freq','norm_funct_freq']]
y=df_features.source

In [296]:
# dividing X, y into train and test data 
#We use stratified sampling
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.3, random_state = 0)

In [297]:
from sklearn.svm import SVC
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
svm_predictions = svm_model_linear.predict(X_test)

In [301]:
from sklearn.metrics import confusion_matrix
accuracy = svm_model_linear.score(X_test, y_test)
print(accuracy)

0.525672206459


In [302]:
# creating a confusion matrix
cm = confusion_matrix(y_test, svm_predictions)
print(cm)

[[1991  352  841  117]
 [ 423 1846 1548  191]
 [ 709  605 3650   60]
 [ 833  671  671  294]]
