In [1]:
import pandas as pd
import numpy as np

import json
import re
import os
import unicodedata
import itertools
import string
from nltk.corpus import stopwords
import nltk
import csv
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


# Processing Function

In [2]:
def process_corpus(path, replace_dict, min_length=10):
    """
    Removes source-specific artifacts from documents.
    
    :param path: filepath containing JSON files for each document
    :param replace_dict: dictionary containing regex matching strings and strings to replace them with
    :param min_length: minimum document length
    
    :returns: list of strings, each containing document content
    """
    
    docs = []
    #files = os.listdir(path)
    json_files = [pos_json for pos_json in os.listdir(path) if pos_json.endswith('.json')]

    for file in json_files:
        #file = '\\' + file
        content = json.load(open(path + file))['content']
        
        # replace regex strings
        for key, value in replace_dict.items():
            content = re.sub(key, value, content)
        
        # remove small documents
        if len(content) >= min_length:
            docs.append(content)
        
    return(docs)

# Economist

In [3]:
economist_path = 'data_updated/economist/'

In [4]:
economist_dict = {}

# artifacts on accented letters
economist_dict['AaAaAeA '] = 'i'
economist_dict['AaAaAeAo'] = 'c'
economist_dict['AaAaAeAc'] = 'a'
economist_dict['AaAaAeA~'] = 'n'
economist_dict['AaAaAeA@|AaAaAeA\?|AaAaAeA{|AaAaAe'] = 'e'



# numbers
economist_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'


# Go online artifacts
# end paragraph without punctuation (probably headers or titles)
economist_dict['<p>Go online ([^<]*)</p>|<p>([^<]*)([^.?!"]){1}</p>'] = ''

# end of paragraph tags
economist_dict['</p>'] = ''


In [5]:
economist_docs = process_corpus(economist_path, economist_dict)

In [6]:
print(len(economist_docs))

1104


In [7]:
economist_paragraphs = []
for doc in economist_docs:
    economist_paragraphs += doc.strip().split('<p>')
    
economist_paragraphs = [doc.strip() for doc in economist_paragraphs if len(doc) != 0]

In [8]:
len(economist_paragraphs)

11198

# Wired

In [9]:
wired_path = 'data_updated/wired/'

In [10]:
wired_dict = {}

# numbers
wired_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'

# end paragraph without punctuation (probably headers or titles)
# author/subject descriptions at end of article
# paragraph symbols
wired_dict['<p>([^<]*)([^.?!"]){1}</p>|<p>([^<]*)([A-Z]+) \(@(.*)|¶'] = ''

#email addresses
wired_dict['[\w\.-]+@[\w\.-]+']=''

# end of paragraph tags
wired_dict['</p>'] = ''

In [11]:
wired_docs = process_corpus(wired_path, wired_dict)

In [12]:
print(len(wired_docs))

1296


In [13]:
wired_paragraphs = []

for doc in wired_docs:
    wired_paragraphs += doc.strip().split('<p>')
    
wired_paragraphs = [doc.strip() for doc in wired_paragraphs if len(doc) != 0]

In [14]:
len(wired_paragraphs)

17142

# New Yorker

In [15]:
newyorker_path = 'data_updated/newyorker/'

In [16]:
newyorker_dict = {}

# artifacts on accented letters
newyorker_dict['AaAaAeA '] = 'i'
newyorker_dict['AaAaAeAo'] = 'c'
newyorker_dict['AaAaAeAc'] = 'a'
newyorker_dict['AaAaAeA~'] = 'n'
newyorker_dict['AaAaAeA@|AaAaAeA\?|AaAaAeA{|AaAaAe'] = 'e'

# numbers
newyorker_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'

# end paragraph without punctuation (probably headers or titles)
# bylines
newyorker_dict['<p>([^<]*)([^.?!"]){1}</p>|<p>Byline([^<]*)</p>'] = ''

# end of paragraph tags
newyorker_dict['</p>'] = ''

In [17]:
newyorker_docs = process_corpus(newyorker_path, newyorker_dict)

In [18]:
print(len(newyorker_docs))

807


In [19]:
newyorker_paragraphs = []
for doc in newyorker_docs:
    newyorker_paragraphs += doc.strip().split('<p>')
    
newyorker_paragraphs = [doc.strip() for doc in newyorker_paragraphs if len(doc) != 0]

In [20]:
len(newyorker_paragraphs)

19030

# EW

In [21]:
ew_path = 'data_updated/ew/'

In [22]:
ew_dict = {}

# artifacts on accented letters
ew_dict['AaAaAeA '] = 'i'
ew_dict['AaAaAeAo'] = 'c'
ew_dict['AaAaAeAc'] = 'a'
ew_dict['AaAaAeA~'] = 'n'
ew_dict['AaAaAeA@|AaAaAeA\?|AaAaAeA{|AaAaAe'] = 'e'

# numbers
ew_dict['([\d]+)([.,]?)([\d]+)'] = 'NUM'

#email addresses
ew_dict['[\w\.-]+@[\w\.-]+']=''

# Go online artifacts
# end paragraph without punctuation (probably headers or titles)
#ew_dict['<p>Go online ([^<]*)</p>|<p>([^<]*)([^.?!+"]){1}</p>'] = ''

#bullet points
ew_dict['\xc2\xb7']=''

# end of paragraph tags
ew_dict['</p>'] = ''

In [23]:
ew_docs = process_corpus(ew_path, ew_dict)

In [24]:
print(len(ew_docs))

2190


In [25]:
ew_paragraphs = []

for doc in ew_docs:
    ew_paragraphs += doc.strip().split('<p>')
    
ew_paragraphs = [doc.strip() for doc in ew_paragraphs if len(doc) != 0]

In [26]:
def remove_ratings(graf):
    
    """Removes ratings such as "B+", "A", "C-" from the end of a paragraph."""
    
    # Check if the last character in the last word is a sentence-ending character ('.', '!' and so on)
    if not any(x == graf.split()[-1][-1] for x in ['.', '!', '?', '\"', ')', '”', '…']):
        
        # If not, return paragraph with the last word removed
        return graf[0:-len(graf.split()[-1])].strip()
    
    else:
        return graf

In [27]:
ew_paragraphs = [remove_ratings(p) for p in ew_paragraphs]

In [28]:
print(len(ew_paragraphs))

17536


# Make Dataframe

In [29]:
paragraphs = economist_paragraphs + wired_paragraphs + newyorker_paragraphs+ew_paragraphs

In [30]:
sources = list(itertools.repeat('economist', len(economist_paragraphs)))
sources += list(itertools.repeat('wired', len(wired_paragraphs)))
sources += list(itertools.repeat('newyorker', len(newyorker_paragraphs)))
sources +=list(itertools.repeat('ew',len(ew_paragraphs)))

In [31]:
d = {'content':paragraphs, 'source':sources}
final_df = pd.DataFrame(data=d)

In [32]:
len(final_df)

64906

In [33]:
final_df.head()

Unnamed: 0,content,source
0,"DOWN the Euphrates river, halfway between Deir...",economist
1,Never have America and its allies had such a h...,economist
2,"But like their Parthian forebears, Iran and it...",economist
3,Iran's gains are even more striking elsewhere....,economist
4,"Farther south, America's hopes of stemming Ira...",economist


In [34]:
#final_df.to_csv('pre_pre_processed_data.csv')

# More Preprocessing

In [35]:
def removeHTMLTags(x):
    content = re.sub("(?i)<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>", '', x)
    #content1 = re.sub("\\b\\x94\\b", ' ', content)
    
    return(content)

In [36]:
final_df['Content_Preprocessed']=final_df['content'].apply(lambda x:removeHTMLTags(x))

In [37]:
final_df['Content_Preprocessed'].head()

0    DOWN the Euphrates river, halfway between Deir...
1    Never have America and its allies had such a h...
2    But like their Parthian forebears, Iran and it...
3    Iran's gains are even more striking elsewhere....
4    Farther south, America's hopes of stemming Ira...
Name: Content_Preprocessed, dtype: object

In [38]:
# Filter out paragraphs with 20 words or less

final_df_filtered = final_df[final_df['Content_Preprocessed'].apply(lambda x:len(x.split()))>20]

In [39]:
len(final_df_filtered)

55136

# Sentence-Level Preprocessing Begins

In [40]:
from textstat.textstat import textstat

In [41]:
final_df_filtered['readability_f'] = final_df_filtered['Content_Preprocessed'].apply(textstat.flesch_reading_ease)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [42]:
final_df_filtered['readability_SMOG']=final_df_filtered['Content_Preprocessed'].apply(textstat.smog_index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
final_df_filtered['readability_f'].describe()

count    55136.000000
mean        62.799754
std         14.746275
min        -68.600000
25%         53.380000
50%         63.190000
75%         72.760000
max        116.350000
Name: readability_f, dtype: float64

In [44]:
#Need to explore this futher. Brought it down to 230 from over 2.5k
final_df_filtered[(final_df_filtered['readability_f']<0)|(final_df_filtered['readability_f']>100) ]

Unnamed: 0,content,source,Content_Preprocessed,readability_f,readability_SMOG
427,The exhibition proceeds broadly chronologicall...,economist,The exhibition proceeds broadly chronologicall...,-46.27,0.0
7844,"Westron wynde, when wilt thou blow, the small ...",economist,"Westron wynde, when wilt thou blow, the small ...",100.58,0.0
8479,Vladimir frowns and thinks. And then he clicks...,economist,Vladimir frowns and thinks. And then he clicks...,116.35,3.1
11432,The raw components of the sneakers being produ...,wired,The raw components of the sneakers being produ...,-7.03,0.0
11447,#whatjayzsaidtosolange; coming within inches o...,wired,#whatjayzsaidtosolange; coming within inches o...,-4.32,0.0
11655,"THE PROOF Hawaii and Stephen F. Austin, two te...",wired,"THE PROOF Hawaii and Stephen F. Austin, two te...",102.61,0.0
12279,Which reminds me. I've got a great idea for a ...,wired,Which reminds me. I've got a great idea for a ...,111.07,0.0
12679,"""Your first place is home. Your second is work...",wired,"""Your first place is home. Your second is work...",106.37,3.1
12953,Do not read this in a house. Do not read this ...,wired,Do not read this in a house. Do not read this ...,115.13,3.1
13598,But he felt that he had no choice. Two months ...,wired,But he felt that he had no choice. Two months ...,101.09,0.0


In [45]:
def getMeanSentLen(para):
    #Count number of sentences
    sent_tok = nltk.sent_tokenize(para)
    ns = len(sent_tok)
    
    #Count number of words
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    
    ##Average Sentence length are words divided by sentences
    avg=nw/ns
    
    return avg

In [46]:
def getSDSentLen(para):
    #Count number of sentences
    sent_tok = nltk.sent_tokenize(para)
    ns = len(sent_tok)
    
    #Count number of words
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    
    ##Average Sentence length are words divided by sentences
    avg=nw/ns
    #print(avg)
    sum1=0
    #Standard Deviation 
    for sent in sent_tok:
        sum1=sum1+(len(sent.split())-avg)**2
        #print(len(sent.split()))
        #print(sum1)
    sd=(sum1/ns)**0.5
    return sd

In [47]:
def normStopWordFrequency(para):
    stopwords1=set(stopwords.words('english'))
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    word_tok_stop=[i for i in word_tok if i.lower() in stopwords1]
    n_stop=len(word_tok_stop)
    return(n_stop/nw)
    
    

In [48]:
def normFunctWordFrequency(functional,para):
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    word_tok_funct=[i for i in word_tok if i.lower() in functional]
    n_funct=len(word_tok_funct)
    return(n_funct/nw)

In [49]:
def normPunctFrequency(para):
    count = lambda l1, l2: len(list(filter(lambda c: c in l2, l1)))
                               
    no_punct = count(para, string.punctuation)
    word_tok = nltk.word_tokenize(para) #need to take out commas plus other stuff
    NoWord = [',','(',')',':',';','.','%','\x96','\x94','{','}','[',']','!','?',"''","``"]
    word_tok2 = [i for i in word_tok if i not in NoWord]
    nw = len(word_tok2)
    return(no_punct/nw)

In [50]:
normStopWordFrequency(final_df_filtered.loc[0,'Content_Preprocessed'])

0.421875

In [51]:
getSDSentLen(final_df_filtered.loc[0,'Content_Preprocessed'])

11.86029791643813

In [52]:
#final_df1['avg_sent_len'],final_df1['sd_sent_len'] = final_df1['Content_Preprocessed'].apply(lambda x:getMeanSDSentLen(x))

final_df_filtered['avg_sent_len'] = final_df_filtered.apply(lambda row: getMeanSentLen(row['Content_Preprocessed']), axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [53]:
final_df_filtered['sd_sent_len'] = final_df_filtered.apply(lambda row: getSDSentLen(row['Content_Preprocessed']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [54]:
final_df_filtered['norm_stop_freq'] = final_df_filtered.apply(lambda row: normStopWordFrequency(row['Content_Preprocessed']), axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [55]:
final_df_filtered['norm_punct_freq'] = final_df_filtered.apply(lambda row: normPunctFrequency(row['Content_Preprocessed']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [56]:
functional_file = open("functional.txt", "r")
words= [word.strip() for line in functional_file.readlines() for word in line.split(',') if word.strip()]
functional=list(set(words))
print((functional))

['against', 'though', 'while', 'also', 'any', 'thus', 'is', 'should', 'must', 'nevertheless', 'four', 'a', 'some', 'from', 'our', 'until', 'further', 'nonetheless', 'do', 'why', 'via', 'nine', 'ok', 'his', 'behind', 'by', 'this', 'past', 'into', 'and', 'three', 'at', 'underneath', 'himself', 'opposite', 'can', 'however', 'across', 'whatever', 'whom', 'around', 'most', 'up', 'moreover', 'that', 'over', 'conversely', 'either', 'on', 'whether', 'because', 'as', 'than', 'ought', 'much', 'yourself', 'except', 'eight', 'otherwise', 'all', 'their', 'besides', 'like', 'third', 'despite', 'had', 'those', 'many', 'considering', 'him', 'excluding', 'within', 'these', 'upon', 'little', 'course', 'near', 'since', 'toward', 'such', 'onto', 'previous', 'enough', 'concerning', 'herself', 'he', 'theirs', 'I', 'six', 'have', 'versus', 'if', 'five', 'yes', 'neither', 'ours', 'about', 'before', 'not', 'below', 'between', 'meanwhile', 'are', 'inside', 'unlike', 'which', 'then', 'with', 'being', 'wherever',

In [57]:
final_df_filtered['norm_funct_freq'] = final_df_filtered.apply(lambda row: normFunctWordFrequency(functional,row['Content_Preprocessed']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [58]:
final_df_filtered.head()

Unnamed: 0,content,source,Content_Preprocessed,readability_f,readability_SMOG,avg_sent_len,sd_sent_len,norm_stop_freq,norm_punct_freq,norm_funct_freq
0,"DOWN the Euphrates river, halfway between Deir...",economist,"DOWN the Euphrates river, halfway between Deir...",50.46,11.2,21.333333,11.860298,0.421875,0.234375,0.390625
1,Never have America and its allies had such a h...,economist,Never have America and its allies had such a h...,62.68,10.7,17.0,3.391165,0.5,0.132353,0.470588
2,"But like their Parthian forebears, Iran and it...",economist,"But like their Parthian forebears, Iran and it...",91.31,5.7,14.2,5.858327,0.422535,0.197183,0.43662
3,Iran's gains are even more striking elsewhere....,economist,Iran's gains are even more striking elsewhere....,61.87,11.6,18.0,7.628892,0.422222,0.255556,0.411111
4,"Farther south, America's hopes of stemming Ira...",economist,"Farther south, America's hopes of stemming Ira...",52.49,13.6,20.0,7.527727,0.366667,0.316667,0.35


In [59]:
df_features= final_df_filtered[['source','readability_f','readability_SMOG','avg_sent_len','sd_sent_len','norm_stop_freq','norm_punct_freq','norm_funct_freq']]

In [60]:
df_features.head()

Unnamed: 0,source,readability_f,readability_SMOG,avg_sent_len,sd_sent_len,norm_stop_freq,norm_punct_freq,norm_funct_freq
0,economist,50.46,11.2,21.333333,11.860298,0.421875,0.234375,0.390625
1,economist,62.68,10.7,17.0,3.391165,0.5,0.132353,0.470588
2,economist,91.31,5.7,14.2,5.858327,0.422535,0.197183,0.43662
3,economist,61.87,11.6,18.0,7.628892,0.422222,0.255556,0.411111
4,economist,52.49,13.6,20.0,7.527727,0.366667,0.316667,0.35


In [61]:
# X -> features, y -> label
X=df_features[['readability_f','readability_SMOG','avg_sent_len','sd_sent_len','norm_stop_freq','norm_punct_freq','norm_funct_freq']]
y=df_features.source

In [None]:
# dividing X, y into train and test data 
#We use stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.3, random_state = 0)

In [None]:
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
svm_predictions = svm_model_linear.predict(X_test)

In [None]:
accuracy = svm_model_linear.score(X_test, y_test)
print(accuracy)

In [None]:
# creating a confusion matrix
cm = confusion_matrix(y_test, svm_predictions)
print(cm)

## Radial Kernel

In [None]:
svm_model_radial = SVC(kernel = 'rbf', C = 1).fit(X_train, y_train)
svm_predictions = svm_model_radial.predict(X_test)

In [None]:
accuracy = svm_model_radial.score(X_test, y_test)
print(accuracy)

## SOME OTHER KERNEL