In [324]:
# File that uses our best Naive Bayes model to classify full articles

In [317]:
import numpy as np
import pandas as pd
import re
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
import nltk.data

In [318]:
dat = pd.read_excel('all_sentences.xlsx')

In [319]:
def return_preds(probs, thresh):
    y_pred = []
    for row in probs:
        if row[1] > thresh:
            y_pred.append(1)
        else:
            y_pred.append(np.argmax(row))
    return y_pred

In [320]:
# Best model
X_train, X_test, y_train, y_test = train_test_split(dat['words_clean'], dat['label'], test_size=0.3)
nb_model = Pipeline([('vect', CountVectorizer(ngram_range=(1,1))), 
                     ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
nb_model.fit(X_train, y_train)
probs = nb_model.predict_proba(X_test)
return_acc(probs, y_test, 0.17)[1]

0.55024813895781632

### Article Classification

In [325]:
# Function to clean word data--removes stopwords, makes lowercase, removes numbers
def clean_words(sentences):
    words_clean = np.full(len(sentences), None)
    for i, words in enumerate(sentences):
        word_list = re.split('\W+', words)
        words1 = [word.lower() for word in word_list if word.lower() not in stopwords.words('english')]
        words2 = [word for word in words1 if not any(char.isdigit() for char in word)]
        words_clean[i] = (' '.join(words2)).strip()
    return words_clean

In [326]:
# Function to check if article has text (blank articles sometimes appear)
def has_content(message):
    return message != ''

In [337]:
# Score articles by taking the mean of the difference in probabilities of lib and con for each sentences
def score_article_unsupervised(fname):
    f = open(fname,'r')
    message = f.read()
    #print(message)
    f.close()
    if has_content(message):
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        fp = open(fname)
        data = fp.read()
        #print('\n-----\n'.join(tokenizer.tokenize(data)))
        sentences = tokenizer.tokenize(data)
        new_text = clean_words(sentences)

        total = nb_model.predict(new_text)
        probs = nb_model.predict_proba(new_text)
        diffs = [row[2]-row[0] for row in probs]

        return probs, np.mean(diffs)

In [None]:
# Score each outlet

In [152]:
# 368, 12
fox_news = ['arts/fox/fox' + str(i) + '.txt' for i in range(368)]
fox_scores = []
for i in range(len(fox_news)):
    if score_article_unsupervised(fox_news[i]) != None:
        fox_scores.append(score_article_unsupervised(fox_news[i])[1])

In [153]:
# 320, 23
nyt_news = ['arts/nyt/nyt' + str(i) + '.txt' for i in range(320)]
nyt_scores = []
for i in range(len(nyt_news)):
    if score_article_unsupervised(nyt_news[i]) != None:
        nyt_scores.append(score_article_unsupervised(nyt_news[i])[1])

In [154]:
# 823, 57
cnn_news = ['arts/cnn/cnn' + str(i) + '.txt' for i in range(823)]
cnn_scores = []
for i in range(len(cnn_news)):
    if score_article_unsupervised(cnn_news[i]) != None:
        cnn_scores.append(score_article_unsupervised(cnn_news[i])[1])

In [334]:
# Overall average political score
print(np.mean(fox_scores))
print(np.mean(nyt_scores))
print(np.mean(cnn_scores))

-0.108170465399
-0.133872863003
-0.118018135553


In [335]:
from scipy import stats

In [336]:
# Check for significant difference in means
print stats.ttest_ind(nyt_scores, cnn_scores)
print stats.ttest_ind(cnn_scores, fox_scores)
print stats.ttest_ind(fox_scores, nyt_scores)

Ttest_indResult(statistic=-4.0775755512548564, pvalue=4.8710911694821782e-05)
Ttest_indResult(statistic=-2.3241519564134054, pvalue=0.020296171422587401)
Ttest_indResult(statistic=5.2072646749596316, pvalue=2.6412148897569954e-07)


In [357]:
# Test on pretrained data (labeling done by making assumptions about outlet)
# BAD supervised model--not statistically sound

In [342]:
def score_article_supervised(fname):
    f = open(fname,'r')
    message = f.read()
    #print(message)
    f.close()
    #import nltk.data
    if has_content(message):
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        fp = open(fname)
        data = fp.read()
        #print('\n-----\n'.join(tokenizer.tokenize(data)))
        sentences = tokenizer.tokenize(data)
        new_text = clean_words(sentences)

        total = nb_model.predict(new_text)
        probs = nb_model.predict_proba(new_text)
        #diffs = [row[2]-row[0] for row in probs]
        lib = sum([row[0] for row in probs])/len(probs)
        neut = sum([row[1] for row in probs])/len(probs)
        con = sum([row[2] for row in probs])/len(probs)
        
        #print(probs)
        #print(type(y_pred), type(y_test))
        #print(np.mean(y_pred == y_test))
        #print(y_pred)
        #if y_pred == []:
        #    return None
        #print(y_pred)
        #print(y_test.tolist())
        #print(probs)
        #print(y_pred)
        return [lib, neut, con]

In [343]:
breit = ['opinion_articles/breit/breit' + str(i) + '.txt' for i in range(38)]
cnn = ['opinion_articles/cnn/cnn' + str(i) + '.txt' for i in range(38)]
fox = ['opinion_articles/fox/fox' + str(i) + '.txt' for i in range(39)]
new_yorker = ['opinion_articles/new_yorker/new_yorker' + str(i) + '.txt' for i in range(39)]
nyt = ['opinion_articles/nyt/nyt' + str(i) + '.txt' for i in range(38)]

In [344]:
sources = ['breit', 'fox', 'new_yorker', 'nyt', 'cnn']
nums = [38, 39, 39, 38, 38]
labels = [0.7, 0.3, -0.5, -0.3, -0.1]

def make_fname(source, num, label, all_articles):
    arts = [('opinion_articles/' + source + '/' + source + str(i) + '.txt', label) for i in range(num)]
    all_articles += arts

articles = []    
for tup in zip(sources, nums, labels):
    make_fname(tup[0], tup[1], tup[2], articles)

In [345]:
articles[1][0]

'opinion_articles/breit/breit1.txt'

In [346]:
lib_scores = np.full(len(articles), None)
neut_scores = np.full(len(articles), None)
con_scores = np.full(len(articles), None)
labels = np.full(len(articles), None)
for i, article in enumerate(articles):
    ret = score_article_supervised(articles[i][0])
    lib_scores[i] = ret[0]
    neut_scores[i] = ret[1]
    con_scores[i] = ret[2]
    labels[i] = (articles[i][1])#/2 + 0.5
    
articles_supervised = pd.DataFrame(data={'lib':lib_scores, 'neut':neut_scores,
                                          'con':con_scores, 'label':labels})

In [347]:
articles_supervised

Unnamed: 0,con,label,lib,neut
0,0.369839,0.7,0.378079,0.252082
1,0.227844,0.7,0.350113,0.422043
2,0.39259,0.7,0.4145,0.19291
3,0.313,0.7,0.464913,0.222087
4,0.319412,0.7,0.356282,0.324306
5,0.310305,0.7,0.321217,0.368478
6,0.295338,0.7,0.388957,0.315706
7,0.359949,0.7,0.438573,0.201479
8,0.293797,0.7,0.485502,0.220701
9,0.432443,0.7,0.396859,0.170697


In [348]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [349]:
X_train, X_test, y_train, y_test = train_test_split(articles_supervised.drop(['label'], axis=1), 
                                                    articles_supervised['label'], test_size=0.25, random_state=0)

In [350]:
type(X_train), type(y_train)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [351]:
log_model = LinearRegression()
log_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [352]:
preds = log_model.predict(X_test)

In [353]:
preds

array([ 0.03125 ,  0.265625, -0.140625,  0.09375 ,  0.      ,  0.09375 ,
        0.125   ,  0.046875, -0.09375 ,  0.15625 ,  0.046875,  0.015625,
       -0.171875,  0.109375,  0.078125,  0.03125 ,  0.0625  , -0.15625 ,
       -0.078125,  0.015625,  0.3125  , -0.171875, -0.171875, -0.140625,
        0.015625, -0.046875, -0.15625 ,  0.15625 ,  0.265625, -0.03125 ,
        0.140625,  0.1875  ,  0.125   , -0.046875,  0.171875, -0.03125 ,
        0.03125 , -0.171875,  0.078125, -0.078125,  0.234375, -0.09375 ,
        0.      , -0.1875  , -0.015625,  0.015625,  0.046875, -0.078125])

In [354]:
log_model.score(X_test, y_test)

0.001027253121313354

In [355]:
from sklearn.metrics import mean_squared_error

In [356]:
mean_squared_error(y_test, preds)

0.17641581217447919