In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker
import seaborn as sns
%load_ext autoreload
%autoreload 2

from settings import *
from glove import *
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from textblob import TextBlob
from tweets_ops import *
from url_helpers import analyze_url
from matching import *
from textstat.textstat import textstat
import urllib, bs4

pd.options.mode.chained_assignment = None

import warnings
warnings.simplefilter("ignore", category=UserWarning)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import subprocess
import os
import string

def is_clickbait(title):
    
    title = ''.join([c for c in title if c in string.printable])
    os.chdir('../lib/clickbait')
    out, err = subprocess.Popen(['venv/bin/python', 'src/detect.py', title], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    os.chdir('../../src')
    return float(re.findall('\d*\.?\d+', str(out))[0])

In [4]:
outlet1 = 'dailymail.co.uk'
outlet2 = 'nytimes.com'
outlet3 = 'theatlantic.com'
outlet4 = 'washingtonpost.com'

G = read_graph(cache_dir+'diffusion_graph/pruned_graph_v3.tsv')
pagerank = nx.pagerank(G.reverse())
betweenness_centrality = nx.betweenness_centrality(G)
degree_centrality = nx.degree_centrality(G)
in_degree_centrality = nx.in_degree_centrality(G)
out_degree_centrality = nx.out_degree_centrality(G)

In [7]:
outlet = outlet3
df = pd.read_csv(cache_dir+'article_details_v5.tsv', sep='\t')


df[df.apply(lambda x: analyze_url(x['url'])[0]==outlet, axis=1)].to_csv(cache_dir+outlet+'.tsv', sep='\t', index=None)

aggregate_tweet_details(cache_dir+'diffusion_graph/pruned_graph_v3.tsv', cache_dir+'tweet_details_v3.tsv', cache_dir+outlet+'.tsv', cache_dir+outlet+'.tsv')
df = pd.read_csv(cache_dir+outlet+'.tsv', sep='\t').fillna(0)

for s in ['full', 'paragraph', 'sentence']:
    train = pd.read_csv(cache_dir+'similarity_model/train_pairs_v2_'+s+'.tsv', sep='\t').merge(df[['url']], left_on='article', right_on='url').drop('url', axis=1)
    train = train[train.related==True]
    test = pd.read_csv(cache_dir+'similarity_model/test_pairs_v2_'+s+'.tsv', sep='\t').merge(df[['url']], left_on='article', right_on='url').drop('url', axis=1)
    pd.concat([train, test]).to_csv(cache_dir+'similarity_model/exp2/'+s+'.tsv', sep='\t', index=None)
    
test_similarity_model(cache_dir+'similarity_model/exp2/', cache_dir+'similarity_model/rf_model.sav', cache_dir+'similarity_model/exp2/results.tsv')
sim = pd.read_csv(cache_dir+'similarity_model/exp2/results.tsv', sep='\t').drop('paper', axis=1).groupby('article').max().reset_index()
df = df.merge(sim, left_on='url', right_on='article').drop('article', axis=1)

df['readability'] = df['full_text'].apply(lambda x: textstat.flesch_reading_ease(x))
df['title_subjectivity'] = df['title'].apply(lambda x: TextBlob(x).subjectivity)
df['title_polarity'] = df['title'].apply(lambda x: TextBlob(x).polarity)
df['title_clickbaitness'] = df['title'].apply(is_clickbait)

df = pd.concat([df.drop(['quote_indicators'], axis=1), df['quote_indicators'].apply(lambda x: pd.Series(eval(x)))], axis=1)


df['has_author'] = ~(df['authors'].isnull() | (df['authors'] == ''))

df['pageRank'] = df['url'].apply(lambda x: pagerank[x])
df['betweenness_centrality'] = df['url'].apply(lambda x: betweenness_centrality[x])
df['degree_centrality'] = df['url'].apply(lambda x: degree_centrality[x])
df['in_degree_centrality'] = df['url'].apply(lambda x: in_degree_centrality[x])
df['out_degree_centrality'] = df['url'].apply(lambda x: out_degree_centrality[x])

df['word_count'] = df['full_text'].apply(lambda x: len(re.findall(r'\w+', x)))

df['alexa_rank']=df['url'].apply(lambda x: bs4.BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url="+str(x)).read(), "xml").find("REACH")['RANK'])


df.url = df.url.apply(lambda x: analyze_url(x)[0])

df = df[['url', 'likes', 'replies_count', 'title_clickbaitness',
    'betweenness_centrality', 'degree_centrality', 'in_degree_centrality', 'out_degree_centrality',
    'replies_mean_polarity', 'replies_mean_subjectivity', 'retweets',
    'stance', 'tweets_mean_polarity', 'tweets_mean_subjectivity',
    'tweets_time_delta', 'users_countries', 'users_median_followers',
    'users_median_friends', 'users_median_tweets', 'related', 'readability',
    'title_subjectivity', 'title_polarity', 'count_all_quotes',
    'count_PER_quotes', 'count_ORG_quotes', 'count_unnamed_quotes',
    'has_author', 'pageRank', 'word_count', 'alexa_rank']]

df.columns = ['url', '#Likes', '#Replies', 'Title Clickbaitness',
    'Betweenness Centrality', 'Degree Centrality', 'In Degree Centrality', 'Out Degree Centrality',
    'Replies Polarity', 'Replies Subjectivity', '#Retweets',
    'Replies Stance', 'Tweets Polarity', 'Tweets Subjectivity',
    'Tweets Shelf Life', '#Users Countries', '#Followers',
    '#Users Friends', '#Users Tweets', 'STS', 'Readability',
    'Title Subjectivity', 'Title Polarity', '#Quotes',
    '#Person Quotes', '#Scientific Mentions', '#Weasel Quotes',
    'Author Signature', 'Personalized PageRank', 'Article Word Count', 'Alexa Rank']

df.to_csv(cache_dir+outlet+'.tsv', sep='\t', index=None)

In [17]:
df = pd.read_csv(cache_dir+'article_details_v5.tsv', sep='\t')

aggregate_tweet_details(cache_dir+'diffusion_graph/pruned_graph_v3.tsv', cache_dir+'tweet_details_v3.tsv', cache_dir+'article_details_v5.tsv', cache_dir+'article_details_v6.tsv')
df = pd.read_csv(cache_dir+'article_details_v6.tsv', sep='\t').fillna(0)

for s in ['full', 'paragraph', 'sentence']:
    train = pd.read_csv(cache_dir+'similarity_model/train_pairs_v2_'+s+'.tsv', sep='\t').merge(df[['url']], left_on='article', right_on='url').drop('url', axis=1)
    train = train[train.related==True]
    test = pd.read_csv(cache_dir+'similarity_model/test_pairs_v2_'+s+'.tsv', sep='\t').merge(df[['url']], left_on='article', right_on='url').drop('url', axis=1)
    pd.concat([train, test]).to_csv(cache_dir+'similarity_model/exp2/'+s+'.tsv', sep='\t', index=None)
    
test_similarity_model(cache_dir+'similarity_model/exp2/', cache_dir+'similarity_model/rf_model.sav', cache_dir+'similarity_model/exp2/results.tsv')
sim = pd.read_csv(cache_dir+'similarity_model/exp2/results.tsv', sep='\t').drop('paper', axis=1).groupby('article').max().reset_index()
df = df.merge(sim, left_on='url', right_on='article').drop('article', axis=1)

df['readability'] = df['full_text'].apply(lambda x: textstat.flesch_reading_ease(str(x)))
df['title_subjectivity'] = df['title'].apply(lambda x: TextBlob(str(x)).subjectivity)
df['title_polarity'] = df['title'].apply(lambda x: TextBlob(str(x)).polarity)

In [20]:
df['title_clickbaitness'] = df['title'].apply(is_clickbait)

TypeError: 'int' object is not iterable

In [None]:
df = pd.concat([df.drop(['quote_indicators'], axis=1), df['quote_indicators'].apply(lambda x: pd.Series(eval(x)))], axis=1)


df['has_author'] = ~(df['authors'].isnull() | (df['authors'] == ''))

df['pageRank'] = df['url'].apply(lambda x: pagerank[x])
df['betweenness_centrality'] = df['url'].apply(lambda x: betweenness_centrality[x])
df['degree_centrality'] = df['url'].apply(lambda x: degree_centrality[x])
df['in_degree_centrality'] = df['url'].apply(lambda x: in_degree_centrality[x])
df['out_degree_centrality'] = df['url'].apply(lambda x: out_degree_centrality[x])

df['word_count'] = df['full_text'].apply(lambda x: len(re.findall(r'\w+', x)))

df['alexa_rank']=df['url'].apply(lambda x: bs4.BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url="+str(x)).read(), "xml").find("REACH")['RANK'])


df.url = df.url.apply(lambda x: analyze_url(x)[0])

df = df[['url', 'likes', 'replies_count', 'title_clickbaitness',
    'betweenness_centrality', 'degree_centrality', 'in_degree_centrality', 'out_degree_centrality',
    'replies_mean_polarity', 'replies_mean_subjectivity', 'retweets',
    'stance', 'tweets_mean_polarity', 'tweets_mean_subjectivity',
    'tweets_time_delta', 'users_countries', 'users_median_followers',
    'users_median_friends', 'users_median_tweets', 'related', 'readability',
    'title_subjectivity', 'title_polarity', 'count_all_quotes',
    'count_PER_quotes', 'count_ORG_quotes', 'count_unnamed_quotes',
    'has_author', 'pageRank', 'word_count', 'alexa_rank']]

df.columns = ['url', '#Likes', '#Replies', 'Title Clickbaitness',
    'Betweenness Centrality', 'Degree Centrality', 'In Degree Centrality', 'Out Degree Centrality',
    'Replies Polarity', 'Replies Subjectivity', '#Retweets',
    'Replies Stance', 'Tweets Polarity', 'Tweets Subjectivity',
    'Tweets Shelf Life', '#Users Countries', '#Followers',
    '#Users Friends', '#Users Tweets', 'STS', 'Readability',
    'Title Subjectivity', 'Title Polarity', '#Quotes',
    '#Person Quotes', '#Scientific Mentions', '#Weasel Quotes',
    'Author Signature', 'Personalized PageRank', 'Article Word Count', 'Alexa Rank']

df.to_csv(cache_dir+'article_details_v6.tsv', sep='\t', index=None)

In [11]:
df = pd.concat([pd.read_csv(cache_dir+outlet2+'.tsv', sep='\t'), pd.read_csv(cache_dir+outlet3+'.tsv', sep='\t')])
y = df[['url']].values
X = np.array(df.drop('url', axis=1).values, dtype=np.float32)
X = (X - X.mean(axis=0)) / (X.std(axis=0)+1e-9)

from sklearn.feature_selection import f_classif
_, result = f_classif(X,y)

d = {c:r for c, r in zip(df.columns[1:], result)}
df = pd.DataFrame(sorted(d.items(), key=lambda kv: kv[1]))

df[0] = df.apply(lambda x: x[0]+'**' if float(x[1])<.005 else x[0]+'*' if float(x[1])<.01 else x[0], axis=1)

df23 = df.copy()

  f = msb / msw


In [12]:
df = pd.concat([df12, df13, df23, df123], axis=1).head(15)[0]
df.columns = ['NYTimes-DailyMail', 'TheAtlantic-DailyMail', 'NYTimes-TheAtlantic', 'NYTimes-TheAtlantic-DailyMail']

In [13]:
print(df.to_latex(index=False))

\begin{tabular}{llll}
\toprule
      NYTimes-DailyMail &   TheAtlantic-DailyMail &     NYTimes-TheAtlantic & NYTimes-TheAtlantic-DailyMail \\
\midrule
           Alexa Rank** &            Alexa Rank** &      Article Word Count &        \#Scientific Mentions** \\
 \#Scientific Mentions** &  \#Scientific Mentions** &                \#Replies &           Article Word Count* \\
               \#Quotes* &                 \#Quotes &              \#Followers &                       \#Quotes \\
    Article Word Count* &          Title Polarity &    \#Scientific Mentions &                Title Polarity \\
         Title Polarity &          \#Users Friends &    Replies Subjectivity &                      \#Replies \\
        Tweets Polarity &    Replies Subjectivity &        Replies Polarity &                \#Weasel Quotes \\
         \#Weasel Quotes &       Tweets Shelf Life &   Personalized PageRank &               Tweets Polarity \\
    Tweets Subjectivity &         Tweets Polarity &       T