In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from pathlib import Path
from math import sqrt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker
import seaborn as sns
from sklearn.metrics import mean_squared_error

import warnings
# Ignore every warning of category UserWarning from here on.
warnings.simplefilter("ignore", category=UserWarning)
# Disable pandas' chained-assignment check (no SettingWithCopy warning is emitted).
pd.options.mode.chained_assignment = None

from create_corpus import analyze_url

# Root folder (inside the user's home directory) holding the input files and the
# `figures/` output folder. Kept as a plain string with a trailing slash because
# the cells below build paths by string concatenation.
scilens_dir = f"{Path.home()}/Dropbox/scilens/www2019/"
# Set to True to print the RMSE of each rater against the expert average.
print_RMSE = False

## Feature Correlation

In [None]:
# Feature distributions by article quality: one KDE figure per feature, with one
# curve for rows where rate == -1 ("Low") and one for rows where rate == 1 ("High").
df = pd.read_csv(scilens_dir + 'scilens_train.tsv', sep='\t')

# The seaborn theme is the same for every figure, so configure it once.
sns.set(context='paper', style='white', color_codes=True, font_scale=2.5)

for ind, name in zip(['Title Clickbaitness', 'Replies Stance'], ['clickbait_distro', 'stance_distro']):
    fig, ax = plt.subplots(figsize=(10, 10))
    for r, q, c in zip([-1, 1], ['Low', 'High'], ['#CC4545', '#459FCC']):
        # One .loc call selects rows and column together (no chained indexing).
        sns.kdeplot(df.loc[df['rate'] == r, ind], label=q + ' Quality Articles', color=c, shade=True, ax=ax)

    ax.legend(loc='upper left', ncol=1, bbox_to_anchor=(0, 1.02))
    ax.set_xlabel(ind)
    ax.set_ylabel('Density')

    if name == 'stance_distro':
        # Reversed limits flip the x-axis: 1 on the left, -0.4 on the right.
        ax.set_xlim(1, -0.4)
        # np.delete() removes by integer *index*, not by value. This line used to
        # pass the float -0.4, which legacy NumPy truncated to index 0 (with a
        # deprecation warning) and newer NumPy rejects with an error. Index 0 is
        # spelled out here so the result matches what the old call produced.
        # NOTE(review): confirm that dropping the first tick is what is wanted.
        loc = np.delete(ax.get_xticks(), 0)
        ax.set_xticks(loc)
        # Relabel each remaining tick position l as 1 - 2*l.
        ax.set_xticklabels([round(1 - 2 * l, 2) for l in loc])
    sns.despine(left=True, bottom=True)
    fig.savefig(scilens_dir + 'figures/' + name + '.pdf', bbox_inches='tight')


## ATC

In [None]:
# ATC: compare per-article quality ratings from the crowd (without / with
# indicators), two experts and the automatic SciLens score, then plot them.

# Crowd judgements whose `_trust` is not above this threshold are discarded.
# The CRISPR cell below reuses this value.
trust_thr = .45
# Name of the rating column shared by all crowd and expert CSV files.
rating_col = 'how_do_you_rate_the_scientific_quality_of_this_article'

# --- crowd ratings: keep trusted judgements, unify the URL scheme, average per article ---
crowd_means = {}
for rater, csv_name in [('non-experts (w/o indicators)', 'atc-crowd.csv'),
                        ('non-experts (w/ indicators)', 'atc-crowd-ind.csv')]:
    ratings = pd.read_csv(scilens_dir + csv_name).rename(columns={rating_col: rater})
    ratings = ratings.loc[ratings['_trust'] > trust_thr, ['article', rater]]
    # http/https variants of the same URL must end up under one key.
    ratings['article'] = ratings['article'].apply(lambda url: url.replace('https://', 'http://'))
    crowd_means[rater] = ratings.groupby('article').mean()
df_crowd = crowd_means['non-experts (w/o indicators)']
df_crowd_ind = crowd_means['non-experts (w/ indicators)']

# --- expert ratings ---
# The two files are matched on the normalised article URL. The previous version
# sorted both frames and glued them together with pd.concat(axis=1), but concat
# aligns on the row index, which sort_values() leaves untouched, so the experts
# were effectively paired by CSV row number rather than by article.
expert_frames = []
for expert, csv_name in [('expert1', 'atc-Andreu.csv'), ('expert2', 'atc-Aina.csv')]:
    ratings = pd.read_csv(scilens_dir + csv_name).rename(columns={rating_col: expert})
    ratings = ratings[['article', expert]].assign(
        article=lambda frame: frame['article'].apply(lambda url: url.replace('https://', 'http://')))
    expert_frames.append(ratings)
# Outer merge: an article rated by only one expert is kept, with NaN for the other.
df_exp = expert_frames[0].merge(expert_frames[1], on='article', how='outer').sort_values(by='article')
df_exp['diff'] = (df_exp['expert1'] - df_exp['expert2']).abs()    # expert disagreement
df_exp['experts'] = (df_exp['expert1'] + df_exp['expert2']) / 2  # expert consensus (mean)
df_exp = df_exp.set_index('article')

# --- automatic scores ---
df_sci = pd.read_csv(scilens_dir + 'atc_scilens.tsv', sep='\t')
df_sci = df_sci.set_index('article').rename(columns={'scilens': 'SciLens'})

# Left joins: only articles that have a trusted w/o-indicators crowd rating are kept.
df = df_crowd.join(df_exp).join(df_crowd_ind).join(df_sci).reset_index()

if print_RMSE:
    # RMSE of each rater against the expert mean, for articles where the experts
    # agree (diff == 0), differ by one (diff == 1), differ by more, and overall.
    # Groups are separated by a blank line.
    rmse_subsets = [df[df['diff'] == 0], df[df['diff'] == 1], df[df['diff'] > 1], df]
    for position, subset in enumerate(rmse_subsets):
        if position:
            print()
        for rater in ['non-experts (w/o indicators)', 'non-experts (w/ indicators)', 'SciLens']:
            print(sqrt(mean_squared_error(subset[rater], subset['experts'])))

# --- plot ---
# Rows (and therefore outlets in the figure) are ordered by expert rating.
df = df.sort_values(by=['experts', 'diff'])
# analyze_url is a project helper; its first return value is used as the outlet key
# (presumably the site's domain - see create_corpus).
outlet_keys = df['article'].apply(lambda x: analyze_url(x)[0]).tolist()
display_names = {'winefolly.com':'Wine Folly (blog)',
 'self.com':'Self (blog)',
 'humanprogress.org':'Human Progress',
 'vantagepointrecovery.com':'Vantage Point',
 'marketwatch.com':'Market Watch',
 'fortune.com':'Fortune',
 'esquire.com':'Esquire',
 'thisisinsider.com':'This is Insider',
 'mentalfloss.com':'Mental Floss',
 'uk.businessinsider.com':'Business Insider',
 'healthline.com':'Health Line',
 'illinoispolicy.org':'Illinois Policy',
 'prevention.com':'Prevention',
 'voanews.com':'VOA News',
 'womenshealthmag.com':'Women\'s Health',
 'drugaddictionnow.com':'Drug Addiction Now',
 'weforum.org':'WEForum',
 'outsideonline.com':'Outside Online',
 'nutritionadvance.com':'Nutrition Advance'}
# Keys without a display name are shown unchanged.
outlets = [display_names.get(key, key) for key in outlet_keys]

# Outlets that occur more than once get a running number: "Name (1)", "Name (2)", ...
df['article'] = [outlet + ' (' + str(outlets[:i].count(outlet) + 1) + ')' if outlets.count(outlet) > 1 else outlet
                 for i, outlet in enumerate(outlets)]
# Final labels in expert-rating order; shared by both plot layers below.
outlet_order = df['article'].tolist()

df = df[['article', 'non-experts (w/o indicators)', 'non-experts (w/ indicators)', 'experts', 'SciLens']].rename(columns={'non-experts (w/o indicators)': 'Non-Experts (No Indicators)', 'non-experts (w/ indicators)': 'Non-Experts (Indicators)', 'experts': 'Experts', 'SciLens': 'Automatic'})
df = pd.melt(df, id_vars=['article'], var_name='Rated by', value_name='Quality').rename(columns={'article': 'Outlet'})
# Shift ratings by +2 so they line up with the tick positions 0..4 labelled below.
df['Quality'] = df['Quality'] + 2

# A follow-up step used to filter on 'experts' to move expert rows first, but the
# values are 'Experts' after the rename, so it never matched and has been removed.
df = df.sort_values(by=['Rated by', 'Quality'], ascending=[True, True])

is_expert = df['Rated by'] == 'Experts'
# Explicit hue order so each rater keeps its colour from the palette below.
bar_raters = ['Automatic', 'Non-Experts (Indicators)', 'Non-Experts (No Indicators)']

sns.set(context='paper', style='white', color_codes=True, font_scale=1.5)
fig, ax = plt.subplots(figsize=(8, 10))
# `order` is passed to both layers: without it each call infers the outlet order
# from its own (differently sorted) subset, and points and bars can end up on
# different rows.
ax = sns.pointplot(hue='Rated by', x='Quality', y='Outlet', data=df[is_expert], order=outlet_order, markers='o', palette=['#9FCC45'], scale=3, ax=ax)
# Raise the expert markers above the bars that are drawn next.
plt.setp(ax.lines, zorder=100)
plt.setp(ax.collections, zorder=100)
ax = sns.barplot(hue='Rated by', x='Quality', y='Outlet', data=df[~is_expert], order=outlet_order, hue_order=bar_raters, palette=['#CC4545', '#459FCC', '#2A617D'], ax=ax)
ax.set_xticks(ticks=[0, 1, 2, 3, 4])
ax.set_xticklabels(['Very Low', 'Low', 'Borderline', 'High', 'Very High'])
ax.legend(loc='upper center', ncol=2, bbox_to_anchor=(0.5, 1.09))
ax.set_ylabel('')
sns.despine(left=True, bottom=True)
fig.savefig(scilens_dir+'figures/atc.pdf', bbox_inches='tight')

## CRISPR

In [None]:
# CRISPR: compare per-article quality ratings from the crowd (without / with
# indicators), two experts and the automatic SciLens score, then plot them.
# `trust_thr` is the threshold defined at the top of the ATC cell.

# Name of the rating column shared by all crowd and expert CSV files.
rating_col = 'how_do_you_rate_the_scientific_quality_of_this_article'

# --- crowd ratings: keep trusted judgements, unify the URL scheme, average per article ---
crowd_means = {}
for rater, csv_name in [('non-experts (w/o indicators)', 'crispr-crowd.csv'),
                        ('non-experts (w/ indicators)', 'crispr-crowd-ind.csv')]:
    ratings = pd.read_csv(scilens_dir + csv_name).rename(columns={rating_col: rater})
    ratings = ratings.loc[ratings['_trust'] > trust_thr, ['article', rater]]
    # http/https variants of the same URL must end up under one key.
    ratings['article'] = ratings['article'].apply(lambda url: url.replace('https://', 'http://'))
    crowd_means[rater] = ratings.groupby('article').mean()
df_crowd = crowd_means['non-experts (w/o indicators)']
df_crowd_ind = crowd_means['non-experts (w/ indicators)']

# --- expert ratings ---
# The two files are matched on the normalised article URL. The previous version
# sorted both frames and glued them together with pd.concat(axis=1), but concat
# aligns on the row index, which sort_values() leaves untouched, so the experts
# were effectively paired by CSV row number rather than by article.
expert_frames = []
for expert, csv_name in [('expert1', 'crispr-Dimitra.csv'), ('expert2', 'crispr-Jose.csv')]:
    ratings = pd.read_csv(scilens_dir + csv_name).rename(columns={rating_col: expert})
    ratings = ratings[['article', expert]].assign(
        article=lambda frame: frame['article'].apply(lambda url: url.replace('https://', 'http://')))
    expert_frames.append(ratings)
# Outer merge: an article rated by only one expert is kept, with NaN for the other.
df_exp = expert_frames[0].merge(expert_frames[1], on='article', how='outer').sort_values(by='article')
df_exp['diff'] = (df_exp['expert1'] - df_exp['expert2']).abs()    # expert disagreement
df_exp['experts'] = (df_exp['expert1'] + df_exp['expert2']) / 2  # expert consensus (mean)
df_exp = df_exp.set_index('article')

# --- automatic scores ---
df_sci = pd.read_csv(scilens_dir + 'crispr_scilens.tsv', sep='\t')
df_sci = df_sci.set_index('article').rename(columns={'scilens': 'SciLens'})

# Left joins: only articles that have a trusted w/o-indicators crowd rating are kept.
df = df_crowd.join(df_exp).join(df_crowd_ind).join(df_sci).reset_index()

if print_RMSE:
    # RMSE of each rater against the expert mean, for articles where the experts
    # agree (diff == 0), differ by one (diff == 1), differ by more, and overall.
    # Groups are separated by a blank line.
    rmse_subsets = [df[df['diff'] == 0], df[df['diff'] == 1], df[df['diff'] > 1], df]
    for position, subset in enumerate(rmse_subsets):
        if position:
            print()
        for rater in ['non-experts (w/o indicators)', 'non-experts (w/ indicators)', 'SciLens']:
            print(sqrt(mean_squared_error(subset[rater], subset['experts'])))

# --- plot ---
# Rows (and therefore outlets in the figure) are ordered by expert rating.
df = df.sort_values(by=['experts', 'diff'])
# analyze_url is a project helper; its first return value is used as the outlet key
# (presumably the site's domain - see create_corpus).
outlet_keys = df['article'].apply(lambda x: analyze_url(x)[0]).tolist()
display_names = {'futurism.com':'Futurism', 
  'motherjones.com': 'Mother Jones',
  'natureworldnews.com': 'Nature World News',
  'dailyhealthpost.com': 'Daily Health Post',
  'biotech-now.org': 'Biotech Now',
  'fooddemocracynow.org': 'Food Democracy Now (blog)',
  'huffingtonpost.com': 'Huffington Post',
  'theverge.com': 'The Verge',
  'genomealberta.ca': 'Genome Alberta',
  'labiotech.eu': 'Labiotech',
  'medicaldaily.com': 'Medical Daily',
  'mashable.com': 'Mashable',
  'newatlas.com': 'New Atlas',
  'ibtimes.co.uk': 'IBTimes',
  'shontavia.com': 'Shontavia Johnson (blog)',
  'joshmitteldorf.scienceblog.com': 'Josh Mitteldorf (blog)',
  'thebody.com': 'The Body',
 'medium.com' : 'Twist Bioscience'}
# Keys without a display name are shown unchanged.
outlets = [display_names.get(key, key) for key in outlet_keys]

# Outlets that occur more than once get a running number: "Name (1)", "Name (2)", ...
df['article'] = [outlet + ' (' + str(outlets[:i].count(outlet) + 1) + ')' if outlets.count(outlet) > 1 else outlet
                 for i, outlet in enumerate(outlets)]
# Final labels in expert-rating order; shared by both plot layers below.
outlet_order = df['article'].tolist()

df = df[['article', 'non-experts (w/o indicators)', 'non-experts (w/ indicators)', 'experts', 'SciLens']].rename(columns={'non-experts (w/o indicators)': 'Non-Experts (No Indicators)', 'non-experts (w/ indicators)': 'Non-Experts (Indicators)', 'experts': 'Experts', 'SciLens': 'Automatic'})
df = pd.melt(df, id_vars=['article'], var_name='Rated by', value_name='Quality').rename(columns={'article': 'Outlet'})
# Shift ratings by +2 so they line up with the tick positions 0..4 labelled below.
df['Quality'] = df['Quality'] + 2

# A follow-up step used to filter on 'experts' to move expert rows first, but the
# values are 'Experts' after the rename, so it never matched and has been removed.
df = df.sort_values(by=['Rated by', 'Quality'], ascending=[True, True])

is_expert = df['Rated by'] == 'Experts'
# Explicit hue order so each rater keeps its colour from the palette below.
bar_raters = ['Automatic', 'Non-Experts (Indicators)', 'Non-Experts (No Indicators)']

sns.set(context='paper', style='white', color_codes=True, font_scale=1.5)
fig, ax = plt.subplots(figsize=(8, 10))
# `order` is passed to both layers: without it each call infers the outlet order
# from its own (differently sorted) subset, and points and bars can end up on
# different rows.
ax = sns.pointplot(hue='Rated by', x='Quality', y='Outlet', data=df[is_expert], order=outlet_order, markers='o', palette=['#9FCC45'], scale=3, ax=ax)
# Raise the expert markers above the bars that are drawn next.
plt.setp(ax.lines, zorder=100)
plt.setp(ax.collections, zorder=100)
ax = sns.barplot(hue='Rated by', x='Quality', y='Outlet', data=df[~is_expert], order=outlet_order, hue_order=bar_raters, palette=['#CC4545', '#459FCC', '#2A617D'], ax=ax)
ax.set_xticks(ticks=[0, 1, 2, 3, 4])
ax.set_xticklabels(['Very Low', 'Low', 'Borderline', 'High', 'Very High'])
ax.legend(loc='upper center', ncol=2, bbox_to_anchor=(0.5, 1.09))
ax.set_ylabel('')
sns.despine(left=True, bottom=True)
fig.savefig(scilens_dir+'figures/crispr.pdf', bbox_inches='tight')