In [1]:
import pickle
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from collections import defaultdict
import string 
from pattern.db  import Datasheet
from pattern.db  import pd
import nltk
from random import shuffle
from pattern.en import sentiment
from pattern.en.wordlist import PROFANITY
import pandas as pd
import re

In [3]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        
        return data_dict[self.key]

In [4]:
#HYPOTHESIS: Sensational news contains more Punctuations compared to objective news (eg !,?......)
class Punct_Stats(BaseEstimator, TransformerMixin):
    """Extract punctuation features from each document"""

    def fit(self, x, y=None):
        return self

    def transform(self, text_fields):
        punct_stats = []
        punctuations = list(string.punctuation)
        for field in text_fields:
            if field == None:
                field = " "
            puncts = defaultdict(int)
            for ch in field:
                if ch in punctuations:
                    puncts[ch]+=1
            punct_stats.append(puncts)
        return punct_stats

In [5]:
class Text_Stats(BaseEstimator, TransformerMixin):
    """Extract text statistics from each document"""

    def fit(self, x, y=None):
        return self

    def transform(self, text_fields):
        stats = []
       
        # abbreviations are used for not to be count in the capital letters features
        abvs = ['RBI','BCCI','CM','BJP','BSP','ICU','PPE','CNN', 'FBI', 'ABC', 'MSNBC', 'GOP', 'U.S.', 'US', 'ISIS', 'DNC', 'TV', 'CIA', 'I', 'AP', 'PM', 'AM', 'EU', 'USA', 'UK', 'UN', 'CEO', 'NASA', 'LGBT', 'LGBTQ', 'NAFTA', 'ACLU']
        for field in text_fields:
            field_stats = {}
            tok_text = nltk.word_tokenize(field)
            try:
                num_upper = float(len([w for w in tok_text if w.isupper() and w not in abvs]))/len(tok_text)
            except:
                num_upper = 0
     
            try:
                sent_lengths = [len(nltk.word_tokenize(s)) for s in nltk.sent_tokenize(field)]
                av_sent_len = float(sum(sent_lengths))/len(sent_lengths)
            except:
                av_sent_len = 0
            try:
                num_prof = float(len([w for w in tok_text if w.lower() in PROFANITY]))/len(tok_text)
            except:
                num_prof = 0

            polarity, subjectivity = sentiment(field)
            field_stats['all_caps'] = num_upper
            field_stats['sent_len'] = av_sent_len
            field_stats['polarity'] = polarity
            field_stats['subjectivity'] = subjectivity
            field_stats['profanity'] = num_prof
            stats.append(field_stats)
        return stats

In [6]:
#HYPOTHESIS: sensational news uses more pronouns, adjectives
class HeadlineBodyFeaturesExtractor(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    
    def transform(self, posts):
        punctuation = string.punctuation
        #print(len(posts))
        features = np.recarray(shape=(len(posts),), dtype=[('headline', object), ('article_body', object), ('headline_pos', object), ('body_pos', object)])
        for i, post in enumerate(posts): 
            #if i%100 == 0:
            #print(post)
            headline, article = post[:2]
            features['headline'][i] = headline
            features['article_body'][i] = article

            tok_headline = nltk.word_tokenize(headline)
            features['headline_pos'][i] = (' ').join([x[1] for x in nltk.pos_tag(tok_headline)])

            tok_article = nltk.word_tokenize(article)
            features['body_pos'][i] = (' ').join([x[1] for x in nltk.pos_tag(tok_article)])

        return features

In [7]:
def clean_str(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    #print(string)
    string = re.sub(r"^b", "", string)
    string = re.sub(r"\\n ", "", string)
    string = re.sub(r"\'s", "", string)
    string = re.sub(r"\'ve", "", string)
    string = re.sub(r"n\'t", "", string)
    string = re.sub(r"\'re", "", string)
    string = re.sub(r"\'d", "", string)
    string = re.sub(r"\'ll", "", string)
    string = re.sub(r",", "", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", "", string)
    string = re.sub(r"\)", "", string)
    string = re.sub(r"\?", "", string)
    string = re.sub(r"'", "", string)
    string = re.sub(r"[^A-Za-z0-9(),.!?\'\`]", " ", string)
    string = re.sub(r"[0-9]\w+|[0-9]","", string)
    string = re.sub(r"\s{2,}", " ", string)
    #string = ' '.join(Word(word).lemmatize() for word in string.split() if word not in STOPWORDS) # delete stopwors from text

    return string.strip()

In [13]:
def source_score(source):
    tag_score = {' unreliable' : 0,
 'Conspiracy' : 0.1,
 'Fake' : 0,
 'Political' : 0.2,
 'bias' : 0.2,
 'blog': 0.3,
 'clickbait' : 0.1,
 'conspiracy' : 0.1,
 'fake' : 0,
 'fake ' : 0 ,
 'fake news' : 0,
 'hate' : 0.3,
 'imposter site' : 0.2,
 'junksci' : 0.5,
 'least_biased' : 1,
 'left' : 0.5,
 'left_center' : 0.6,
 'parody site' : 0.1,
 'political' : 0.5,
 'pro_science' : 0.9,
 'pseudoscience' : 0.8,
 'questionable' : 0.3,
 'reliable': 0.9,
 'right' : 0.5,
 'right_center' :0.6,
 'rumor' :0.1,
 'rumor ':0.1,
 'satire':0.2,
 'satirical':0.2,
 'some fake stories':0,
 'state':0.7,
 'unrealiable':0,
 'unreliable':0,
 'on':0 } #None         

    score = 0.5
    
# need improvemnt on on the sub string part, maybe name matching    
    tag_l = domain_info[domain_info['Name'].str.find(source)==0]['tags']
    count_None = 0
    for t in tag_l:
        #print(t[1:-1].split(", "))
        t = t[1:-1].split(', ')
        score -= 0.5
        #print(t)
        for i in range(len(t)):
            #print(t[i])
            score += tag_score[t[i][1:-1]]
            #print(t[i])
            if t[i][1:-1] == 'on':
                count_None += 1
        #print(score,count_None)
        score = score/(len(t)-count_None)
    if score<0:
        score= 0.0
    return score

In [14]:
def url_rank(source):
    
    try:
        g_s = ss[ss['source_url'].str.find(source)==0]['google_pagerank'].values[0]
        g_s = float(g_s[1])
    except:
        #g_s = ss[ss['source_url'].str.find(source)==0]['google_pagerank'].values
        g_s = 0.
    try:
        c_s = ss[ss['source_url'].str.find(source)==0]['alexa_score'].values
        c_s = float(c_s)
    except:
        c_s = 0.
    #print(source,g_s,c_s)
    return g_s*0.1 , c_s

In [15]:
def load_data(eval_data, domain_info,tokenzs):

    #eval_data = pd.read_csv(data_path)
    eval_data['headline'] = eval_data['Title'].apply(lambda x: clean_str(str(x)))
    eval_data['text'] = eval_data['Content'].apply(lambda x: clean_str(str(x)))
    
    # loading_data for LSTM
    X2 = []
    maxlen = 700
    stop_words = set(nltk.corpus.stopwords.words("english"))
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    for par in eval_data["text"].values:
        tmp = []
        sentences = nltk.sent_tokenize(par)
        for sent in sentences:
            sent = sent.lower()
            tokens = tokenizer.tokenize(sent)
            tokens = trigrams[bigrams[tokens]]
            filtered_words = [w.strip() for w in tokens if w not in stop_words and len(w) > 1]
            tmp.extend(filtered_words)
        X2.append(tmp)
        #X2 = tokenzs.texts_to_sequences(X2)
        #X2 = pad_sequences(X2, maxlen=700)
        #x2 = ['']
    X2 = tokenzs.texts_to_sequences(X2)
    X2 = pad_sequences(X2, maxlen=700)   

    sources = eval_data['Source']
    

    eval_data = eval_data[['headline','text']]
    testing_data = eval_data.values.tolist()
    
    
    
    return testing_data,X2,sources

In [18]:
import time

In [19]:
import warnings
warnings.filterwarnings('ignore')
from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
import json
import warnings
warnings.filterwarnings('ignore')
from tensorflow.keras.preprocessing.sequence import pad_sequences

start_time = time.time()


with open('./stored_models/fake_tokenizer.pickle', 'rb') as handle:
    tokenzs = pickle.load(handle)


with open('./stored_models/model_in_json2.json','r') as f:
    model_json = json.load(f)

l_model = model_from_json(model_json)
l_model.load_weights('./stored_models/model_weights2.h5')
from gensim.models.phrases import Phrases, Phraser
bigrams = Phraser.load("./stored_models/bigrams")
trigrams = Phraser.load("./stored_models/trigrams")

model_file = "./stored_models/xgb_pipeline.pkl"
with open(model_file, "rb") as pkl:
    l_pipeline = pickle.load(pkl)

print("--- %s seconds ---" % (time.time() - start_time))

--- 19.161622762680054 seconds ---


In [1]:
import pandas as pd
eval_data=  pd.read_csv('./data/output.csv')
eval_data1000 = pd.read_csv('./data/data_with_source_url.csv')
domain_info = pd.read_csv('./data/Domain_info.csv')
ss = pd.read_csv('./data/source_scores.csv')

In [3]:
ss[['name','source_']]

Unnamed: 0,name,google_pagerank,cpr_score,alexa_score,source_url
0,www.wsj.com,7/10,7.4/10,423,wsj.com
1,ewn.co.za,7/10,7.2/10,10608,ewn.co.za
2,g1.globo.com,8/10,9.1/10,114,g1.globo.com
3,thehimalayantimes.com,7/10,7.0/10,60929,thehimalayantimes.com
4,www.gov.uk,6/10,6.5/10,1228,gov.uk
...,...,...,...,...,...
1108,sehir.edu.tr,4/10,4.5/10,100000,sehir.edu.tr
1109,www.uni-bremen.de,6/10,6.8/10,63028,uni-bremen.de
1110,www.medstarhealth.org,6/10,5.9/10,100000,medstarhealth.org
1111,tele1.com.tr,4/10,4.3/10,2050,tele1.com.tr


In [21]:
eval_data

Unnamed: 0,Content,Description,LastUpdated,Source,Title,URL,_id,_index,_score,_type,...,name,person_apperaed_list,publishedAt,query,score,sentimentscore,summary,tag,topic,uid
0,This article is about the company as a whole. ...,,2020-05-28T06:34:24,Financial Post,SNC-Lavalin,https://en.wikipedia.org/wiki/SNC-Lavalin,75,news_test_bow1,0.0,data,...,doug lamalfa,"john cox, jr.",2005-05-20T23:58:38,Bribery,0.836888,0.162547,This article is about the company as a whole. ...,Crime,wrongdoing,a14e7e085e957a8d679aeb2bfa8eb8346a932d85b34ca1...
1,"Airbus admits to paying at least 340,000 euros...",A settlement reached by Airbus with financial ...,2020-05-28T06:34:24,Financial Post,"Airbus admits to paying at least 340,000 euros...",https://kathmandupost.com/national/2020/02/04/...,76,news_test_bow1,0.0,data,...,bonnie watson coleman,Томас Массі,2020-05-28T00:00:00,Bribery,0.382230,0.664283,"Airbus admits to paying at least 340,000 euros...",Misconduct,endorses,2b213a065845d0e58ea019e6962dcfb88fcafd5b324d1d...
2,Image copyright Getty Images Image caption The...,The singer reportedly bribed an Illinois offic...,2020-05-28T06:34:24,MTV News (UK),R. Kelly faces bribery charge over 1994 marria...,https://www.bbc.co.uk/news/entertainment-arts-...,80,news_test_bow1,0.0,data,...,negociamos mcm ltda,,2019-12-06T06:57:23,Bribery,0.554467,0.995429,Image copyright Getty Images Image caption The...,Embezzlement,sanctions,70cbdb2ee156cf93d8b971094b6fa47e910e65446f1750...
3,Australia will need to brace for a 'concerning...,Stories in Fraud include: Netanyahu becomes th...,2020-05-28T06:34:24,Business Insider,Business Insider,https://www.businessinsider.com.au/category/fraud,93,news_test_bow1,0.0,data,...,negociamos mcm ltda,liz mcinnes,2020-05-25T17:15:40,Fraud,0.627812,0.263624,Australia will need to brace for a 'concerning...,Tax Evasion,evil,00611f5ab5c100341de6f983ed8b7f9bd0438135e5242d...
4,Cyberattacks involving financial fraud saw a h...,100% increase in recorded phishing attempts li...,2020-05-28T06:34:24,TechRadar,New financial fraud attacks detected 'every tw...,https://www.techradar.com/in/news/new-financia...,95,news_test_bow1,0.0,data,...,tim hutchinson,muhammad yunis ahmad,2020-03-12T14:23:06,Fraud,0.354746,0.681167,Cyberattacks involving financial fraud saw a h...,Misconduct,wrong,8ffeacaa43ef4b5ceee136a987b688c1c4c8c987217026...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,More headlines\nIndia's Glenmark to study pote...,"Latest coverage of hackers, cyber crime, ident...",2020-05-26T12:41:15,Reuters,Cyber Crime,https://in.reuters.com/subjects/cyber-crime,512,news_test_bow1,0.0,data,...,"john cox, jr.",,2020-05-26T11:40:00,Crime,0.310891,,,Identity Fraud,,79ee3221dba9ca3e0fa3f67ca283028f4e68c55b503907...
96,There’s Another Video of the Killing of Ahmaud...,Everything with the topic 'Crime' on VICE,2020-05-26T12:41:15,Vice News,crime,https://www.vice.com/en_uk/topic/crime,520,news_test_bow1,0.0,data,...,dawood Ibrahim,,2020-05-19T00:00:00,Crime,0.488072,,,Money Laundering,,aad28c503d844ef9682daca5331daaa97d5ae6a17febae...
97,Jeff Brantingham is as close as it gets to put...,Pentagon-funded research aims to predict when ...,2020-05-26T12:41:15,The Verge,A pioneer in predictive policing is starting a...,https://www.theverge.com/2018/4/26/17285058/pr...,522,news_test_bow1,0.0,data,...,muhammad yunis ahmad,,2018-04-26T13:36:05,Crime,0.096963,,,Identity Fraud,,0142c6f308b018f9dd963d9d8b68dcbcd0410659c30a5c...
98,The National Crime Records Bureau (NCRB) publi...,NCRB publishes Crime in India Report 2018 with...,2020-05-26T12:41:15,The Hindu,Uttar Pradesh tops list in crimes against women,https://www.thehindu.com/news/national/uttar-p...,524,news_test_bow1,0.0,data,...,henry de jesus rangel silva,,2020-01-09T22:53:43,Crime,0.704759,,,Corruption,,03c276629b886163135861ba3e7471803104f570a75368...


In [22]:
def predictions(eval_data):
    
    #data_path = './data/output.csv'
    #domain_path = './data/Domain_info.csv'
    sen_obj_data, fake_data, sources = load_data(eval_data,domain_info, tokenzs)
    obj_score = l_pipeline.predict_proba(sen_obj_data)[:,0]
    #print(len(fake_data))
    
    lst = []
    for i in range(eval_data.shape[0]):
        lines = eval_data['URL'][i]
        lines = lines.replace("https://","")
        lines = lines.replace("www.", "") # May replace some false positives ('www.com')
        urls = [url.split('/')[0] for url in lines.split()]
        z = '\n'.join(urls)
        #print(z)
        lst.append(z)
        
    eval_data['source_url'] = lst
    
    final_coeff = {
        'obj_score_coeff' : 0.3,
        'fake_score_coeff' : 0.35,
        'bias_score_coeff' : 0.25,
        'google_rank_coeff' : 0.1,
        'alexa_rank_coeff': -1*0.000001
    }
    
    fake_score = l_model.predict(fake_data)
    eval_data['obj_score'] = 0. 
    eval_data['fake_score'] = 0.
    eval_data['bias_score'] = 0.
    eval_data['google_rank'] = 0.
    eval_data['alexa_rank'] = 0.
    eval_data['news_credibility'] = 0.
    #bias_score = source_score(source)
    #print("objective score, Fake Score, Bias Score ")
    for i in range(len(obj_score)):
        #print(i, obj_score[i],fake_score[i][0],source_score(sources[i]))
        eval_data['obj_score'][i] = obj_score[i]
        eval_data['fake_score'][i] = fake_score[i]
        eval_data['bias_score'][i] = source_score(sources[i])
        eval_data['google_rank'][i], eval_data['alexa_rank'][i] = url_rank(eval_data['source_url'][i]) 
        final_score = eval_data['obj_score'][i]*final_coeff['obj_score_coeff'] +  eval_data['fake_score'][i]*final_coeff['fake_score_coeff'] +  eval_data['bias_score'][i]*final_coeff['bias_score_coeff'] +  eval_data['google_rank'][i]*final_coeff['google_rank_coeff'] +  eval_data['alexa_rank'][i]*final_coeff['alexa_rank_coeff']
        eval_data['news_credibility'][i] = float(final_score)
    #print(eval_data)
    return eval_data
        
    
    #print(len(fake_score))
        

In [89]:
start_time = time.time()
eval100 = predictions(eval_data)
print("--- %s seconds ---" % (time.time() - start_time))

--- 17.429508686065674 seconds ---


In [90]:
eval100

Unnamed: 0,Content,Description,LastUpdated,Source,Title,URL,_id,_index,_score,_type,...,uid,headline,text,source_url,obj_score,fake_score,bias_score,google_rank,alexa_rank,news_credibility
0,This article is about the company as a whole. ...,,2020-05-28T06:34:24,Financial Post,SNC-Lavalin,https://en.wikipedia.org/wiki/SNC-Lavalin,75,news_test_bow1,0.0,data,...,a14e7e085e957a8d679aeb2bfa8eb8346a932d85b34ca1...,SNC Lavalin,This article is about the company as a whole. ...,en.wikipedia.org,0.578636,0.992900,0.6,0.9,13.0,0.760976
1,"Airbus admits to paying at least 340,000 euros...",A settlement reached by Airbus with financial ...,2020-05-28T06:34:24,Financial Post,"Airbus admits to paying at least 340,000 euros...",https://kathmandupost.com/national/2020/02/04/...,76,news_test_bow1,0.0,data,...,2b213a065845d0e58ea019e6962dcfb88fcafd5b324d1d...,Airbus admits to paying at least euros in brib...,Airbus admits to paying at least euros in brib...,kathmandupost.com,0.929262,0.056451,0.6,0.0,0.0,0.448536
2,Image copyright Getty Images Image caption The...,The singer reportedly bribed an Illinois offic...,2020-05-28T06:34:24,MTV News (UK),R. Kelly faces bribery charge over 1994 marria...,https://www.bbc.co.uk/news/entertainment-arts-...,80,news_test_bow1,0.0,data,...,70cbdb2ee156cf93d8b971094b6fa47e910e65446f1750...,R. Kelly faces bribery charge over marriage to...,Image copyright Getty Images Image caption The...,bbc.co.uk,0.546659,0.001768,0.5,0.9,117.0,0.378447
3,Australia will need to brace for a 'concerning...,Stories in Fraud include: Netanyahu becomes th...,2020-05-28T06:34:24,Business Insider,Business Insider,https://www.businessinsider.com.au/category/fraud,93,news_test_bow1,0.0,data,...,00611f5ab5c100341de6f983ed8b7f9bd0438135e5242d...,Business Insider,Australia will need to brace for a concerning ...,businessinsider.com.au,0.965348,0.937896,0.0,0.0,0.0,0.617868
4,Cyberattacks involving financial fraud saw a h...,100% increase in recorded phishing attempts li...,2020-05-28T06:34:24,TechRadar,New financial fraud attacks detected 'every tw...,https://www.techradar.com/in/news/new-financia...,95,news_test_bow1,0.0,data,...,8ffeacaa43ef4b5ceee136a987b688c1c4c8c987217026...,New financial fraud attacks detected every two...,Cyberattacks involving financial fraud saw a h...,techradar.com,0.875462,0.961106,0.5,0.0,0.0,0.724026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,More headlines\nIndia's Glenmark to study pote...,"Latest coverage of hackers, cyber crime, ident...",2020-05-26T12:41:15,Reuters,Cyber Crime,https://in.reuters.com/subjects/cyber-crime,512,news_test_bow1,0.0,data,...,79ee3221dba9ca3e0fa3f67ca283028f4e68c55b503907...,Cyber Crime,More headlines India Glenmark to study potenti...,in.reuters.com,0.691138,0.999913,1.0,0.8,418.0,0.883131
96,There’s Another Video of the Killing of Ahmaud...,Everything with the topic 'Crime' on VICE,2020-05-26T12:41:15,Vice News,crime,https://www.vice.com/en_uk/topic/crime,520,news_test_bow1,0.0,data,...,aad28c503d844ef9682daca5331daaa97d5ae6a17febae...,crime,There s Another Video of the Killing of Ahmaud...,vice.com,0.849551,0.000487,0.6,0.0,0.0,0.405036
97,Jeff Brantingham is as close as it gets to put...,Pentagon-funded research aims to predict when ...,2020-05-26T12:41:15,The Verge,A pioneer in predictive policing is starting a...,https://www.theverge.com/2018/4/26/17285058/pr...,522,news_test_bow1,0.0,data,...,0142c6f308b018f9dd963d9d8b68dcbcd0410659c30a5c...,A pioneer in predictive policing is starting a...,Jeff Brantingham is as close as it gets to put...,theverge.com,0.397209,0.880805,0.6,0.0,0.0,0.577444
98,The National Crime Records Bureau (NCRB) publi...,NCRB publishes Crime in India Report 2018 with...,2020-05-26T12:41:15,The Hindu,Uttar Pradesh tops list in crimes against women,https://www.thehindu.com/news/national/uttar-p...,524,news_test_bow1,0.0,data,...,03c276629b886163135861ba3e7471803104f570a75368...,Uttar Pradesh tops list in crimes against women,The National Crime Records Bureau NCRB publish...,thehindu.com,0.814543,0.998138,0.6,0.8,844.0,0.815271


In [23]:
start_time = time.time()
data1000 = predictions(eval_data1000)
print("--- %s seconds ---" % (time.time() - start_time))

--- 147.84489107131958 seconds ---


In [24]:
eval_data1000

Unnamed: 0.1,Unnamed: 0,Content,Description,LastUpdated,Source,Title,URL,_id,_index,_score,...,uid,source_url,headline,text,obj_score,fake_score,bias_score,google_rank,alexa_rank,news_credibility
0,0,Process of transforming profits of crime and c...,,2020-05-28T06:34:24,Time,Money laundering,https://en.wikipedia.org/wiki/Money_laundering...,36,news_test_bow1,0.0,...,65cc33ddc47ab67c872bdf787d118aaa845ad79b23c1c5...,en.wikipedia.org,Money laundering,Process of transforming profits of crime and c...,0.781490,0.007826,0.0,0.9,13.0,0.327173
1,1,This was not a good week for Challenger Bank R...,,2020-05-28T06:34:24,TechCrunch,Revolut CFO returns after money laundering con...,https://hitechglitz.com/revolut-cfo-returns-af...,37,news_test_bow1,0.0,...,1677842fe491afc57027e7024b0ee7b61301fcb22e79aa...,hitechglitz.com,Revolut CFO returns after money laundering con...,This was not a good week for Challenger Bank R...,0.700519,0.972691,0.6,0.0,0.0,0.700598
2,2,Our robot colleague Satoshi Nakaboto writes ab...,,2020-05-28T06:34:24,The Next Web,Satoshi Nakaboto: ‘Man charged with operating ...,https://thenextweb.com/hardfork/2020/02/14/sat...,41,news_test_bow1,0.0,...,a3fa9c48b73ea48f32fc2bbc418a189050619597079268...,thenextweb.com,Satoshi Nakaboto Man charged with operating mo...,Our robot colleague Satoshi Nakaboto writes ab...,0.221100,0.047525,0.6,0.0,0.0,0.232964
3,3,Seven people have been arrested in Northern Ir...,Seven people have been arrested in Northern Ir...,2020-05-28T06:34:24,RTE,Seven arrested in £215m money laundering inves...,https://www.rte.ie/news/ulster/2020/0121/11098...,42,news_test_bow1,0.0,...,a076d07e5f76caa2397dd103b470c14c5bd2c614381bab...,rte.ie,Seven arrested in money laundering investigation,Seven people have been arrested in Northern Ir...,0.870310,0.999695,1.0,0.0,0.0,0.860986
4,4,This article is about the company as a whole. ...,,2020-05-28T06:34:24,Financial Post,SNC-Lavalin,https://en.wikipedia.org/wiki/SNC-Lavalin,75,news_test_bow1,0.0,...,a14e7e085e957a8d679aeb2bfa8eb8346a932d85b34ca1...,en.wikipedia.org,SNC Lavalin,This article is about the company as a whole. ...,0.578636,0.992900,0.6,0.9,13.0,0.761093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,"Overall, violent crime has gone down 16 percen...",Shootings and homicides appear little affected...,2020-05-26T12:41:15,The Washington Post,"D.C. seeing drop in robberies during pandemic,...",https://www.washingtonpost.com/local/public-sa...,974,news_test_bow1,0.0,...,077f8193b0025660ae7a2b9e952283b5a4ec3df7b4a460...,washingtonpost.com,D.C. seeing drop in robberies during pandemic ...,Overall violent crime has gone down percent si...,0.642777,0.629371,0.6,0.9,175.0,0.652938
996,996,"In the early '90s, I grew up in a town called ...",Warning: includes details some readers may fin...,2020-05-26T12:41:15,Buzzfeed,23 Creepy True Crime Stories That'll Make You ...,https://www.buzzfeed.com/norbertobriceno/true-...,982,news_test_bow1,0.0,...,0e01f6da56e676b715e7c88076e230341a8adcaf377873...,buzzfeed.com,Creepy True Crime Stories That Make You Gasp O...,In the early I grew up in a town called Wellin...,0.774772,0.026507,0.6,0.8,449.0,0.471260
997,997,"Tiger King: Murder, Mayhem, and Madness has be...",If you binge-watched Netflix's Tiger King as q...,2020-05-26T12:41:15,IGN,Best True Crime Documentaries to Watch After T...,https://www.ign.com/articles/best-true-crime-d...,990,news_test_bow1,0.0,...,9697ab5ff62bca300980607c89b30e97d42e3fd0aada50...,ign.com,Best True Crime Documentaries to Watch After T...,Tiger King Murder Mayhem and Madness has been ...,0.562538,0.000039,0.5,0.0,0.0,0.293775
998,998,"The Hill 1625 K Street, NW Suite 900 Washingto...","The Hill is a top US political website, read b...",2020-05-26T06:59:20,The Hill,Crime,https://thehill.com/social-tags/crime,994,news_test_bow1,0.0,...,fb82c11aa09961dc711d0564188f7eca8c0f9ffb38ec7a...,thehill.com,Crime,The Hill K Street NW Suite Washington DC tel f...,0.946355,0.224599,0.6,0.8,1200.0,0.591316


In [93]:
eval_data.to_csv('final_scores.csv', index = False)

In [11]:
final = pd.read_csv('final_scores.csv')

In [25]:
eval_data1000['news_credibility'].describe()

count    1000.000000
mean        0.550301
std         0.195811
min         0.128042
25%         0.397178
50%         0.533070
75%         0.723461
max         0.960369
Name: news_credibility, dtype: float64

In [30]:
eval_data1000['text'][997]

'Tiger King Murder Mayhem and Madness has been the talk of Twitter since it premiered on March hooking people on the audacious antics of Joe Exotic. He s the gay polyamorous one time presidential candidate and former owner of tigers ligers and other wild cats whose chaotic feud with a rival big cat enthusiast leads to his WTF downfall and he already inspired Jared Leto to dress up as him to host a Tiger King watch party and Dax Shepard and Edward Norton to fan cast themselves in the inevitable movie adaptation of his life. Best True Crime Shows and Movies to Watch Online IMAGES Evil Genius The True Story of America s Most Diabolical Bank Heist The Jinx The Life and Deaths of Robert Durst Wild Wild Country Gringo The Dangerous Life of John McAfee McMillions Icarus Mommy Dead and Dearest White Boy Stranger Than Fiction The Nanny Killers American Vandal If you binge watched Tiger King as quickly as we did then you might be looking for more of the best true crime documentaries to watch so 

In [31]:
eval_data1000['text'][999]

'Google employees left their offices in Kitchener Ont. Toronto Ont. and Montreal Qc as part of an international walkout protesting the company treatment of women and alleged handling of sexual misconduct allegations. At its Canadian engineering and development headquarters in Kitchener employees reversed the protest and instead staged a walk in from inside the company lobby. When media tried to enter the building an unnamed employee met them at the door and confirmed that employees were gathering as part of the worldwide Walkout for Real Change. But he said that Google was handling this walk in as an internal affair and that no employee would be speaking to the media. Reporters were then asked to remove themselves from the company property. Montreal Toronto walkouts About two dozen workers left the tech giant Montreal office and at least one hundred exited the company Toronto office just after a.m. local time. Those in Toronto walked to a nearby park where they spent about minutes read