# Combine the Datasets

In [59]:
import pandas as pd

fake = pd.DataFrame.from_csv('./fake.csv',index_col=None)
real = pd.DataFrame.from_csv('real_news_data.csv')
real = real.iloc[:,:real.shape[1]-2]

In [60]:
fake = pd.DataFrame({"text": fake.text,
                    "title": fake.title,
                    "website": fake.site_url,
                    "authors": fake.author,
                    "image_url": fake.main_img_url,
                    "published_date": fake.published,
                    }, columns = ["text", "title","website","authors",
                                 "image_url","published_date"])

fake.head()

Unnamed: 0,text,title,website,authors,image_url,published_date
0,Print They should pay all the back all the mon...,Muslims BUSTED: They Stole Millions In Gov’t B...,100percentfedup.com,Barracuda Brigade,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,2016-10-26T21:41:00.000+03:00
1,Why Did Attorney General Loretta Lynch Plead T...,Re: Why Did Attorney General Loretta Lynch Ple...,100percentfedup.com,reasoning with facts,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,2016-10-29T08:47:11.259+03:00
2,Red State : \nFox News Sunday reported this mo...,BREAKING: Weiner Cooperating With FBI On Hilla...,100percentfedup.com,Barracuda Brigade,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,2016-10-31T01:41:49.479+02:00
3,Email Kayla Mueller was a prisoner and torture...,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,100percentfedup.com,Fed Up,http://100percentfedup.com/wp-content/uploads/...,2016-11-01T05:22:00.000+02:00
4,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,100percentfedup.com,Fed Up,http://100percentfedup.com/wp-content/uploads/...,2016-11-01T21:56:00.000+02:00


In [61]:
real.head()

Unnamed: 0,text,title,website,authors,image_url,published_date
0,Challenges for a nut-free diet\n\nIf you’re co...,Nut-free recipes and information,http://www.bbc.co.uk,[],http://static.bbci.co.uk/food/1.37.152/assets/...,
1,Lamb in Britain is called lamb if it’s markete...,Lamb recipes,http://www.bbc.co.uk,[],http://static.bbci.co.uk/food/1.37.152/assets/...,
2,This site is optimised for modern web browsers...,Irish cream and chocolate cheesecake,http://www.bbc.co.uk,"[Simon Rimmer, Mary Berry, Rob Burns]",http://ichef.bbci.co.uk/food/ic/food_16x9_448/...,
3,Preheat the oven to 180C/350F/Gas 4. Grease an...,Easy chocolate cake,http://www.bbc.co.uk,"[Rachel Manley, James Martin, The Hairy Bikers...",http://ichef.bbci.co.uk/food/ic/food_16x9_448/...,
4,Preheat the oven to 180C/350F/Gas 4 and lin 2 ...,Chocolate fairy cakes,http://www.bbc.co.uk,"[Mary Berry, Sarah Brown, Harvey Bertram-brown]",http://ichef.bbci.co.uk/food/ic/food_16x9_448/...,


In [64]:
import numpy as np

#randomly select same number from fake news as there is real
np.random.seed(442540)
fake_sample = np.random.randint(0,size=real.shape[0],high=fake.shape[0])

real = real.append(fake.iloc[fake_sample,:])

In [65]:
real.shape[0]

2092

# Add Num Typos Feature

In [66]:
import requests

In [140]:
########### Python 3.2 #############
import requests
import json
import urllib

headers = {
    # Request headers
    'Content-Type': 'application/x-www-form-urlencoded',
    'Ocp-Apim-Subscription-Key': '4d1f19cfa84c47a7851a1dd01e72443f',
}

params = urllib.parse.urlencode({
    # Request parameters
    'text': list(real.text)[2050][:500],
})

try:
    r = requests.post('https://api.cognitive.microsoft.com/bing/v5.0/spellcheck?%s' % params, "{body}",headers=headers)
    data = json.loads(r.text)
    print(data)
    
except Exception as e:
    print("[Errno {0}] {1}".format(e.errno, e.strerror))

####################################

{'_type': 'SpellCheck', 'flaggedTokens': []}


In [141]:
data['flaggedTokens']

[]

In [145]:
from json import JSONDecodeError

typo_counts = []

for t in real.text:
    
    params = urllib.parse.urlencode({
    # Request parameters
    'text': str(t)[:1000],
    })
    
    r = requests.post('https://api.cognitive.microsoft.com/bing/v5.0/spellcheck?%s' % params, "{body}",headers=headers)
    
    try:
        data = json.loads(r.text)
        typo_counts.append(len(data['flaggedTokens']))
        
    except (KeyError, JSONDecodeError):
        typo_counts.append(0)

In [146]:
set(typo_counts)

{0, 1, 2, 3, 4, 5, 6, 7, 9}

In [147]:
real['typo_counts'] = typo_counts

# Read in Subjectivity Lexicon

In [149]:
with open('./subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff') as tff:
    lines = tff.readlines()
    
tff.close()

In [158]:
import re

subj_frame = pd.DataFrame(columns=['type','len','word1','pos1','stemmed1','polarity','priorpolarity'],index=range(len(lines)))

i = 0
for line in lines:
    
    row = []
    
    line = line.strip()
    line_items = line.split(sep=' ')
    
    item_dict = {}
    
    for item in line_items:
        
        if '=' in item:
            pref_suff = item.split('=')
        
            item_dict[pref_suff[0]] = pref_suff[1]
    
        for col in subj_frame.columns:
            if col not in list(item_dict.keys()):
                item_dict[col] = None
    
    try:
        subj_frame.loc[i] = item_dict
        i += 1
    except ValueError:
        print(line)
        break
    

In [160]:
subj_frame.head()

Unnamed: 0,type,len,word1,pos1,stemmed1,polarity,priorpolarity
0,weaksubj,1,abandoned,adj,n,,negative
1,weaksubj,1,abandonment,noun,n,,negative
2,weaksubj,1,abandon,verb,y,,negative
3,strongsubj,1,abase,verb,y,,negative
4,strongsubj,1,abasement,anypos,y,,negative


In [180]:
subj_frame.word1 = [word.strip() for word in subj_frame.word1]

set(subj_frame.priorpolarity)

{'both', 'negative', 'neutral', 'positive', 'weakneg'}

# Calculate Sentiment and Subjectivity Scores

In [192]:
#use this lexicon to add subjectivity scores
from nltk.tokenize import word_tokenize
import math
from stop_words import get_stop_words

subjectivities = []
positives = []
negatives = []

stop_words = get_stop_words('en')

for i in real.index:
    
    subj_score = 0
    neg_score = 0
    pos_score = 0
    
    text = str(real.loc[i].text)
    
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    
    intersection = []
    for token in tokens:
        if token.strip() in list(subj_frame.word1):
            intersection.append(token.strip())
            #print(token.strip())
    
    for item in intersection:
        
        subj_data = subj_frame.loc[subj_frame.word1 == str(item)]
        
        if 'strongsubj' in subj_data.type:
            subj_score += 1
            
        else:
            subj_score += .5
            
        if len(subj_data.priorpolarity) > 1:
            pol_list = list(subj_data.priorpolarity)
            pol_list = [s.strip() for s in pol_list]
            
        else:
            pol_list = str(subj_data.priorpolarity).strip()
        
        if 'both' in pol_list:
            
            neg_score += .5
            pos_score += .5
            
        elif 'negative' in pol_list:
            neg_score += 1
            
        elif 'positive' in pol_list:
            pos_score += 1
            
        elif 'weakneg' in pol_list:
            neg_score += .5
            
    subj_score = subj_score / (len(tokens) + 1)
    pos_score = pos_score / (len(tokens) + 1)
    neg_score = neg_score / (len(tokens) + 1)
    
    subjectivities.append(subj_score)
    positives.append(pos_score)
    negatives.append(neg_score)
    

In [195]:
max(subjectivities)

0.16666666666666666

In [196]:
real['text_subjectivity'] = subjectivities
real['text_positivity'] = positives
real['text_negativity'] = negatives

# Sentiment Intensity Analysis of Titles

In [197]:
from nltk.sentiment import vader

In [201]:
vader_data = []

analyzer = vader.SentimentIntensityAnalyzer()

for title in real.title:
    
    vader_data.append(analyzer.polarity_scores(str(title)))

In [203]:
vader_frame = pd.DataFrame(vader_data)
vader_frame.neu[:20]

0     1.000
1     1.000
2     1.000
3     0.408
4     1.000
5     1.000
6     1.000
7     0.328
8     1.000
9     1.000
10    1.000
11    1.000
12    1.000
13    1.000
14    1.000
15    1.000
16    1.000
17    0.746
18    1.000
19    0.504
Name: neu, dtype: float64

In [204]:
real['title_neutrality'] = vader_frame.neu

In [217]:
fake_or_real = ['real']* 1046
fake_or_real = fake_or_real + (['fake'] * 1046)
real['fake_or_real'] = fake_or_real

In [227]:
import re

real.authors = [re.sub('\[|\]','',str(author)) for author in real.authors]

In [228]:
real.head()

Unnamed: 0,text,title,website,authors,image_url,published_date,typo_counts,text_subjectivity,text_positivity,text_negativity,title_neutrality,fake_or_real
0,Challenges for a nut-free diet\n\nIf you’re co...,Nut-free recipes and information,http://www.bbc.co.uk,,http://static.bbci.co.uk/food/1.37.152/assets/...,,0,0.059809,0.038278,0.043062,1.0,real
1,Lamb in Britain is called lamb if it’s markete...,Lamb recipes,http://www.bbc.co.uk,,http://static.bbci.co.uk/food/1.37.152/assets/...,,0,0.050186,0.05948,0.02974,1.0,real
2,This site is optimised for modern web browsers...,Irish cream and chocolate cheesecake,http://www.bbc.co.uk,"Simon Rimmer, Mary Berry, Rob Burns",http://ichef.bbci.co.uk/food/ic/food_16x9_448/...,,1,0.115385,0.153846,0.0,1.0,real
3,Preheat the oven to 180C/350F/Gas 4. Grease an...,Easy chocolate cake,http://www.bbc.co.uk,"Rachel Manley, James Martin, The Hairy Bikers,...",http://ichef.bbci.co.uk/food/ic/food_16x9_448/...,,0,0.075949,0.075949,0.044304,0.408,real
4,Preheat the oven to 180C/350F/Gas 4 and lin 2 ...,Chocolate fairy cakes,http://www.bbc.co.uk,"Mary Berry, Sarah Brown, Harvey Bertram-brown",http://ichef.bbci.co.uk/food/ic/food_16x9_448/...,,0,0.038462,0.067308,0.009615,1.0,real


In [229]:
real.to_csv('./training_data.csv')