## Examine data and create features

- Dataset from https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset

In [1]:
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import random as rn
import numpy as np
import pandas as pd

true_data = pd.read_csv("~/FakeNewsProject/data/s_1/True.csv")
false_data = pd.read_csv("~/FakeNewsProject/data/s_1/Fake.csv") 

combined_df = pd.concat([true_data, false_data])
labels_np = np.concatenate([np.ones(len(true_data), dtype=np.int), np.zeros(len(false_data), dtype=np.int)])

labels_df = pd.DataFrame(data=labels_np, index=np.arange(0,(len(labels_np))), columns=['label'])
combined_df['label'] = labels_df

combined_df.head(10)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017",1
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017",1
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017",1
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017",1
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017",1


In [2]:
# Counting by Subjects 
for key,count in combined_df.subject.value_counts().iteritems():
    print(f"{key}:\t{count}")
    
# Getting Total Rows
print(f"Total Records:\t{combined_df.shape[0]}")

politicsNews:	11272
worldnews:	10145
News:	9050
politics:	6841
left-news:	4459
Government News:	1570
US_News:	783
Middle-east:	778
Total Records:	44898


In [3]:
# get rid of outliers
filtered_list = [i for i in range(0,len(combined_df)) if len(combined_df.iloc[i]['text'])> 50]
filtered_df = combined_df.iloc[filtered_list]
print(len(filtered_df))

44061


### NLTK Libraries

In [4]:
 # download stop words
    
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('brown')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/engineer6080/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/engineer6080/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     /home/engineer6080/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/engineer6080/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
from nltk.corpus import wordnet as wn
from nltk.corpus import brown
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from collections import defaultdict
import re


In [6]:
sample_text = (filtered_df.iloc[[0]]['text'])[0]
text_sentences = sent_tokenize(sample_text)
print(text_sentences[0])

WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018.


In [7]:
# remove punctuation
cleaned = re.sub(r'[^(a-zA-Z)\s]','', text_sentences[0])
print(cleaned)

WASHINGTON (Reuters)  The head of a conservative Republican faction in the US Congress who voted this month for a huge expansion of the national debt to pay for tax cuts called himself a fiscal conservative on Sunday and urged budget restraint in 


In [8]:
ps = PorterStemmer()
sentence_stem = ps.stem(cleaned)
print(sentence_stem)

washington (reuters)  the head of a conservative republican faction in the us congress who voted this month for a huge expansion of the national debt to pay for tax cuts called himself a fiscal conservative on sunday and urged budget restraint in 


In [9]:
words = word_tokenize(sentence_stem)
print(words)

['washington', '(', 'reuters', ')', 'the', 'head', 'of', 'a', 'conservative', 'republican', 'faction', 'in', 'the', 'us', 'congress', 'who', 'voted', 'this', 'month', 'for', 'a', 'huge', 'expansion', 'of', 'the', 'national', 'debt', 'to', 'pay', 'for', 'tax', 'cuts', 'called', 'himself', 'a', 'fiscal', 'conservative', 'on', 'sunday', 'and', 'urged', 'budget', 'restraint', 'in']


In [10]:
filtered_words = [word.lower() for word in words if word.isalpha()]
print(filtered_words)

['washington', 'reuters', 'the', 'head', 'of', 'a', 'conservative', 'republican', 'faction', 'in', 'the', 'us', 'congress', 'who', 'voted', 'this', 'month', 'for', 'a', 'huge', 'expansion', 'of', 'the', 'national', 'debt', 'to', 'pay', 'for', 'tax', 'cuts', 'called', 'himself', 'a', 'fiscal', 'conservative', 'on', 'sunday', 'and', 'urged', 'budget', 'restraint', 'in']


In [11]:
   
''' 
    Takes in sentence as input
    ex: text_sentences = sent_tokenize(text_body)
'''
def filterWords(sentence):
    ps = PorterStemmer()
    # remove punctuation
    cleaned = re.sub(r'[^(a-zA-Z)\s]','', sentence)
    sentence_stem = ps.stem(cleaned)
    words = word_tokenize(sentence_stem)
    # remove brackets, etc.
    filtered_words = [word.lower() for word in words if word.isalpha()]
    return filtered_words
    

In [12]:
%%time

test_words = filterWords(text_sentences[0])
print(test_words, end="\n\n")

['washington', 'reuters', 'the', 'head', 'of', 'a', 'conservative', 'republican', 'faction', 'in', 'the', 'us', 'congress', 'who', 'voted', 'this', 'month', 'for', 'a', 'huge', 'expansion', 'of', 'the', 'national', 'debt', 'to', 'pay', 'for', 'tax', 'cuts', 'called', 'himself', 'a', 'fiscal', 'conservative', 'on', 'sunday', 'and', 'urged', 'budget', 'restraint', 'in']

CPU times: user 306 µs, sys: 7 µs, total: 313 µs
Wall time: 301 µs


In [13]:
'''
    Takes in sentence
    Output ambiguity score 
'''
def ambiguity(sentence):
    ambiguousCount = 0
    for word in sentence:
        ambiguousCount += len(wn.synsets(word))
        
    return ambiguousCount

In [14]:
%%time

ambCount = ambiguity(test_words)

print(ambCount, end="\n\n")

261

CPU times: user 988 ms, sys: 24.1 ms, total: 1.01 s
Wall time: 1.01 s


In [15]:
'''
    input sentence
    out: % unique words
'''

def lexical_diversity(sentence):
        return len(set(sentence)) / len(sentence)

In [16]:
%%time

lexCount = lexical_diversity(test_words)

print(lexCount*100, end="\n\n")

80.95238095238095

CPU times: user 0 ns, sys: 74 µs, total: 74 µs
Wall time: 56.7 µs


In [17]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def analyzeSentiment(sentence):
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence) # test_words
    
    return [ss['neg'], ss['neu'], ss['pos'], ss['compound']]

for s in text_sentences:
    print(s[0:90])
    ss = analyzeSentiment(s)
    print(ss)
    '''
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    '''

    print("\n")


WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress,
[0.137, 0.811, 0.052, -0.4215]


In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadow
[0.108, 0.892, 0.0, -0.4588]


When they return from the holidays on Wednesday, lawmakers will begin trying to pass a fed
[0.052, 0.896, 0.052, 0.0]


President Donald Trump and his Republicans want a big budget increase in military spending
[0.0, 0.808, 0.192, 0.6808]


“The (Trump) administration has already been willing to say: ‘We’re going to increase non-
[0.0, 0.733, 0.267, 0.9062]


“Now, Democrats are saying that’s not enough, we need to give the government a pay raise o
[0.069, 0.931, 0.0, -0.1027]


For a fiscal conservative, I don’t see where the rationale is.
[0.0, 1.0, 0.0, 0.0]


...
[0.0, 1.0, 0.0, 0.0]


Eventually you run out of other people’s money,” he said.
[0.0, 1.0, 0.0, 0.0]


Meadows was among Republicans who voted in late December for their party’s d

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from pandas.core.common import flatten

article_num = 4
article_words = list(flatten([filterWords(s) for s in sent_tokenize(filtered_df.iloc[article_num]['text'])]))
title_words = filterWords((filtered_df.iloc[article_num]['title']))
    
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(article_words)
#print(vectorizer.get_feature_names())
print("Title size:", len(title_words), "Article size:", len(article_words))
print("Vectorizer shape:", X.shape)

out_arr = vectorizer.transform(title_words).toarray()
print(np.average(out_arr))

Title size: 11 Article size: 827
Vectorizer shape: (827, 369)
0.002463661000246366


In [19]:
def tfidVectorizer(sentence_list, title):
    article_words = list(flatten(sentence_list))
    title_words = filterWords(title)
    
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(article_words)
    out_arr = vectorizer.transform(title_words).toarray()
    
    return np.average(out_arr)

In [21]:
len(filtered_df)

44061

In [20]:
import os
from multiprocessing import Pool, Process, Manager, Lock

PROCS = os.cpu_count()
print("Processors:", PROCS)

manager = Manager()
pLock = Lock()
shared_dict = manager.dict()

quant = len(filtered_df)
#quant = 5000

sliceSize = int(quant/(PROCS))
sliceRemain = quant%(PROCS)

print(sliceSize, sliceRemain)

Processors: 16
2753 13


In [22]:

def preProcessData(myid, rng, filtered_df, shared_dict):
    
    pLock.acquire()
    print("ID:", myid, "START:", rng.start, "STOP:", rng.stop)
    pLock.release()

    output_list = []
    err_list = []
    sentence_list = []
    # title, text, subject, date 

    for i in rng:
        text_body = (filtered_df.iloc[i]['text'])
        text_sentences = sent_tokenize(text_body)

        feature_dict = {}
        
        amb_list = []
        lex_list = []
        sentiment_list = []
        
        try:

            # df columns [ambiguity, lexical_diversity, sentiment]
            words_list = []
            for s in text_sentences:
                #if(len(s) > 5):
                cleaned_words = filterWords(s)
                words_list.append(cleaned_words)

                amb_list.append(ambiguity(cleaned_words))
                lex_list.append(lexical_diversity(cleaned_words))
                sentiment_list.append(analyzeSentiment(s))

        
            sentence_list.append(words_list)
            
            feature_dict['tfid'] = tfidVectorizer(words_list, filtered_df.iloc[i]['title'])
    
            feature_dict['id'] = i
            feature_dict['ambg'] = np.mean(amb_list)
            feature_dict['lex_div'] = np.mean(lex_list)
            
            mean_sentiment = np.mean(np.array(sentiment_list), axis=0)
            feature_dict['neg'] = mean_sentiment[0]
            feature_dict['neu'] = mean_sentiment[1]
            feature_dict['pos'] = mean_sentiment[2]
            feature_dict['compound'] = mean_sentiment[3]
            feature_dict['label'] =  (filtered_df.iloc[i]['label'])
            
        except Exception as e:
            
            feature_dict['id'] = i
            feature_dict['ambg'] = 0
            feature_dict['lex_div'] = 0
            feature_dict['neg'] = 0
            feature_dict['neu'] = 0
            feature_dict['pos'] = 0
            feature_dict['compound'] = 0
            feature_dict['tfid'] = 0 
            feature_dict['label'] = -1
            
            
            err_str = "id: " + str(myid) + "iter: " + str(i) + "err: " + str(e)
            err_list.append(err_str)
        
        output_list.append(feature_dict)
    
    pLock.acquire()
    shared_dict[myid] = output_list
    shared_dict[(str(myid) + "_err")] = err_list
    shared_dict[(str(myid) + "_words")] = sentence_list
    print("ID:", myid, "FIN")
    pLock.release()

In [None]:
'''
    Debugging
    
    shared_dict = {}
    preProcessData(0, range(0,1), filtered_df, shared_dict)
'''

In [23]:
%%time

processes = []

for i in range(0, PROCS):
    
    rng = range( (i*sliceSize), ((i+1)*sliceSize)  ) 
    
    if(i == PROCS-1):
        rng = range( ((i)*sliceSize), ((i+1)*sliceSize)+sliceRemain )
        
    p = Process(target=preProcessData, args=[ i, rng, filtered_df, shared_dict ])
    processes.append(p)
    p.start()
    
    
# Wait for finish
for p in processes:
    p.join()

ID: 0 START: 0 STOP: 2753
ID: 1  START:2753 STOP: 5506
ID:  2START: 5506 STOP: 8259
ID: 3 START: 8259 STOP: 11012
ID: 4 START: 11012 STOP: 13765
ID: 5 START: 13765 STOP: 16518
ID: 6 START: 16518 STOP: 19271
ID: 7 START: 19271 STOP: 22024
ID: 8 START: 22024 STOP: 24777
ID: 9 START: 24777 STOP: 27530
ID: 10 START: 27530 STOP: 30283
ID: 11 START: 30283 STOP: 33036
ID: 12 START: 33036 STOP: 35789
ID: 13 START: 35789 STOP: 38542
ID: 14 START: 38542 STOP: 41295
ID: 15 START: 41295 STOP: 44061
ID: 12 FIN
ID: 14 FIN
ID: 5 FIN
ID: 4 FIN
ID: 7 FIN
ID: 11 FIN
ID: 13 FIN
ID: 6 FIN
ID: 1 FIN
ID: 3 FIN
ID: 10 FIN
ID: 0 FIN
ID: 9 FIN
ID: 2 FIN
ID: 8 FIN
ID: 15 FIN
CPU times: user 78.2 ms, sys: 142 ms, total: 220 ms
Wall time: 4min 3s


In [24]:
len(shared_dict['0_words'])

1728

In [25]:
'''
    Combine
'''

featureList = []

for pid in range(0, PROCS):
    print(len(shared_dict[pid]), end = " ")
    for s_dict in shared_dict[pid]:
        featureList.append(s_dict)
        

2753 2753 2753 2753 2753 2753 2753 2753 2753 2753 2753 2753 2753 2753 2753 2766 

In [36]:
'''
    Combine processed words
'''

paraList = []

for pid in range(0, PROCS):
    idx = (str(pid)+"_words")
    print("num sentences", len(shared_dict[idx]), end = " ")
    for p in shared_dict[idx]:
        paraList.append(p)
        

num sentences 1728 num sentences 1512 num sentences 1543 num sentences 1473 num sentences 1476 num sentences 1438 num sentences 1417 num sentences 1330 num sentences 1429 num sentences 1477 num sentences 1363 num sentences 1376 num sentences 1368 num sentences 1382 num sentences 1430 num sentences 1372 

In [35]:
print(len(paraList), len(featureList))

23114 44061


In [37]:
import pickle

with open('paraList.pkl', 'wb') as f:
    pickle.dump(paraList, f)

In [38]:
final_df = pd.DataFrame(featureList)
final_df = final_df[final_df.label != -1]

final_df.head(10)

Unnamed: 0,id,ambg,lex_div,neg,neu,pos,compound,tfid,label
411,411,129.5,0.958333,0.0,0.9425,0.0575,0.2553,0.025,1
502,502,196.0,0.814815,0.0,1.0,0.0,0.0,0.040909,1
506,506,281.5,0.82187,0.0,0.9225,0.0775,0.4316,0.011544,1
530,530,88.166667,0.953922,0.0,0.946667,0.053333,0.10195,0.012,1
541,541,169.5,0.910401,0.0795,0.8765,0.044,-0.08895,0.015957,1
543,543,85.666667,0.956699,0.0,1.0,0.0,0.0,0.019027,1
548,548,173.666667,0.924731,0.026333,0.973667,0.0,-0.1204,0.017296,1
585,585,100.75,0.906609,0.1385,0.8465,0.015,-0.1332,0.017825,1
629,629,226.5,0.843791,0.0435,0.9565,0.0,-0.26335,0.01455,1
654,654,127.0,0.974138,0.0,0.97625,0.02375,0.056575,0.011785,1


In [39]:
# saving the dataframe 
final_df.to_csv('feature_df.csv', index=False)

In [40]:
# saving the raw combined dataframe 
filtered_df.to_csv('combined_df.csv', index=False)