In [1]:
# cd "D:\University\FIT3162\Project\Fake-News-Detection\Feature Extraction"

In [3]:
import pandas as pd
import numpy as np
import sentiment_vader

#### Sentiment Analysis Pipeline

In [5]:
class FeatureExtraction:
    """
    Pipeline that adds sentiment features (computed with the project's
    ``sentiment_vader`` helpers) to a sample of posts.

    Typical use: ``fe = FeatureExtraction(); fe.build_pipeline()`` followed by
    ``fe.get_post_dataset()``.

    Attributes:
        post_file / comment_file: CSV paths read by ``read_datasets``.
        post_df / comment_df: the loaded DataFrames (``None`` until read).
        comment_scores: dict keyed by post id; each value is a two-element
            list whose first item is divided by the second to get the mean
            comment sentiment, and whose second item is used as the comment
            count (see ``cmnt_sentiment_column``).
    """
    def __init__(self, post_file="cleaned_df.csv", comment_file="cleaned_comments.csv"):
        """
        Args:
            post_file: path of the posts CSV (default keeps the original name).
            comment_file: path of the comments CSV (default keeps the original name).
        """
        self.post_file = post_file
        self.comment_file = comment_file
        self.post_df = None
        self.comment_df = None
        self.comment_scores = {}

    def read_datasets(self):
        """Load both CSV files, using their first column as the index."""
        self.post_df = pd.read_csv(self.post_file, index_col=0)
        self.comment_df = pd.read_csv(self.comment_file, index_col=0)

    def print_statistics(self):
        """Print the post/comment counts and the label split (0 = fake, 1 = true)."""
        labels = self.post_df['2_way_label']
        print("Number of Posts", len(self.post_df))
        print("Number of Comments", len(self.comment_df))
        print("Number of Fake Posts", int((labels == 0).sum()))
        print("Number of True Posts", int((labels == 1).sum()))

    def get_sample_posts(self, sample_size):
        """
        Replace ``post_df`` with a reproducible random sample of ``sample_size``
        rows (fixed ``random_state``) and renumber the index from 0.
        """
        self.post_df = self.post_df.sample(sample_size, random_state=1).reset_index(drop=True)

    def filter_comments(self):
        """Keep only the comments whose ``submission_id`` is one of the sampled post ids."""
        ids = self.post_df['id'].unique()
        kept = self.comment_df[self.comment_df['submission_id'].isin(ids)]
        self.comment_df = kept.reset_index(drop=True)

    def build_comment_score(self):
        """
        Create one ``[0, 0]`` accumulator per post id and let
        ``sentiment_vader.build_comment_dictionary`` fill them from ``comment_df``.
        """
        # Start from a fresh dict on every call so that ids left over from an
        # earlier run (e.g. with a different sample) cannot leak into this one.
        # The comprehension creates a distinct list per post.
        self.comment_scores = {post_id: [0, 0] for post_id in self.post_df['id']}
        # NOTE(review): presumably the helper adds each comment's score to item 0
        # and increments item 1 -- confirm in sentiment_vader.
        sentiment_vader.build_comment_dictionary(self.comment_df, self.comment_scores)

    def cmnt_sentiment_column(self):
        """
        Add the comment sentiment Column to Post dataset

        Writes two columns onto ``post_df`` (overwriting them if they already
        exist): ``num_comments`` and ``comment_sentiment`` (the accumulated
        score divided by the count, or 0 when a post has no comments).
        """
        # Look every row up by its post id instead of relying on the dict's
        # insertion order matching the row order; this stays correct with
        # duplicate ids or extra keys in comment_scores.
        entries = [self.comment_scores.get(post_id, [0, 0]) for post_id in self.post_df["id"]]

        self.post_df["num_comments"] = [entry[1] for entry in entries]
        self.post_df["comment_sentiment"] = [
            entry[0] / entry[1] if entry[1] > 0 else 0 for entry in entries
        ]

    def post_sentiment_column(self):
        """Add ``post_sentiment``: ``sentiment_vader.post_sentiment`` applied to each title."""
        self.post_df['post_sentiment'] = self.post_df['title'].apply(sentiment_vader.post_sentiment)

    def build_pipeline(self, sample_size=6):
        """
        Run every step in order: read, sample posts, filter comments, score
        comments, then add the comment and post sentiment columns.

        Args:
            sample_size: number of posts to sample (default 6, the value that
                was previously hard-coded).
        """
        print("Step 1: Reading Dataset")
        self.read_datasets()
        print("Step 2: Filter Posts")
        self.get_sample_posts(sample_size)
        print("Step 3: Filter Comments")
        self.filter_comments()
        print("Step 4: Building Comment Score Dictionary")
        self.build_comment_score()
        print("Step 5: Add Comment Score Column")
        self.cmnt_sentiment_column()
        print("Step 6: Add Post Score Column")
        self.post_sentiment_column()
        print("---DONE---")

    def get_post_dataset(self):
        """Return the (possibly sampled and augmented) posts DataFrame."""
        return self.post_df

In [7]:
# Create a pipeline object
feature_extraction = FeatureExtraction()
# Run the whole pipeline: read the datasets, sample the posts, filter the
# comments, then add the comment- and post-sentiment columns
feature_extraction.build_pipeline()

Step 1: Reading Dataset


  mask |= (ar1 == a)


Step 2: Filter Posts
Step 3: Filter Comments
Step 4: Building Comment Score Dictionary


ModuleNotFoundError: No module named 'regex._regex'

In [73]:
# Print Statistics
feature_extraction.print_statistics()

Number of Posts 10000
Number of Comments 820028
Number of Fake Posts 5194
Number of True Posts 4806


In [74]:
post_df = feature_extraction.get_post_dataset()

In [75]:
post_df.head(2)

Unnamed: 0,clean_title,created_utc,domain,id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,comment_sentiment,post_sentiment
0,can i still cash in my brain needs to rest,2017-06-17 12:58:22,self.SubredditSimulator,6hrnqx,20,17,subredditsimulator,Can I still cash in my brain needs to rest,0.9,0,-0.034,0.0
1,how did the wheat say to its son photoshop,2019-07-27 18:58:22,self.SubredditSimulator,cig6ey,20,3,subredditsimulator,How did the wheat say to its son Photoshop?,1.0,0,0.196715,0.0


##### Domain Rank Pipeline

In [76]:
# https://www.kaggle.com/cheedcheed/top1m
# https://github.com/mozilla/cipherscan/tree/master/top1m
import re
from urllib.parse import urlparse
import os
import zipfile

class Alexa:
    '''
    this class provides access to the Alexa ranking of URLs
    usage: create a new instance of this class (ranker = Alexa()) and use the get_rank method

    The ranking file is read once per instance into a dict, so every lookup
    is a constant-time dictionary access.
    '''
    # Rank reported for any domain that is not present in the ranking file.
    UNRANKED = 1000001

    def __init__(self, csv_path='top-1m.csv/top-1m.csv'):
        '''
        Load the ranking file.

        csv_path: path of a CSV whose second comma-separated field is the
                  domain (default: the location that was previously hard-coded).
        Raises OSError (e.g. FileNotFoundError) if the file cannot be opened.
        '''
        # Per-instance mapping; a class-level container would be shared by all
        # instances and grow with every new Alexa() object.
        self.__domain_ranks = {}
        position = 0
        with open(csv_path) as f_csv:
            for line in f_csv:
                fields = line.rstrip("\n").split(",")
                if len(fields) < 2:
                    # blank or malformed line: no domain field to read
                    continue
                # The rank is the 1-based position among the parsed lines.
                position += 1
                domain = re.sub(r'^www\.', '', fields[1])
                # setdefault keeps the best (first) rank when the same domain
                # appears twice, e.g. with and without the "www." prefix.
                self.__domain_ranks.setdefault(domain, position)

    def get_rank(self, url):
        '''
        get_rank returns the alexa rank of the domain of the given URL,
        or 1000001 (Alexa.UNRANKED) if the domain is not in the ranking file.

        url may be given with or without a scheme ("www.cnn.com" works).
        '''
        parsed_url = urlparse(url)
        if parsed_url.scheme == '':
            # Without a scheme urlparse puts the host into .path, so re-parse
            # with one prepended.
            parsed_url = urlparse('http://' + url)
        # .hostname is the lower-cased host without port or credentials.
        domain = re.sub(r'^www\.', '', parsed_url.hostname or '')
        return self.__domain_ranks.get(domain, self.UNRANKED)

In [77]:
alexa = Alexa()

In [78]:
alexa.get_rank('www.cnn.com')

135

In [79]:
def add_domain_rank(df, ranker=None):
    """
    Add a ``domain_rank`` column holding the rank of each row's ``domain``.

    Args:
        df: DataFrame with a ``domain`` column. It is modified in place.
        ranker: optional object with a ``get_rank(url)`` method. When omitted a
            new ``Alexa()`` is created, which re-reads the ranking file.

    Returns:
        The same DataFrame object, now containing ``domain_rank``.
    """
    if ranker is None:
        ranker = Alexa()
    # Map over the single column instead of applying row-wise over the whole
    # frame; this also behaves correctly when df has no rows.
    df['domain_rank'] = df['domain'].map(ranker.get_rank)
    return df

In [80]:
# add_domain_rank writes the 'domain_rank' column onto the frame it is given
# and returns that same frame
post_df = add_domain_rank(post_df)
print("Added Domain Rank Column")

Added Domain Rank Column


##### Sample Post DF

In [81]:
post_df.head(2)

Unnamed: 0,clean_title,created_utc,domain,id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,comment_sentiment,post_sentiment,domain_rank
0,can i still cash in my brain needs to rest,2017-06-17 12:58:22,self.SubredditSimulator,6hrnqx,20,17,subredditsimulator,Can I still cash in my brain needs to rest,0.9,0,-0.034,0.0,1000001
1,how did the wheat say to its son photoshop,2019-07-27 18:58:22,self.SubredditSimulator,cig6ey,20,3,subredditsimulator,How did the wheat say to its son Photoshop?,1.0,0,0.196715,0.0,1000001


In [82]:
# Save Final Dataset (posts plus the sentiment and domain-rank columns) to
# dataset.csv; the 'utf-8-sig' codec writes a UTF-8 byte-order mark at the
# start of the file
post_df.to_csv('dataset.csv', encoding='utf-8-sig')

In [10]:
# # Domain Rank
# def get_alexa_ranking(url):
#     """
#     Get Alexa ranking
    
#     """
#     from bs4 import BeautifulSoup
#     import urllib.request
# #     url='9news.com.au'
#     rank_str =BeautifulSoup(urllib.request.urlopen("https://www.alexa.com/minisiteinfo/" +url),'html.parser').table.a.get_text()
#     try:    
#         rank_int=int(rank_str.replace(',',''))
#     except:
#         rank_int = 1000001
#     return rank_int

In [34]:
# from multiprocess import Pool, Manager
# import sentiment
# if __name__ == '__main__':
#     print("Feature Extraction")
#     num_processors = 6
#     pool = Pool(processes = num_processors)
    
#     manager = Manager()
#     mgr_score = manager.dict()
#     mgr_score.update(comment_scores)
#     df_split = np.array_split(comment_df, num_processors)
#     for data in df_split:
#         pool.apply_async(sentiment.build_comment_dictionary, args = (data, mgr_score, ))
#         print("done")
#     pool.close()
#     pool.join()