In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import glob, os

In [2]:
cd "d:/Semester 6/FIT3162/Fakeddit"

d:\Semester 6\FIT3162\Fakeddit


In [3]:
def remove_unwanted_columns(df, list_remove):
    """
    Return the dataframe without the listed columns.
    :param df: dataframe to strip columns from (not modified)
    :param list_remove: column labels to drop
    :return: new dataframe that no longer contains the listed columns
    """
    return df.drop(columns=list_remove)

In [4]:
def delete_duplicated_posts(df):
    """
    Discard every row whose id occurs more than once in the dataframe.
    No representative of a repeated id is retained: all rows sharing that id
    are removed (keep=False), so only ids that were unique to begin with survive.
    :param df: dataframe of posts or comments with an 'id' column
    :return: dataframe holding only the rows whose id appeared exactly once
    """
    # flag all occurrences of a repeated id, not just the later ones
    repeated_id = df.duplicated(subset=['id'], keep=False)
    return df[~repeated_id]

In [5]:
def delete_empty_posts(df, subset_list):
    """
    Drop the rows that have a missing value (NaN) in any of the given columns.
    :param df: dataframe to clean (not modified)
    :param subset_list: column names that must be non-empty for a row to be kept
    :return: new dataframe without the rows that had NaN in one of those columns
    """
    return df.dropna(axis='index', subset=subset_list)

In [6]:
def delete_removed_comments(df):
    """
    Drop the comments whose body is one of the placeholders "[deleted]" or "[removed]".
    Only exact matches of these two strings are filtered; rows whose body is
    NaN are kept by this function.
    :param df: comment dataframe with a 'body' column
    :return: dataframe without the deleted / removed comments
    """
    placeholders = ["[deleted]", "[removed]"]
    is_placeholder = df['body'].isin(placeholders)
    return df.loc[~is_placeholder]

In [7]:
def remove_subreddits(df):
    """
    Drop the posts that belong to subreddits built around images or photoshopped images.
    Matching against the 'subreddit' column is exact and case-sensitive.
    :param df: post dataframe with a 'subreddit' column
    :return: dataframe without the posts from the excluded subreddits
    """
    image_based = [
        "photoshopbattles",
        "pic",
        "mildlyinteresting",
        "fakealbumcovers",
        "propagandaposters",
        "misleadingthumbnails",
        "pareidolia",
        "psbattle_artwork",
        "fakehistoryporn",
    ]
    keep_row = ~df['subreddit'].isin(image_based)
    return df[keep_row]

In [8]:
# Convert a Unix timestamp (seconds since the epoch) into a naive UTC datetime
def get_date(created):
    """
    Convert a Unix timestamp into a timezone-naive datetime expressed in UTC.
    The conversion is pinned to UTC so the result does not depend on the
    timezone configured on the machine running the notebook.
    :param created: seconds since the Unix epoch (int or float)
    :return: naive datetime.datetime holding the UTC wall-clock time
    """
    utc_moment = dt.datetime.fromtimestamp(created, tz=dt.timezone.utc)
    # strip tzinfo so callers still receive a naive datetime
    return utc_moment.replace(tzinfo=None)

def change_date(df):
    """
    Convert the 'created_utc' column from Unix timestamps to datetimes.
    Values are read as seconds since the epoch and become timezone-naive UTC
    datetimes, independent of the local timezone of the machine. Missing
    values become NaT. The conversion is vectorised and the input dataframe
    is left untouched; use the returned dataframe.
    :param df: dataframe with a numeric 'created_utc' column
    :return: new dataframe whose 'created_utc' column holds datetimes
    """
    created = pd.to_datetime(df["created_utc"], unit="s")
    # assign() builds a new frame, so a filtered slice passed in is never written to
    return df.assign(created_utc=created)

In [9]:
def filter_posts(post_df, min_comments):
    """
    Keep only the posts that have at least the given number of comments.
    :param post_df: post dataframe with a 'num_comments' column
    :param min_comments: inclusive lower bound on 'num_comments'
    :return: dataframe restricted to posts with num_comments >= min_comments
    """
    enough_comments = post_df['num_comments'].ge(min_comments)
    return post_df[enough_comments]

In [10]:
def filter_comments(df, ids):
    """
    Keep only the comments that belong to one of the given submissions.
    :param df: comment dataframe with a 'submission_id' column
    :param ids: collection of submission ids to retain
    :return: dataframe restricted to comments whose submission_id is in ids
    """
    wanted = df['submission_id'].isin(ids)
    return df.loc[wanted]

In [11]:
def read_post_dataset():
    """
    Load the train, public test and validation TSV splits and stack them into one dataframe.
    The paths are relative to the current working directory. The row labels
    of the individual files are kept, so the combined index is not unique.
    :return: dataframe with the rows of train, test_public and validate, in that order
    """
    split_paths = (
        "datasetv2/datasetv2.0/train.tsv",
        "datasetv2/datasetv2.0/test_public.tsv",
        "datasetv2/datasetv2.0/validate.tsv",
    )
    post_df = None
    for path in split_paths:
        split_df = pd.read_csv(path, sep='\t')
        # append each split below the rows gathered so far
        post_df = split_df if post_df is None else pd.concat([post_df, split_df], axis=0)
    return post_df

In [12]:
def read_comment_dataset():
    """
    Load every 'comments/comment*.csv' file and stack them into one dataframe.
    The files are read in sorted (lexicographic) path order so that the row
    order, and therefore the fresh 0..n-1 index, is the same on every run
    and platform.
    :return: dataframe with the rows of all comment files and a fresh integer index
    :raises ValueError: if no comment file matches the pattern
    """
    # glob yields matches in arbitrary, OS-dependent order; sort for reproducibility
    comment_files = sorted(glob.glob(os.path.join('comments', "comment*.csv")))
    if not comment_files:
        raise ValueError("No files matching 'comment*.csv' found in the 'comments' folder")
    return pd.concat((pd.read_csv(path) for path in comment_files), ignore_index=True)

In [13]:
def preprocess_posts(min_comments=20):
    """
    Build the cleaned post dataframe from the raw dataset splits.
    Steps, in order: load the splits, drop the unused columns, drop posts
    without a title, drop posts whose id is duplicated, drop the excluded
    subreddits, keep posts with at least min_comments comments, convert
    'created_utc' to datetimes and renumber the index.
    :param min_comments: minimum number of comments a post needs to be kept (default 20)
    :return: cleaned post dataframe with a fresh 0..n-1 index
    """
    # columns that are not used after cleaning
    remove_col = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'hasImage',
                  'image_url', 'linked_submission_id', '3_way_label', '6_way_label', 'author']
    post_df = read_post_dataset()
    post_df = remove_unwanted_columns(post_df, remove_col)
    post_df = delete_empty_posts(post_df, subset_list=['title'])
    post_df = delete_duplicated_posts(post_df)
    post_df = remove_subreddits(post_df)
    post_df = filter_posts(post_df, min_comments)
    post_df = change_date(post_df)
    # the filters above leave gaps and repeated labels in the index
    return post_df.reset_index(drop=True)

In [14]:
def preprocess_comments(ids):
    """
    Build the cleaned comment dataframe for the given submissions.
    Steps, in order: load the comment files, drop "[deleted]" / "[removed]"
    comments, drop rows missing 'submission_id' or 'body', drop the 'Column1'
    column, keep only comments of the given submissions and renumber the index.
    :param ids: submission ids whose comments should be kept
    :return: cleaned comment dataframe with a fresh 0..n-1 index
    """
    return (
        read_comment_dataset()
        .pipe(delete_removed_comments)
        .pipe(delete_empty_posts, ['submission_id', 'body'])
        .pipe(remove_unwanted_columns, ['Column1'])
        .pipe(filter_comments, ids)
        .reset_index(drop=True)
    )

In [15]:
# Entry point: build the cleaned post dataset and write it to "cleaned_df.csv"
# in the current working directory.
if __name__ == '__main__':
    print("Data Preprocessing")
    post_df = preprocess_posts()
    # Comment preprocessing is currently disabled. When enabled, it keeps only the
    # comments whose submission_id matches the id of one of the cleaned posts.
#     ids = post_df.id.unique()
#     comment_df = preprocess_comments(ids)
    # NOTE(review): to_csv is called without index=False, so the row index is written
    # as an extra unnamed first column - confirm that downstream readers expect it.
    post_df.to_csv("cleaned_df.csv", encoding='utf-8-sig')
#     comment_df.to_csv("cleaned_comments.csv", encoding='utf-8-sig')

Data Preprocessing
