In [1]:
import pandas as pd
import ast
import urllib.request, json 
import numpy as np

Here, we go through all the posts and subset them using the following qualifiers:

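- Using epoch timestamps, we keep posts from 2020: `created_utc` between 1577854800 and 1609477199, inclusive.
- The combined length of the title and body is greater than 100 characters.
- The post is a self post (its domain is `self.NeutralPolitics`), i.e. it is not based on a linked article.
- The post's score is at least 0.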

In [2]:
# Start from an empty frame; the reading cell below populates `neut` with the filtered posts.
neut = pd.DataFrame()

In [3]:
# Stream the posts dump (one record per line) and keep only the records that
# pass every filter: has a body, title + body longer than 100 characters,
# self post, created within the 2020 epoch range, and a non-negative score.
rows = 0  # lines read so far
data_rows = 0  # lines that passed the filters
total_rows = 16985  # hard-coded denominator for the progress percentage (presumably the file's line count)
post_columns = ["id", "title", "text", "subreddit", "score"]
post_records = []
with open("../02_text_files/NeutralPolitics_posts.txt") as file:
    for line in file:
        rows += 1
        # Each line is parsed as a Python literal, yielding a dict-like record.
        data = ast.literal_eval(line)
        # `and` short-circuits, so the remaining keys are only read when a body exists.
        if (
            "selftext" in data
            and len(data["title"] + data["selftext"]) > 100
            and data["domain"] == "self.NeutralPolitics"
            and 1577854800 <= data["created_utc"] <= 1609477199
            and data["score"] >= 0
        ):
            data_rows += 1
            post_records.append(
                {
                    "id": data["id"],
                    "title": data["title"],
                    "text": data["selftext"],
                    "subreddit": data["subreddit"],
                    "score": data["score"],
                }
            )
        if rows % 500 == 0:
            print(
                f"Reading is {round(rows/total_rows*100,2)}% complete. There have been {rows} rows read, and there are {data_rows} rows included in the data"
            )

# Build the frame once from the collected records: `DataFrame.append` was
# removed in pandas 2.0 and re-copied the whole frame on every call.
# Passing `columns` keeps the expected schema even when no record matched.
neut = pd.DataFrame(post_records, columns=post_columns)


Reading is 2.94% complete. There have been 500 rows read, and there are 0 rows included in the data
Reading is 5.89% complete. There have been 1000 rows read, and there are 0 rows included in the data
Reading is 8.83% complete. There have been 1500 rows read, and there are 190 rows included in the data
Reading is 11.78% complete. There have been 2000 rows read, and there are 390 rows included in the data
Reading is 14.72% complete. There have been 2500 rows read, and there are 616 rows included in the data
Reading is 17.66% complete. There have been 3000 rows read, and there are 802 rows included in the data
Reading is 20.61% complete. There have been 3500 rows read, and there are 802 rows included in the data
Reading is 23.55% complete. There have been 4000 rows read, and there are 802 rows included in the data
Reading is 26.49% complete. There have been 4500 rows read, and there are 802 rows included in the data
Reading is 29.44% complete. There have been 5000 rows read, and there ar

After this process, there were 802 rows. Next, we combined the title and body text into one column.

Next, we removed posts containing links, tweets, etc. This resulted in 760 total rows.

In [4]:
# Combine title and body into a single `total_post` column.
# When the body is exactly the placeholder "[removed]" only the title is kept;
# every other row gets "<title> <body>". A single mask is used for both cases
# so that a body which merely mentions "[removed]" is no longer left with an
# empty `total_post` (the previous equality / substring masks did not cover it).
body_is_removed = neut["text"] == "[removed]"
title_and_body = neut["title"] + " " + neut["text"]
# `where` keeps `title_and_body` where the condition holds and falls back to the title elsewhere.
neut["total_post"] = title_and_body.where(~body_is_removed, neut["title"])


In [5]:
# Drop every post whose combined text contains one of these markers,
# applying the filters one after another in the listed order.
for marker in ("http", "@", "www"):
    neut = neut[~neut["total_post"].str.contains(marker)]


In [6]:
# Show (rows, columns) of the posts frame after the filtering above.
neut.shape

(760, 6)

Now let's read in comments as well, with similar qualifiers.

In [7]:
# Start from an empty frame; the reading cell below populates `comments` with the filtered comments.
comments = pd.DataFrame()

In [8]:
# Stream the comments dump (one record per line) and keep only the records
# that pass every filter: body longer than 100 characters, created within the
# 2020 epoch range, and a strictly positive score.
rows = 0  # lines read so far
data_rows = 0  # lines that passed the filters
total_rows = 496810  # hard-coded denominator for the progress percentage (presumably the file's line count)
comment_columns = ["id", "text", "subreddit", "score"]
comment_records = []
with open("../02_text_files/NeutralPolitics_comments.txt") as file:
    for line in file:
        rows += 1
        # Each line is parsed as a Python literal, yielding a dict-like record.
        data = ast.literal_eval(line)
        if (
            len(data["body"]) > 100
            and 1577854800 <= data["created_utc"] <= 1609477199
            and data["score"] > 0
        ):
            data_rows += 1
            comment_records.append(
                {
                    "id": data["id"],
                    "text": data["body"],
                    "subreddit": data["subreddit"],
                    "score": data["score"],
                }
            )
        if rows % 1000 == 0:
            print(
                f"Reading is {round(rows/total_rows*100,2)}% complete. There have been {rows} rows read, and there are {data_rows} rows included in the data"
            )

# Build the frame once from the collected records: `DataFrame.append` was
# removed in pandas 2.0 and re-copied the whole frame on every call, which is
# quadratic for tens of thousands of kept rows.
# Passing `columns` keeps the expected schema even when no record matched.
comments = pd.DataFrame(comment_records, columns=comment_columns)


Reading is 0.2% complete. There have been 1000 rows read, and there are 0 rows included in the data
Reading is 0.4% complete. There have been 2000 rows read, and there are 0 rows included in the data
Reading is 0.6% complete. There have been 3000 rows read, and there are 0 rows included in the data
Reading is 0.81% complete. There have been 4000 rows read, and there are 0 rows included in the data
Reading is 1.01% complete. There have been 5000 rows read, and there are 0 rows included in the data
Reading is 1.21% complete. There have been 6000 rows read, and there are 0 rows included in the data
Reading is 1.41% complete. There have been 7000 rows read, and there are 0 rows included in the data
Reading is 1.61% complete. There have been 8000 rows read, and there are 0 rows included in the data
Reading is 1.81% complete. There have been 9000 rows read, and there are 0 rows included in the data
Reading is 2.01% complete. There have been 10000 rows read, and there are 0 rows included in t

In [9]:
# Show (rows, columns) of the comments frame right after reading.
comments.shape

(29545, 4)

In [10]:
# Drop every comment whose text contains one of these markers,
# applying the filters one after another in the listed order.
for marker in ("http", "www", "@"):
    comments = comments[~comments["text"].str.contains(marker)]


In [11]:
# Show (rows, columns) of the comments frame after the marker filtering above.
comments.shape

(15915, 4)

After again removing links, we have 15,915 rows. For our purposes, we do not need this many, so we'll randomly select 4,000 and add those to our dataset.

In [12]:
# Rename the comment body column to "total_post" so it lines up with the
# posts frame's column of the same name when the two are concatenated.
# Rebinding (instead of `inplace=True`) avoids mutating a frame that was
# produced by boolean indexing and keeps the cell safe to re-run.
comments = comments.rename(columns={"text": "total_post"})

In [13]:
# Tag each row with its origin before the two frames are combined.
# `assign` returns a new frame, so no column is set on a filtered slice
# (which can trigger SettingWithCopyWarning) and re-running the cell is harmless.
comments = comments.assign(type="comment")
neut = neut.assign(type="post")

In [14]:
# Fix NumPy's global random seed before drawing the random comment sample below.
np.random.seed(3320)

In [15]:
# Draw 4,000 comments at random. The seed is passed explicitly (same value as
# the notebook's `np.random.seed` call) so the result does not depend on how
# much of the global random state earlier cells consumed, and re-running this
# cell on its own returns the same rows.
# NOTE(review): expected to select the same rows as seeding the global state
# immediately beforehand — confirm against the stored dataset if that matters.
comments_sample = comments.sample(4000, random_state=3320)

In [16]:
# Stack the sampled comments on top of the posts (row-wise, original index labels kept).
final_neut = pd.concat(objs=[comments_sample, neut], axis=0)

In [17]:
# Show (rows, columns) of the combined frame.
final_neut.shape

(4760, 7)

In [18]:
# Persist the combined frame as a Parquet file using the fastparquet engine.
final_neut.to_parquet("../10_datasets/neutral.parquet", engine = "fastparquet")