In [1]:
import pandas as pd
import ast
import urllib.request, json 
import numpy as np

Here, we go through all the posts and subset them to those that meet the following qualifiers:

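- Using epoch timestamps, we kept posts from 2020: `created_utc` between 1577854800 and 1609477199, inclusive.
- The combined length of the title and body is greater than 100 characters.
- The post is a self (text) post rather than a link to an article, i.e. its domain is `self.Republican`.
- The post's score is at least 0.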

In [2]:
# Start from an empty frame; the next cell populates `reps` with the qualifying posts.
reps = pd.DataFrame()

In [3]:
# Read the raw posts file line by line and keep only the posts that qualify.
#
# A post is kept when it has a "selftext" field and:
#   * title + selftext is longer than 100 characters,
#   * its domain is "self.Republican",
#   * created_utc lies in [1577854800, 1609477199], i.e. 2020-01-01 05:00:00 UTC
#     through 2021-01-01 04:59:59 UTC (calendar year 2020 at a UTC-5 offset),
#   * its score is non-negative.
#
# Qualifying rows are collected in a plain list and turned into a DataFrame once
# at the end. Growing a DataFrame row by row with `DataFrame.append` copies the
# whole frame on every call and the method no longer exists in pandas 2.x.
# Building `reps` from scratch also makes this cell safe to re-run.
rows = 0
data_rows = 0
total_rows = 116142  # only used for the progress percentage; presumably the file's line count
post_records = []
with open("../02_text_files/Republican_posts.txt") as file:
    for line in file:
        rows += 1
        # Each line is parsed as one Python literal holding a single post.
        data = ast.literal_eval(line)
        if (
            "selftext" in data
            and len(data["title"] + data["selftext"]) > 100
            and data["domain"] == "self.Republican"
            and 1577854800 <= data["created_utc"] <= 1609477199
            and data["score"] >= 0
        ):
            data_rows += 1
            post_records.append(
                {
                    "id": data["id"],
                    "title": data["title"],
                    "text": data["selftext"],
                    "subreddit": data["subreddit"],
                    "score": data["score"],
                }
            )
        if rows % 500 == 0:
            print(
                f"Reading is {round(rows/total_rows*100,2)}% complete. There have been {rows} rows read, and there are {data_rows} rows included in the data"
            )

# Naming the columns keeps the schema intact even if no post qualified.
reps = pd.DataFrame(
    post_records, columns=["id", "title", "text", "subreddit", "score"]
)


Reading is 0.43% complete. There have been 500 rows read, and there are 0 rows included in the data
Reading is 0.86% complete. There have been 1000 rows read, and there are 0 rows included in the data
Reading is 1.29% complete. There have been 1500 rows read, and there are 0 rows included in the data
Reading is 1.72% complete. There have been 2000 rows read, and there are 0 rows included in the data
Reading is 2.15% complete. There have been 2500 rows read, and there are 0 rows included in the data
Reading is 2.58% complete. There have been 3000 rows read, and there are 0 rows included in the data
Reading is 3.01% complete. There have been 3500 rows read, and there are 0 rows included in the data
Reading is 3.44% complete. There have been 4000 rows read, and there are 0 rows included in the data
Reading is 3.87% complete. There have been 4500 rows read, and there are 0 rows included in the data
Reading is 4.31% complete. There have been 5000 rows read, and there are 0 rows included in 

After this process, there were 882 rows. Next, we combined the title and body text into one column.

Next, we removed posts containing links, tweets, etc. This resulted in 809 total rows.

In [4]:
# Build the text used downstream in `total_post`:
#   * body is exactly "[removed]"  -> title only
#   * anything else                -> title + " " + body
#
# A single boolean mask drives both branches, so every row gets a value.
# Previously the first branch used exact equality while the second used a regex
# substring match, which left `total_post` empty for any body that merely
# contained "[removed]". The regex was also written in a non-raw string with
# invalid escape sequences.
is_removed = reps["text"] == "[removed]"
title_and_body = reps["title"] + " " + reps["text"]
# Series.where keeps the title where the mask is True and substitutes the
# combined text everywhere else.
reps["total_post"] = reps["title"].where(is_removed, title_and_body)


In [5]:
# Drop every post whose combined text mentions a link or an @-handle,
# applying one marker at a time.
for marker in ("http", "@", "www"):
    reps = reps[~reps["total_post"].str.contains(marker)]


In [6]:
# (rows, columns) of the post table after dropping posts that contain links or "@".
reps.shape

(809, 6)

Now let's read in comments as well, with similar qualifiers.

In [7]:
# Start from an empty frame; the next cell populates `comments` with the qualifying comments.
comments = pd.DataFrame()

In [8]:
# Read the raw comments file line by line and keep only the comments that qualify.
#
# A comment is kept when:
#   * its body is longer than 100 characters,
#   * created_utc lies in [1577854800, 1609477199], i.e. 2020-01-01 05:00:00 UTC
#     through 2021-01-01 04:59:59 UTC (calendar year 2020 at a UTC-5 offset),
#   * its score is non-negative.
#
# Qualifying rows are collected in a plain list and turned into a DataFrame once
# at the end. Growing a DataFrame row by row with `DataFrame.append` copies the
# whole frame on every call (very slow at this volume) and the method no longer
# exists in pandas 2.x. Building `comments` from scratch also makes this cell
# safe to re-run.
rows = 0
data_rows = 0
total_rows = 881031  # only used for the progress percentage; presumably the file's line count
comment_records = []
with open("../02_text_files/Republican_comments.txt") as file:
    for line in file:
        rows += 1
        # Each line is parsed as one Python literal holding a single comment.
        data = ast.literal_eval(line)
        if (
            len(data["body"]) > 100
            and 1577854800 <= data["created_utc"] <= 1609477199
            and data["score"] >= 0
        ):
            data_rows += 1
            comment_records.append(
                {
                    "id": data["id"],
                    "text": data["body"],
                    "subreddit": data["subreddit"],
                    "score": data["score"],
                }
            )
        if rows % 1000 == 0:
            print(
                f"Reading is {round(rows/total_rows*100,2)}% complete. There have been {rows} rows read, and there are {data_rows} rows included in the data"
            )

# Naming the columns keeps the schema intact even if no comment qualified.
comments = pd.DataFrame(
    comment_records, columns=["id", "text", "subreddit", "score"]
)


Reading is 0.11% complete. There have been 1000 rows read, and there are 0 rows included in the data
Reading is 0.23% complete. There have been 2000 rows read, and there are 0 rows included in the data
Reading is 0.34% complete. There have been 3000 rows read, and there are 0 rows included in the data
Reading is 0.45% complete. There have been 4000 rows read, and there are 0 rows included in the data
Reading is 0.57% complete. There have been 5000 rows read, and there are 0 rows included in the data
Reading is 0.68% complete. There have been 6000 rows read, and there are 0 rows included in the data
Reading is 0.79% complete. There have been 7000 rows read, and there are 0 rows included in the data
Reading is 0.91% complete. There have been 8000 rows read, and there are 0 rows included in the data
Reading is 1.02% complete. There have been 9000 rows read, and there are 0 rows included in the data
Reading is 1.14% complete. There have been 10000 rows read, and there are 0 rows included i

In [9]:
# (rows, columns) of the comments that passed the length, date and score filters.
comments.shape

(114300, 4)

In [10]:
# Flag comments whose text mentions a link or an @-handle, then keep the rest.
comment_text = comments["text"]
has_link_or_handle = (
    comment_text.str.contains("http")
    | comment_text.str.contains("www")
    | comment_text.str.contains("@")
)
comments = comments[~has_link_or_handle]


In [11]:
# (rows, columns) remaining once comments containing links or "@" are dropped.
comments.shape

(94609, 4)

After again removing links, we have approximately 94,000 rows. For our purposes, we do not need this many, so we'll randomly select 4,000 and add those to our dataset.

In [12]:
# Give the comment text the same column name the posts use (`total_post`) so the
# two tables line up when concatenated. Rebinding to the returned frame avoids
# mutating, in place, a frame that was produced by boolean filtering.
comments = comments.rename(columns={"text": "total_post"})

In [13]:
# Record where each row came from. Both frames are results of boolean filtering,
# so `.assign` (which returns a new frame) is used instead of column assignment
# to avoid pandas' SettingWithCopyWarning.
comments = comments.assign(type="comment")
reps = reps.assign(type="post")

In [14]:
# Seed NumPy's global RNG so the random sampling below is reproducible when the
# notebook is run top to bottom.
np.random.seed(3320)

In [15]:
# Draw 4,000 comments at random. The seed is passed explicitly (the same value
# this notebook gives to np.random.seed) so that re-running this cell on its own
# returns the same rows instead of depending on the global RNG state.
comments_sample = comments.sample(n=4000, random_state=3320)

In [16]:
# Stack the sampled comments on top of the posts (row-wise); columns present in
# only one of the two frames are filled with missing values in the other.
final = pd.concat((comments_sample, reps), axis=0)

In [17]:
# (rows, columns) of the combined comment + post table.
final.shape

(4809, 7)

In [18]:
# Persist the combined table as parquet via the fastparquet engine.
# NOTE(review): the target path has no ".parquet" extension — confirm downstream readers expect this name.
final.to_parquet("../10_datasets/republican", engine = "fastparquet")