In [1]:
import pandas as pd
import ast
import urllib.request, json 
import numpy as np

Here, we go through all the posts and subset them using the following qualifiers:

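- Using epoch timestamps, we filtered for posts in 2020: `created_utc` between 1577854800 and 1609477199, inclusive.
- The combined length of the title and body is greater than 100 characters.
- The post is not based on an article (its domain is `self.democrats`).
- The post's score is at least 0.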

In [2]:
# Empty frame that will hold the filtered posts; the file-reading cell below fills it.
dems = pd.DataFrame()

In [3]:
# Scan the raw posts dump line by line and keep only the posts that pass the
# notebook's filters (self post, inside the time window, long enough, score >= 0).
rows = 0
data_rows = 0
# Assumed line count of the posts file; only used for the progress percentage.
total_rows = 131484

# Inclusive epoch-second bounds of the 2020 window described in the prose above.
WINDOW_START_UTC = 1577854800
WINDOW_END_UTC = 1609477199

# Collect matching posts as plain dicts and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame on
# every call, which made the original loop quadratic in the number of kept rows.
post_records = []

with open("../02_text_files/Democrats_posts.txt") as file:
    for line in file:
        rows += 1
        # Each line is parsed as a Python literal (assumed to be a dict of post fields).
        data = ast.literal_eval(line)
        # `and` short-circuits, so the remaining keys are only read for posts
        # that actually carry a "selftext" field.
        if (
            "selftext" in data
            and len(data["title"] + data["selftext"]) > 100
            and data["domain"] == "self.democrats"
            and WINDOW_START_UTC <= data["created_utc"] <= WINDOW_END_UTC
            and data["score"] >= 0
        ):
            data_rows += 1
            post_records.append(
                {
                    "id": data["id"],
                    "title": data["title"],
                    "text": data["selftext"],
                    "subreddit": data["subreddit"],
                    "score": data["score"],
                }
            )
        if rows % 500 == 0:
            print(
                f"Reading is {round(rows/total_rows*100,2)}% complete. There have been {rows} rows read, and there are {data_rows} rows included in the data"
            )

# Explicit columns keep the expected schema even when no post matched.
dems = pd.DataFrame(
    post_records, columns=["id", "title", "text", "subreddit", "score"]
)


Reading is 0.38% complete. There have been 500 rows read, and there are 0 rows included in the data
Reading is 0.76% complete. There have been 1000 rows read, and there are 0 rows included in the data
Reading is 1.14% complete. There have been 1500 rows read, and there are 0 rows included in the data
Reading is 1.52% complete. There have been 2000 rows read, and there are 0 rows included in the data
Reading is 1.9% complete. There have been 2500 rows read, and there are 0 rows included in the data
Reading is 2.28% complete. There have been 3000 rows read, and there are 0 rows included in the data
Reading is 2.66% complete. There have been 3500 rows read, and there are 0 rows included in the data
Reading is 3.04% complete. There have been 4000 rows read, and there are 0 rows included in the data
Reading is 3.42% complete. There have been 4500 rows read, and there are 0 rows included in the data
Reading is 3.8% complete. There have been 5000 rows read, and there are 0 rows included in th

After this process, there were 1744 rows. Next, we combined the title and body text into one column.

Next, we removed posts containing links, tweets, etc. This resulted in 1637 total rows.

In [4]:
# Build the combined "total_post" column: title only when the body was removed,
# otherwise title + " " + body.
text_is_removed = dems["text"] == "[removed]"
# Raw string: "\[" is an invalid escape sequence in a normal string literal and
# triggers a Deprecation/SyntaxWarning on recent Python versions.
text_mentions_removed = dems["text"].str.contains(r"\[removed\]")

dems["total_post"] = ""
dems.loc[text_is_removed, "total_post"] = dems.loc[text_is_removed, "title"]
dems.loc[~text_mentions_removed, "total_post"] = (
    dems.loc[~text_mentions_removed, "title"]
    + " "
    + dems.loc[~text_mentions_removed, "text"]
)
# NOTE(review): a body that contains "[removed]" without being exactly equal to
# it matches neither mask and keeps the empty-string default. Confirm whether
# that is intended.


In [5]:
# Drop posts whose combined text contains a link fragment or an "@" character,
# applying one substring filter at a time in the original order.
for fragment in ("http", "@", "www"):
    keep_rows = ~dems["total_post"].str.contains(fragment)
    dems = dems[keep_rows]


In [6]:
# (rows, columns) of the posts frame after the link/"@" filters above.
dems.shape

(1637, 6)

In [7]:
# Spot-check one random row. Column 1 is selected by position, presumably the
# title column; confirm against the frame's column order.
# NOTE(review): np.random.seed is only called further down, so this draw is not
# reproducible across runs.
dems.sample().iloc[0,1]

'Republicans when they realize their argument is no longer making sense: ...The Democrats are actually the racist ones'

Now let's read in comments as well, with similar qualifiers.

In [8]:
# Empty frame that will hold the filtered comments; the file-reading cell below fills it.
comments = pd.DataFrame()

In [9]:
# Scan the raw comments dump line by line and keep only the comments that pass
# the filters (inside the time window, longer than 100 characters, score > 1).
rows = 0
data_rows = 0
# Assumed line count of the comments file; only used for the progress percentage.
total_rows = 728491

# Inclusive epoch-second bounds of the 2020 window used for the posts as well.
WINDOW_START_UTC = 1577854800
WINDOW_END_UTC = 1609477199

# Collect matching comments as plain dicts and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame on
# every call, which made the original loop quadratic in the number of kept rows.
comment_records = []

with open("../02_text_files/Democrats_comments.txt") as file:
    for line in file:
        rows += 1
        # Each line is parsed as a Python literal (assumed to be a dict of comment fields).
        data = ast.literal_eval(line)
        if (
            len(data["body"]) > 100
            and WINDOW_START_UTC <= data["created_utc"] <= WINDOW_END_UTC
            and data["score"] > 1
        ):
            data_rows += 1
            comment_records.append(
                {
                    "id": data["id"],
                    "text": data["body"],
                    "subreddit": data["subreddit"],
                    "score": data["score"],
                }
            )
        if rows % 1000 == 0:
            print(
                f"Reading is {round(rows/total_rows*100,2)}% complete. There have been {rows} rows read, and there are {data_rows} rows included in the data"
            )

# Explicit columns keep the expected schema even when no comment matched.
comments = pd.DataFrame(
    comment_records, columns=["id", "text", "subreddit", "score"]
)


Reading is 0.14% complete. There have been 1000 rows read, and there are 0 rows included in the data
Reading is 0.27% complete. There have been 2000 rows read, and there are 0 rows included in the data
Reading is 0.41% complete. There have been 3000 rows read, and there are 0 rows included in the data
Reading is 0.55% complete. There have been 4000 rows read, and there are 0 rows included in the data
Reading is 0.69% complete. There have been 5000 rows read, and there are 0 rows included in the data
Reading is 0.82% complete. There have been 6000 rows read, and there are 0 rows included in the data
Reading is 0.96% complete. There have been 7000 rows read, and there are 0 rows included in the data
Reading is 1.1% complete. There have been 8000 rows read, and there are 0 rows included in the data
Reading is 1.24% complete. There have been 9000 rows read, and there are 0 rows included in the data
Reading is 1.37% complete. There have been 10000 rows read, and there are 0 rows included in

In [10]:
# (rows, columns) of the filtered comments before link removal.
comments.shape

(6181, 4)

In [11]:
# Drop comments whose text contains a link fragment or an "@" character,
# applying one substring filter at a time in the original order.
for fragment in ("http", "www", "@"):
    keep_rows = ~comments["text"].str.contains(fragment)
    comments = comments[keep_rows]


In [12]:
# (rows, columns) of the comments frame after the link/"@" filters above.
comments.shape

(5812, 4)

After again removing links, we have 5,812 rows (see the shape above). For our purposes, we do not need this many, so we'll randomly select 3,300 and add those to our dataset.

In [13]:
# Give the comment text the same column name the posts frame uses for its
# combined text, so the two frames line up when concatenated.
comments = comments.rename(columns={"text": "total_post"})

In [14]:
# Tag each row with its origin. Both frames come out of boolean-mask filtering,
# so assigning a column in place can raise SettingWithCopyWarning; `.assign`
# returns a fresh frame and avoids the ambiguity.
comments = comments.assign(type="comment")
dems = dems.assign(type="post")

In [15]:
# Seed NumPy's global RNG so the random sampling below is repeatable when the
# cells are run in order.
np.random.seed(3320)

In [16]:
# Draw the 3,300-comment subsample with an explicit seed, so that re-running
# this cell on its own yields the same rows instead of depending on whatever
# state the global NumPy RNG happens to be in.
comments_sample = comments.sample(3300, random_state=3320)

In [17]:
# Stack the sampled comments on top of the posts. ignore_index is not passed,
# so each frame's original index labels are kept as-is.
final_democrats = pd.concat([comments_sample,dems])

In [18]:
# (rows, columns) of the combined posts + sampled comments dataset.
final_democrats.shape

(4937, 7)

In [19]:
# Persist the combined dataset as Parquet using the fastparquet engine.
final_democrats.to_parquet("../10_datasets/democrats.parquet", engine = "fastparquet")