## Coding Finalization

In [None]:
import os
import pandas as pd
import sqlite3

## Load Data

In [None]:
# Final coded sample (Excel workbook) stored under the doccano coding directory.
data_dir = os.path.join("..", "data", "coding", "doccano", "final")

file = "final_coded_sample.xlsx"

df = pd.read_excel(os.path.join(data_dir, file))

# fetch all SQL records for computing aggregate stats:
sql_path = os.path.join("..", "data", "sqlite", "idw_reddit.db")

# Read each table into its own frame and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all
# previously loaded rows on every iteration.
tbl_frames = []

conn = sqlite3.connect(sql_path)
try:
    for table in ["comments", "posts"]:
        tbl_data = pd.read_sql(
            f"SELECT id, full_id, unique_id, author, date FROM {table}",
            con=conn
        )

        # remember which table each record came from
        tbl_data["table"] = table

        tbl_frames.append(tbl_data)
finally:
    # always release the database handle, even if a query fails
    conn.close()

# Same row order as before: all comments first, then all posts
# (each table keeps its own 0-based index).
sql_df = pd.concat(tbl_frames)
del tbl_frames, tbl_data

# Parse the date column and derive a "YYYY-MM" month bucket from it.
sql_df["date"] = pd.to_datetime(sql_df["date"])
sql_df["month_year"] = sql_df["date"].dt.strftime("%Y-%m")

In [None]:
# Preview the first rows of the coded sample read from the Excel file.
df.head()

In [None]:
# Preview the first rows of the combined comments/posts records read from SQLite.
sql_df.head()

**Add usernames to posts and comments:**

In [None]:
# Build a full_id -> author lookup from the SQL records. If a full_id occurs
# more than once, the last occurrence wins.
author_mapper = {
    full_id: author
    for full_id, author in zip(sql_df["full_id"], sql_df["author"])
}

# Attach the username to every coded row; ids missing from the lookup become NaN.
df["author"] = df["full_id"].map(author_mapper)

In [None]:
# Preview the coded sample again to inspect the mapped "author" column.
df.head()

## Recodes
- Merge `Anti-vaxx & Vaccine Hesitancy` into the main vaccine category (`Vaccine Safety & Efficacy`).
  - It makes more sense to discuss overall vaccine concerns under one label for this data.
- Merge `Masking Efficacy & Necessity` into `Government & Public Health Policies`.
- Remove `Conspiracy Theorizing`.
  - This topic is redundant and does not add sufficient information.
  - Important points are adequately represented by other categories and covered by the Anti-Contrarianism and Contrarianism indicators.
- Merge `Big Pharma` into `Trust in Institutions & Experts`.
  - Somewhat redundant; the core themes are about trust in pharmaceuticals.

In [None]:
# Fold each retired code into the category that absorbs it: every row flagged
# (== 1) with the source code is also flagged with the target category.
recode_pairs = [
    ("Anti-vaxx & Vaccine Hesitancy", "Vaccine Safety & Efficacy"),
    ("Masking Efficacy & Necessity", "Government & Public Health Policies"),
    ("Big Pharma", "Trust in Institutions & Experts"),
]

for source_col, target_col in recode_pairs:
    is_flagged = df[source_col] == 1
    df.loc[is_flagged, target_col] = 1

In [None]:
# Sanity check for the recodes: no row may still carry a merged code (== 1)
# while its absorbing category is 0.
recode_checks = [
    (
        "Anti-vaxx & Vaccine Hesitancy",
        "Vaccine Safety & Efficacy",
        "Error: length mismatch for Anti-vaxx recode.",
    ),
    (
        "Masking Efficacy & Necessity",
        "Government & Public Health Policies",
        "Error: length mismatch for Masking Efficacy recode.",
    ),
    (
        "Big Pharma",
        "Trust in Institutions & Experts",
        "Error: length mismatch for Big Pharma recode.",
    ),
]

for merged_col, parent_col, failure_msg in recode_checks:
    not_propagated = (df[merged_col] == 1) & (df[parent_col] == 0)
    assert not not_propagated.any(), failure_msg

In [None]:
# Columns to remove: the three codes merged into other categories above,
# plus "Conspiracy Theorizing".
drop_columns = [
    "Anti-vaxx & Vaccine Hesitancy",
    "Masking Efficacy & Necessity",
    "Conspiracy Theorizing",
    "Big Pharma",
]

# Old column name -> final column name.
category_mapper = {
    "Vaccine Safety & Efficacy": "Vaccine Safety, Efficacy & Hesitancy",
    "Government & Public Health Policies": "Public Health Policies",
    "Politicization and \"Issue Creep\"": "Politicization & Issue Creep",
    "IDW & Collective Identity": "The IDW",
    "new_topic": "topic",
    "Comments": "comments",
}

# Drop first, then rename, as one chained expression.
df = df.drop(columns=drop_columns).rename(columns=category_mapper)

# A row must never be flagged as both Contrarianism and Anti-Contrarianism.
flagged_both = (df["Contrarianism"] == 1) & (df["Anti-Contrarianism"] == 1)
assert not flagged_both.any(), "Error!"

## Save Data

### Columns

In [None]:
# Identifier / metadata columns that lead the exported table.
cols_to_keep = [
    "doccano_id",
    "full_id",
    "unique_id",
    "comments",
    "url",
    "author",
    "text",
    "sample_source",
    "score",
    "topic",
    "month_year",
]

# The two stance indicators go right after the metadata columns.
contrarian_cols = ["Anti-Contrarianism", "Contrarianism"]

# Every remaining column from position 13 onward is treated as a category
# column, except the stance indicators and "author".
# NOTE(review): the offset 13 depends on the column order of the input
# workbook - confirm it still matches if the source file changes.
category_cols = []
for col in df.columns[13:]:
    if col in contrarian_cols or col == "author":
        continue
    category_cols.append(col)

# Final layout: metadata, stance indicators, then category columns.
ordered_cols = cols_to_keep + contrarian_cols + category_cols
df = df[ordered_cols]

In [None]:
# Preview the frame after column selection and reordering.
df.head()

### Export

In [None]:
# Destination for the analysis-ready sample.
out_dir = os.path.join("..", "data", "coding", "analysis_sample")

out_file = "idw_reddit_posts.csv"

# Create the output directory if needed: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)

# Write without the index so the CSV contains only the selected columns.
df.to_csv(os.path.join(out_dir, out_file), index=False)