In [1]:
import os
import numpy as np
import pandas as pd

**This code calculates quantiles, mean and sd while resampling to a daily sampling. It should be applied once we have created the ordered, county-level tweet files.**

A check a la std = 0 could be used to drop all columns that have zero variation over time. This is the case for several quantiles of the different sentiment scores.

In [3]:
in_dir = "/Users/felix/Downloads/fips_sorted"
out_dir = "/Users/felix/Downloads/processed_daily"

in_flist = os.listdir(in_dir)

# here, on the very top we would have a loop over the raw files:
for fname in in_flist:
    print("\n--------------- Resampling file {} ---------------".format(fname))

    # read file, auto-convert dtypes and set date time index
    df = pd.read_pickle(os.path.join(in_dir, fname))

    # copy
    df_out = df.copy()

    # iterate over the sentiment score columns
    dfs_sentiments = []
    sentiment_cols = ['polarity', 'subjectivity', 'positive', 'negative', 'neutral']

    for col in sentiment_cols:
        # quantiles
        df_q = df_out[col].resample("D").quantile(q=[0., 0.025, 0.25, 0.5, 0.75, 0.095, 1.])
        df_q.index = df_q.index.set_names(["date", "quantile"])
        df_q = df_q.unstack()
        new_col = ["{}_{}".format(col, q) for q in df_q.columns]
        df_q = df_q.rename(columns=dict(zip(df_q.columns, new_col)))

        # mean and sd
        df_q["{}_mean".format(col)] = df_out[col].resample("D").mean()
        df_q["{}_sd".format(col)] = df_out[col].resample("D").std()

        # sum of retweets
        dfs_sentiments.append(df_q)

    # concatenate
    df_merged_daily = pd.concat(dfs_sentiments, axis=1)

    # add the daily sum of retweets
    df_merged_daily["retweets_total".format(col)] = df_out["Retweets"].resample("D").sum()
    
    # store daily resampled data to pickle file
    df_merged_daily.to_pickle(os.path.join(out_dir, fname))


--------------- Resampling file 06019.pkl ---------------

--------------- Resampling file 05077.pkl ---------------

--------------- Resampling file 05117.pkl ---------------

--------------- Resampling file 05107.pkl ---------------

--------------- Resampling file 01003.pkl ---------------

--------------- Resampling file 01001.pkl ---------------

--------------- Resampling file 12033.pkl ---------------

--------------- Resampling file 05001.pkl ---------------

--------------- Resampling file 06041.pkl ---------------

--------------- Resampling file 06055.pkl ---------------

--------------- Resampling file 06095.pkl ---------------

--------------- Resampling file 06081.pkl ---------------

--------------- Resampling file 06075.pkl ---------------

--------------- Resampling file 01097.pkl ---------------

--------------- Resampling file 05147.pkl ---------------

--------------- Resampling file 06077.pkl ---------------

--------------- Resampling file 05095.pkl -------------