In [1]:
import os
import numpy as np
import pandas as pd

**This code resamples the tweet-level sentiment scores to a daily frequency, computing the quantiles, mean and standard deviation of each score per day. It should be applied once the ordered, county-level tweet files have been created.**

A check such as `std == 0` could be used to drop all columns that show no variation over time. This is the case for several quantiles of the different sentiment scores.

In [2]:
in_dir = "/Volumes/Extreme/100/Counties_Felix"
out_dir = "/Volumes/Extreme/100/Counties_Resampled_Felix"

# Sentiment score columns that receive daily summary statistics.
SENTIMENT_COLS = ('polarity', 'subjectivity', 'positive', 'negative', 'neutral')

# Daily quantiles: min, 2.5%, quartiles, 97.5%, max.
# NOTE(review): the upper tail used to be 0.095, which is neither ordered nor
# symmetric with 0.025 and looks like a typo for 0.975. The output column is
# therefore now "<col>_0.975" instead of "<col>_0.095". Files that were already
# resampled are skipped below, so delete them from out_dir to regenerate.
QUANTILES = (0., 0.025, 0.25, 0.5, 0.75, 0.975, 1.)


def resample_sentiments_daily(df, sentiment_cols=SENTIMENT_COLS, quantiles=QUANTILES):
    """Aggregate tweet-level sentiment scores to one row per calendar day.

    For every column in ``sentiment_cols`` the daily quantiles, mean and
    standard deviation are computed; the daily sum of ``Retweets`` is added
    as ``retweets_total``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns in ``sentiment_cols`` plus ``"Retweets"``.
        ``resample("D")`` requires a datetime-like index.
    sentiment_cols : iterable of str
        Names of the score columns to summarise.
    quantiles : iterable of float
        Quantile levels in [0, 1] computed per day.

    Returns
    -------
    pandas.DataFrame
        Indexed by day, with columns ``<col>_<q>``, ``<col>_mean`` and
        ``<col>_sd`` for every score column, plus ``retweets_total``.
    """
    daily_parts = []

    for col in sentiment_cols:
        daily = df[col].resample("D")

        # quantiles come back as a (date, quantile) MultiIndex series;
        # unstack turns the quantile level into columns
        df_q = daily.quantile(q=list(quantiles))
        df_q.index = df_q.index.set_names(["date", "quantile"])
        df_q = df_q.unstack()
        df_q = df_q.rename(columns=lambda q, col=col: "{}_{}".format(col, q))

        # mean and sd
        df_q["{}_mean".format(col)] = daily.mean()
        df_q["{}_sd".format(col)] = daily.std()

        daily_parts.append(df_q)

    # concatenate the per-score frames side by side
    df_daily = pd.concat(daily_parts, axis=1)

    # add the daily sum of retweets
    df_daily["retweets_total"] = df["Retweets"].resample("D").sum()

    return df_daily


# Inside a notebook kernel __name__ is "__main__", so the loop runs as before
# and its variables stay available to later cells. The guard only keeps the
# helper above importable without touching the data volume.
if __name__ == "__main__":
    os.makedirs(out_dir, exist_ok=True)
    in_flist = sorted(os.listdir(in_dir))

    for fname in in_flist:
        # macOS drops metadata files (.DS_Store, ._*.pkl) onto external volumes
        if fname.startswith("."):
            continue

        print("\n--------------- Resampling file {} ---------------".format(fname))
        fpath_in = os.path.join(in_dir, fname)
        fpath_out = os.path.join(out_dir, fname)

        # skip files that are done already
        if os.path.exists(fpath_out):
            print("File was already resampled")
            continue

        # read the county-level tweet file
        # NOTE: unpickling can execute arbitrary code; only read files we created ourselves
        df = pd.read_pickle(fpath_in)

        df_merged_daily = resample_sentiments_daily(df)

        # write to a temporary name first so an interrupted run never leaves a
        # truncated file that the "already resampled" check would later skip
        fpath_tmp = fpath_out + ".tmp"
        df_merged_daily.to_pickle(fpath_tmp)
        os.replace(fpath_tmp, fpath_out)


--------------- Resampling file 55125.pkl ---------------

--------------- Resampling file 18025.pkl ---------------

--------------- Resampling file 11001.pkl ---------------

--------------- Resampling file 18031.pkl ---------------

--------------- Resampling file 55131.pkl ---------------

--------------- Resampling file 18019.pkl ---------------

--------------- Resampling file 55119.pkl ---------------

--------------- Resampling file 37005.pkl ---------------

--------------- Resampling file 28125.pkl ---------------

--------------- Resampling file 21101.pkl ---------------

--------------- Resampling file 21115.pkl ---------------

--------------- Resampling file 37011.pkl ---------------

--------------- Resampling file 28131.pkl ---------------

--------------- Resampling file 29207.pkl ---------------

--------------- Resampling file 37039.pkl ---------------

--------------- Resampling file 28119.pkl ---------------

--------------- Resampling file 24045.pkl -------------