**This code sorts the raw tweets with the computed sentiment scores into county-level files based on their FIPS codes.**

In [2]:
import os
import numpy as np
import pandas as pd

In [None]:
# Folder holding the raw tweet files (pickled data frames) and the folder that
# receives one pickle file per county (named after its FIPS code).
in_dir = "/Users/felix/ETH/Shared/100/Geolocated"
out_dir = "/Users/felix/ETH/Shared/100/Counties_Felix"

# Make sure the output folder exists before the first county file is written.
os.makedirs(out_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order, so sort them first; otherwise
# the seeded shuffle below is not reproducible between runs or machines.
# Hidden entries (e.g. macOS ".DS_Store") are skipped -- assumption: they are
# never pickled tweet files and would only make pd.read_pickle fail later.
in_flist = sorted(f for f in os.listdir(in_dir) if not f.startswith("."))

# Process the files in a fixed pseudo-random order.
np.random.seed(42)
np.random.shuffle(in_flist)

# to store a summary of number of tweets dropped due to missing geolocation:
# input file name -> [n_before, n_after, n_missing, n_missing_ratio]
missing_geolocation = {}

# Loop over the raw tweet files and distribute their rows into one pickle file
# per county, keyed by FIPS code. For every input file a row count summary is
# recorded in `missing_geolocation`.
# NOTE(review): pd.read_pickle must only be used on trusted files -- unpickling
# can execute arbitrary code.
for fname in in_flist:
    print("\n--------------- Processing file {} ---------------".format(fname))

    # read file, auto-convert dtypes and use the "Date" column as index
    df = pd.read_pickle(os.path.join(in_dir, fname))
    df = df.infer_objects()
    df = df.set_index("Date")

    # TODO: It seems that the geolocation script did not locate all tweets.
    # have to deal with those somehow. Drop them for now to experiment further.
    #
    # Select with a boolean mask instead of df.drop(<index labels>): the Date
    # index is not unique, so dropping by label would also remove geolocated
    # tweets that happen to share a timestamp with an unlocated one.
    # NaN codes are treated as missing as well (assumption: a tweet without a
    # FIPS value cannot be assigned to a county either).
    has_fips = df.FIPS.notna() & (df.FIPS != "")
    df_cleaned = df.loc[has_fips]

    # count missing tweets
    n_before = df.shape[0]
    n_after = df_cleaned.shape[0]
    n_missing = n_before - n_after
    # an empty input file has no tweets to lose; avoid dividing by zero
    n_missing_ratio = 1 - (n_after / n_before) if n_before else 0.0
    missing_geolocation[fname] = [n_before, n_after, n_missing, n_missing_ratio]

    print("Dropped {} tweets due to missing geolocation.\n".format(n_missing))

    # iterate over the FIPS codes found in this raw file and store the
    # corresponding tweets in separate, county-level data frames.
    # groupby splits the frame in a single pass; sort=False keeps the codes in
    # order of first appearance.
    for fips_code, sub_df_county in df_cleaned.groupby("FIPS", sort=False):

        # a distinct name, so the outer loop variable `fname` is not clobbered
        county_fname = "{}.pkl".format(fips_code)
        fpath = os.path.join(out_dir, county_fname)

        # first time this county is seen: just save its tweets
        if not os.path.exists(fpath):
            print("creating new file for county with fips code {}.".format(fips_code))
            sub_df_county.to_pickle(fpath)
            continue

        print("county file for fips code {} already exists.".format(fips_code))

        # open existing file and append tweets as new rows
        existing_df_county = pd.read_pickle(fpath)
        merged_df_county = pd.concat([existing_df_county, sub_df_county], axis=0)

        # drop potential duplicate rows based on unique tweet ID (ESSENTIAL STEP);
        # without it, processing the same raw file twice would duplicate tweets.
        merged_df_county = merged_df_county.drop_duplicates(subset="ID")

        # re-sort tweets by date
        merged_df_county = merged_df_county.sort_values("Date", ascending=True)
        merged_df_county.to_pickle(fpath)

        # number of new unique tweets appended: rows after merging minus rows
        # that were already stored in the county file
        new_tweets = merged_df_county.shape[0] - existing_df_county.shape[0]
        if new_tweets > 0:
            print("appended {} new tweets to county with fips code {}\n".format(new_tweets, fips_code))
        else:
            print("appended no new tweets to county with fips code {}\n".format(fips_code))
            
# Write the per-file summary of tweets dropped due to missing geolocation to an
# Excel sheet: one row per input file, one column per recorded quantity.
summary_labels = ["n_before", "n_after", "n_missing", "n_missing_ratio"]
df_missing_geolocation = pd.DataFrame(missing_geolocation, index=summary_labels).T
df_missing_geolocation.to_excel("/Users/felix/ETH/Shared/100/info_missing_geolocation_felix.xlsx")