# Filtering

Traverse the tweets and output a filtered subset of the tweets to use.

In [1]:
# Libraries

%run utilities.py
from multiprocessing import Pool  # faster
from time import time as time     # timing

#### Functions

In [2]:
# Functions
def parallelize_series(series, func, processes=6):
    """
    Applies func to every item of series using a pool of worker processes.

    series:    iterable of items to process
    func:      callable applied to each item (it is handed to Pool.map, so
               it has to be usable from a worker process)
    processes: number of worker processes; defaults to 6, the value that
               used to be hard coded

    Returns the list built by Pool.map: one result per item, in input order.
    An exception raised by func propagates to the caller.
    """
    pool = Pool(processes)
    try:
        return pool.map(func, series)
    finally:
        # Always release the workers, even when func raised; otherwise the
        # worker processes would be left behind.
        pool.close()
        pool.join()

def filter_tweet_df(in_f, nrows=2000):
    """
    Formats and returns a dataframe of tweets

    in_f:  CSV source handed straight to pd.read_csv
    nrows: maximum number of rows read from the file; defaults to 2000,
           the value that used to be hard coded

    Only the columns listed in cols_scrape are read.
    NOTE(review): cols_scrape is not defined in this notebook -- presumably
    it comes from utilities.py; confirm.

    Rows with a missing or malformed date, or a missing longitude/latitude,
    are dropped. The 'date' column of the result is parsed to datetimes.
    """
    def format_date(raw):
        # Build 'Mon-DD-YYYY' from the 2nd, 3rd and last whitespace-separated
        # tokens of the raw timestamp text.
        parts = raw.split()
        if len(parts) < 3:
            # Blank/short cells used to raise IndexError and abort the whole
            # file; return a missing value so dropna() below removes the row.
            return None
        return parts[1] + '-' + parts[2] + '-' + parts[-1]

    conversions = {'retweet': lambda x: 1 if (x == 'RT') else x,
                   'date': format_date,
                   }

    raw_df = pd.read_csv(in_f, usecols=cols_scrape,     # filtering
                         low_memory=False, engine='c',  # optimization
                         converters=conversions,        # format attrs
                         nrows=nrows)                   # More filtering

    # Drop rows that'll give us trouble
    raw_df.dropna(subset=['date', 'longitude', 'latitude'], how='any', inplace=True)
    raw_df['date'] = pd.to_datetime(raw_df['date'], format="%b-%d-%Y")

    return raw_df

def process_file(f):
    """
    Filters one raw tweet file and writes the result as a new csv.

    f: path of the raw file ('/' separated)

    The output path is built from the global outdir, so outdir has to be
    set before this is called. Returns a log record (dict) with the raw file
    name ('f'), whether filtering worked ('succeed') and the output path
    ('new_f'). Failures are printed and reported in the record, not raised.
    """
    source_name = f.split('/')[-1]
    target_path = name_file_path(format_new_file(f, 'filtered'), outdir)

    # Assume failure until the filtered file has actually been written
    outcome = {'f': source_name, 'succeed': False, 'new_f': target_path}

    try:
        filter_tweet_df(f).to_csv(target_path, index=False)
        print(source_name + ': x')
        outcome['succeed'] = True
    except Exception as e:
        print("Couldn't read: \t{}".format(f), e)

    return outcome

#### Variables

In [3]:
# Grab all of the files
# NOTE(review): ls_files_list, make_new_dir_date, name_file_path and the *_dir
# names are not defined in this notebook -- presumably they come from
# utilities.py; confirm.
raw_tweets = ls_files_list(external_scrape_dir)

# new directory
# process_file reads outdir as a global, so this cell has to run before the
# filtering cell.
outdir = make_new_dir_date(processed_finals_dir)

# combined files in the new dir
combined_f = name_file_path('combined.csv', outdir)

Directory already exists, but you can still have the file name


#### Run the filtering

In [4]:
# set up
start = time()  # timer
num_files = len(raw_tweets)

# execution: filter every raw file in parallel; each call returns one log record
log_list = parallelize_series(raw_tweets, process_file)
log_df = pd.DataFrame(log_list)

# clean up
end = time()
# With no input files the frame has no 'succeed' column, so report zero
# instead of raising a KeyError.
num_converted = int(log_df['succeed'].sum()) if num_files else 0

# reporting (the file count used to be a hard-coded 5)
print ('{} / {} files were filtered.'.format(num_converted, num_files))
print ('{} files took: {} seconds.'.format(num_files, end-start))

tweets_immigrant_34315.csv: x
tweets_immigrant_34375.csv: x
tweets_immigrant_34363.csv: x
tweets_immigrant_34339.csv: x
tweets_immigrant_34316.csv: x
tweets_immigrant_34376.csv: x
tweets_immigrant_34364.csv: x
tweets_immigrant_34351.csv: x
tweets_immigrant_34340.csv: x
tweets_immigrant_34327.csv: x
tweets_immigrant_34317.csv: x
tweets_immigrant_34377.csv: x
tweets_immigrant_34365.csv: x
tweets_immigrant_34341.csv: x
tweets_immigrant_34318.csv: x
tweets_immigrant_34352.csv: x
tweets_immigrant_34378.csv: x
tweets_immigrant_34366.csv: x
tweets_immigrant_34328.csv: x
tweets_immigrant_34342.csv: x
tweets_immigrant_34319.csv: x
tweets_immigrant_34379.csv: x
tweets_immigrant_34367.csv: x
tweets_immigrant_34343.csv: x
tweets_immigrant_34353.csv: x
tweets_immigrant_34320.csv: x
tweets_immigrant_34380.csv: x
tweets_immigrant_34329.csv: x
tweets_immigrant_34368.csv: x
tweets_immigrant_34344.csv: x
tweets_immigrant_34381.csv: x
tweets_immigrant_34369.csv: x
tweets_immigrant_34321.csv: x
tweets_imm

#### Log File

In [5]:
# Write the per-file log records to log.csv, with the path built from outdir
log_f = name_file_path('log.csv', outdir)
log_df.to_csv(log_f, index=False)

# take a peek
log_df.head()

Unnamed: 0,f,new_f,succeed
0,tweets_immigrant_34315.csv,../../data/processed/finals/5-7/tweets_immigra...,True
1,tweets_immigrant_34316.csv,../../data/processed/finals/5-7/tweets_immigra...,True
2,tweets_immigrant_34317.csv,../../data/processed/finals/5-7/tweets_immigra...,True
3,tweets_immigrant_34318.csv,../../data/processed/finals/5-7/tweets_immigra...,True
4,tweets_immigrant_34319.csv,../../data/processed/finals/5-7/tweets_immigra...,True


#### Combine all files

In [6]:
# keep the output paths of the files that were filtered successfully
good_fs = log_df.loc[log_df['succeed'] == True, 'new_f'].tolist()

In [7]:
## write it out
# Stream every successful file into combined_f in a single pass. Only the
# first file written contributes the header row. (Previously the first file
# was written in full to create the header and then appended again by the
# loop, duplicating its rows; an empty good_fs also raised IndexError.)
with open(combined_f, 'w') as f:
    wrote_header = False
    for f_ in good_fs:
        # skip the log
        if f_.split('/')[-1] == 'log.csv':
            continue
        part = pd.read_csv(f_, low_memory=False, engine='c')
        part.to_csv(f, index=False, header=not wrote_header)
        wrote_header = True