# Summary

Exploration of aggreagting data into combined datasets.

In [8]:
import os, sys
import pandas as pd

# Time
from datetime import datetime, timedelta



## Define functions

In [29]:
def read_data_files(path: str,
                    keywords: list,
                    verbose: bool = True):
    '''
    Function read a collection of files from the path
    all having the keywords in their name and to return
    a dataframe.  This assumes the columns are the same for 
    the data in each file.
    
    '''
    # Read individual unscored files
    dfs = []

    for dfile in os.listdir(path):
        
        # Assume file should be read
        file_name_match = True
        
        # Check for keywords
        for kw in keywords:
            if dfile.find(kw) < 0:
                if verbose:
                    print('file name does not match: {}'.format(dfile))
                file_name_match = False
                
        if file_name_match:
            if verbose:
                print('reading: {}'.format(dfile))
            df = pd.read_csv(filepath_or_buffer=os.path.join(path, dfile))
            dfs.append(df)

    # Create a single dataframe        
    df = pd.concat(objs=dfs) 
    df = df.reset_index(drop=True)
    
    return df

## Aggregate reddit data

In [30]:
posts_file_path = "../../../data/reddit/posts"

df_red1 = read_data_files(path=posts_file_path,
                          keywords=['.csv', 'posts'])


file name does not match: .DS_Store
file name does not match: .DS_Store
reading: uvic_posts_data.csv
reading: VancouverIsland_posts_data.csv
reading: saltspring_posts_data.csv
reading: Sooke_posts_data.csv
reading: britishcolumbia_posts_data.csv
reading: Metchosin_posts_data.csv
reading: WestShoreBC_posts_data.csv
reading: SidneyBC_posts_data.csv
reading: Esquimalt_posts_data.csv
reading: OakBayBritishColumbia_posts_data.csv
reading: SaanichPeninsula_posts_data.csv


In [35]:
df_red1['created_utc'].min()
df_red1


Unnamed: 0,search_term,id,created_utc,author,subreddit,title,selftext,url,num_comments
0,backpack project victoria bc,19a6lp0,2024-01-19 01:13:49,uvic,uvic,Campus opening: Decision by 7 a.m. Jan. 19,UVic is closely monitoring [Environment Canada...,https://www.reddit.com/r/uvic/comments/19a6lp0...,20
1,lived experience,19cbv9c,2024-01-21 19:46:23,Consistent_Reply_240,uvic,Honest opinions,I recently got in to Uvic as well as a few oth...,https://www.reddit.com/r/uvic/comments/19cbv9c...,58
2,lived experience,19fkr6l,2024-01-25 21:46:56,Hungry_Awareness3174,uvic,Law Program/ Campus Experience,I have recently been accepted for my 1L in fal...,https://www.reddit.com/r/uvic/comments/19fkr6l...,0
3,mental health society greater victoria,1amr2xr,2024-02-09 15:46:11,koshka4life,uvic,How do you deal with period cramps and exam week?,I was studying for an exam last night and I fe...,https://www.reddit.com/r/uvic/comments/1amr2xr...,39
4,mental health society greater victoria,19a0rbm,2024-01-18 21:02:34,CriticalSecret1417,uvic,Multitudo sapientium sanitas orbis,"Fun Uvic Fact: Our Latin motto translates as, ...",https://i.redd.it/ef9l6enfj9dc1.jpeg,6
...,...,...,...,...,...,...,...,...,...
361,overdose,own79d,2021-08-02 20:33:09,RedditCareResources,u_RedditCareResources,Get support for yourself or other people,"If you’d like to talk to someone, confidential...",https://www.reddit.com/r/u_RedditCareResources...,2
362,reaching,19bekl9,2024-01-20 15:45:41,Emotional_Union6971,OakBayBritishColumbia,SMUS summer volleyball camp?,I'm considering registering my 15-year-old dau...,https://www.reddit.com/r/OakBayBritishColumbia...,2
363,overdose,own79d,2021-08-02 20:33:09,RedditCareResources,u_RedditCareResources,Get support for yourself or other people,"If you’d like to talk to someone, confidential...",https://www.reddit.com/r/u_RedditCareResources...,2
364,mental health recovery partners island,1akej76,2024-02-06 16:59:26,wallpaint101,SaanichPeninsula,Seeking Research Participants,"Hello. My name is Liam, and I am a fifth-year ...",https://www.reddit.com/r/SaanichPeninsula/comm...,0


In [34]:
# Write this combined file
redit_datasets_path = '../../data_tests/reddit_tests/datasets'

first_date = df_red1['created_utc'].min()[:10]
last_date = df_red1['created_utc'].max()[:10]

out_file = "rd_dataset_{}_{}.csv".format(first_date, last_date)

df_red1.to_csv(path_or_buf=os.path.join(redit_datasets_path, out_file), index=False)


## Aggregate Twitter data

In [4]:
twitter_phase2_file_path_all = "../../../../SWB-GVCEH/data/processed/twitter"

twitter_phase2_file_path_scored = "../../../../SWB-GVCEH/Tests"

# os.listdir(twitter_phase2_file_path_unscored)
# os.listdir(twitter_phase2_file_path_scored)


df_tall = read_data_files(path=twitter_phase2_file_path_all,
                          keywords=['.csv', 'GVCEH'],
                          verbose=False)

df_ts = read_data_files(path=twitter_phase2_file_path_scored,
                        keywords=['.csv', 'GVCEH'],
                        verbose=False)

print('Unscored and scored dataframe length: {}'.format(len(df_tall)))
print('Scored dataframe length: {}'.format(len(df_ts)))




Unscored and scored dataframe length: 3204
Scored dataframe length: 427


In [5]:
df_ts.head()

Unnamed: 0.1,Unnamed: 0,text,scrape_time,tweet_id,created_at,reply_count,quote_count,like_count,retweet_count,geo_full_name,geo_id,username,user_location,num_followers,search_keywords,search_neighbourhood,sentiment,score
0,61,The second home crisis in Arisaig is about as ...,2024-02-08 17:22:00.526308,1755128018878464455,2024-02-07 07:15:03+00:00,0,0,1,0,,,AmericanS53312,"Florida, USA",40,(900-block pandora avenue OR esquimalt OR high...,900-block pandora avenue OR esquimalt OR highl...,negative,0.589752
1,70,Small business family farmers should have more...,2024-02-08 17:22:00.526503,1754996179262382352,2024-02-06 22:31:10+00:00,0,0,0,0,,,NH_BoysHoops,,59,(900-block pandora avenue OR esquimalt OR high...,900-block pandora avenue OR esquimalt OR highl...,neutral,0.578374
2,78,Imagine Janice KNOWING a Woman in #Saanich vio...,2024-02-08 17:22:00.696670,1754928897131741411,2024-02-06 18:03:48+00:00,0,0,0,0,,,BleachBruv,,0,(900-block pandora avenue OR esquimalt OR high...,900-block pandora avenue OR esquimalt OR highl...,negative,0.898957
3,112,"No state money for doing so, but a North Highl...",2024-02-08 17:22:03.660465,1755419930453852237,2024-02-08 02:35:00+00:00,0,0,0,0,,,benvsacbiz,"Sacramento, CA",2819,(900-block pandora avenue OR esquimalt OR high...,900-block pandora avenue OR esquimalt OR highl...,neutral,0.74997
4,113,Hard to understate the significance of a wave ...,2024-02-08 17:22:03.660499,1754978578804908244,2024-02-06 21:21:13+00:00,13,5,332,97,,,andrew_thin,,4516,(900-block pandora avenue OR esquimalt OR high...,900-block pandora avenue OR esquimalt OR highl...,positive,0.947558


In [6]:
# Drop a colun
# df_ts = df_ts.drop(labels=['Unnamed: 0'], axis=1)


# Add a column for relevance to the scored tweets
df_ts['is_relevant'] = True

# Filter out the irrelevant tweets
mask = ~df_tall['tweet_id'].isin(df_ts['tweet_id'].tolist())
df_tall = df_tall[mask]

# Add scoring columns
df_tall['score'] = -1.
df_tall['sentiment'] = 'no value'
df_tall['is_relevant'] = False


# 
df_tw = pd.concat(objs=[df_ts, df_tall])
df_tw = df_tw.reset_index(drop=True)

print('Unscored dataframe length: {}'.format(len(df_tall)))
print('Scored dataframe length: {}'.format(len(df_ts)))

print('Combined dataframe length: {}'.format(len(df_tw)))


Unscored dataframe length: 2777
Scored dataframe length: 427
Combined dataframe length: 3204


In [28]:
# Write this combined file
twitter_datasets_path = '../../data_tests/xtwitter_tests/datasets'

first_date = df_tw['created_at'].min()[:10]
last_date = df_tw['created_at'].max()[:10]

out_file = "xt_dataset_{}_{}.csv".format(first_date, last_date)

df_tw.to_csv(path_or_buf=os.path.join(twitter_datasets_path, out_file), index=False)


