# Summary

Exploration of aggreagting data into combined datasets.

In [1]:
import os, sys
import json

import pandas as pd

# Time
from datetime import datetime, timedelta



## Define functions

In [2]:
def read_data_files(path: str,
                    keywords: list,
                    verbose: bool = True):
    '''
    Function read a collection of files from the path
    all having the keywords in their name and to return
    a dataframe.  This assumes the columns are the same for 
    the data in each file.
    
    '''
    # Read individual unscored files
    dfs = []

    for dfile in os.listdir(path):
        
        # Assume file should be read
        file_name_match = True
        
        # Check for keywords
        for kw in keywords:
            if dfile.find(kw) < 0:
                if verbose:
                    print('file name does not match: {}'.format(dfile))
                file_name_match = False
                
        if file_name_match:
            if verbose:
                print('reading: {}'.format(dfile))
            df = pd.read_csv(filepath_or_buffer=os.path.join(path, dfile))
            dfs.append(df)

    # Create a single dataframe        
    df = pd.concat(objs=dfs) 
    df = df.reset_index(drop=True)
    
    return df

## Aggregate reddit data

In [3]:
posts_file_path = "../../../data/reddit/posts"
posts_file_path = "../../data_tests/reddit_tests/posts"

df_red1 = read_data_files(path=posts_file_path,
                          keywords=['.csv', 'posts'])


file name does not match: .DS_Store
file name does not match: .DS_Store
reading: britishcolumbia_posts_data.csv
file name does not match: .ipynb_checkpoints
file name does not match: .ipynb_checkpoints


In [4]:
df_red1['created_utc'].min()
df_red1

print(len(df_red1['id'].unique()))
print(len(df_red1['id'].tolist()))

df_red1


305
305


Unnamed: 0,search_term,id,created_utc,author,subreddit,title,selftext,url,num_comments
0,esquimalt nation office,1acco4p,2024-01-27 15:01:20,AutoModerator,britishcolumbia,Regions of BC: Revelstoke,Here at [/r/BritishColumbia](https://old.reddi...,https://www.reddit.com/r/britishcolumbia/comme...,9
1,society,1apm3yy,2024-02-13 05:39:57,CapableSecretary420,britishcolumbia,The draft B.C. Biodiversity and Ecosystem Heal...,,https://www.timescolonist.com/opinion/trevor-h...,3
2,society,1acxhkg,2024-01-28 07:50:03,Professional-Arm-667,britishcolumbia,Is it just me or is it getting harder and hard...,"Vision checked, etc, all is well there. But it...",https://www.reddit.com/r/britishcolumbia/comme...,219
3,society,1ake11d,2024-02-06 16:38:21,CapableSecretary420,britishcolumbia,Crypto mining company loses bid to force B.C. ...,,https://vancouversun.com/business/local-busine...,145
4,society,1aomefz,2024-02-11 23:57:02,Blacksmith-Confident,britishcolumbia,Hi! I'm currently searching for a design/ prod...,I am a second year product design student curr...,https://www.reddit.com/r/britishcolumbia/comme...,3
...,...,...,...,...,...,...,...,...,...
300,all_new_posts,1az9qaa,2024-02-25 00:08:12,CapableSecretary420,britishcolumbia,Alaskan tribes claim B.C. mining threatens hea...,,https://www.richmond-news.com/highlights/alask...,40
301,all_new_posts,1az5et1,2024-02-24 21:07:21,ubcstaffer123,britishcolumbia,"BC Ukrainians, newcomers rally to protest two ...",,https://vancouversun.com/news/local-news/b-c-u...,20
302,all_new_posts,1az3cc9,2024-02-24 19:41:43,lamdefinitelynotadog,britishcolumbia,Northern B.C. dog survives run-in with road-si...,,https://www.todayinbc.com/news/northern-bc-dog...,5
303,all_new_posts,1az2xrs,2024-02-24 19:24:36,zuqwaylh,britishcolumbia,Anyone have the history about this “cup” I got...,,https://www.reddit.com/gallery/1az2xrs,74


In [5]:
# Write this combined file
redit_datasets_path = '../../data_tests/reddit_tests/datasets'

first_date = df_red1['created_utc'].min()[:10]
last_date = df_red1['created_utc'].max()[:10]

out_file = "rd_dataset_{}_{}.csv".format(first_date, last_date)

# df_red1.to_csv(path_or_buf=os.path.join(redit_datasets_path, out_file), index=False)


In [7]:
first_date
last_date


'2024-03-02'

## Aggregate Twitter data

In [4]:
twitter_phase2_file_path_all = "../../../../SWB-GVCEH/data/processed/twitter"

twitter_phase2_file_path_scored = "../../../../SWB-GVCEH/Tests"

# os.listdir(twitter_phase2_file_path_unscored)
# os.listdir(twitter_phase2_file_path_scored)


df_tall = read_data_files(path=twitter_phase2_file_path_all,
                          keywords=['.csv', 'GVCEH'],
                          verbose=False)

df_ts = read_data_files(path=twitter_phase2_file_path_scored,
                        keywords=['.csv', 'GVCEH'],
                        verbose=False)

print('Unscored and scored dataframe length: {}'.format(len(df_tall)))
print('Scored dataframe length: {}'.format(len(df_ts)))




Unscored and scored dataframe length: 3204
Scored dataframe length: 427


In [5]:
df_ts.head()

Unnamed: 0.1,Unnamed: 0,text,scrape_time,tweet_id,created_at,reply_count,quote_count,like_count,retweet_count,geo_full_name,geo_id,username,user_location,num_followers,search_keywords,search_neighbourhood,sentiment,score
0,61,The second home crisis in Arisaig is about as ...,2024-02-08 17:22:00.526308,1755128018878464455,2024-02-07 07:15:03+00:00,0,0,1,0,,,AmericanS53312,"Florida, USA",40,(900-block pandora avenue OR esquimalt OR high...,900-block pandora avenue OR esquimalt OR highl...,negative,0.589752
1,70,Small business family farmers should have more...,2024-02-08 17:22:00.526503,1754996179262382352,2024-02-06 22:31:10+00:00,0,0,0,0,,,NH_BoysHoops,,59,(900-block pandora avenue OR esquimalt OR high...,900-block pandora avenue OR esquimalt OR highl...,neutral,0.578374
2,78,Imagine Janice KNOWING a Woman in #Saanich vio...,2024-02-08 17:22:00.696670,1754928897131741411,2024-02-06 18:03:48+00:00,0,0,0,0,,,BleachBruv,,0,(900-block pandora avenue OR esquimalt OR high...,900-block pandora avenue OR esquimalt OR highl...,negative,0.898957
3,112,"No state money for doing so, but a North Highl...",2024-02-08 17:22:03.660465,1755419930453852237,2024-02-08 02:35:00+00:00,0,0,0,0,,,benvsacbiz,"Sacramento, CA",2819,(900-block pandora avenue OR esquimalt OR high...,900-block pandora avenue OR esquimalt OR highl...,neutral,0.74997
4,113,Hard to understate the significance of a wave ...,2024-02-08 17:22:03.660499,1754978578804908244,2024-02-06 21:21:13+00:00,13,5,332,97,,,andrew_thin,,4516,(900-block pandora avenue OR esquimalt OR high...,900-block pandora avenue OR esquimalt OR highl...,positive,0.947558


In [6]:
# Drop a colun
# df_ts = df_ts.drop(labels=['Unnamed: 0'], axis=1)


# Add a column for relevance to the scored tweets
df_ts['is_relevant'] = True

# Filter out the irrelevant tweets
mask = ~df_tall['tweet_id'].isin(df_ts['tweet_id'].tolist())
df_tall = df_tall[mask]

# Add scoring columns
df_tall['score'] = -1.
df_tall['sentiment'] = 'no value'
df_tall['is_relevant'] = False


# 
df_tw = pd.concat(objs=[df_ts, df_tall])
df_tw = df_tw.reset_index(drop=True)

print('Unscored dataframe length: {}'.format(len(df_tall)))
print('Scored dataframe length: {}'.format(len(df_ts)))

print('Combined dataframe length: {}'.format(len(df_tw)))


Unscored dataframe length: 2777
Scored dataframe length: 427
Combined dataframe length: 3204


In [28]:
# Write this combined file
twitter_datasets_path = '../../data_tests/xtwitter_tests/datasets'

first_date = df_tw['created_at'].min()[:10]
last_date = df_tw['created_at'].max()[:10]

out_file = "xt_dataset_{}_{}.csv".format(first_date, last_date)

df_tw.to_csv(path_or_buf=os.path.join(twitter_datasets_path, out_file), index=False)




## Review labeling data

In [8]:
labeled_data_path = "../../data_tests/modeling_tests/labeling/data"


os.listdir(labeled_data_path)


['reddit_manual_labels_sample01.json',
 'reddit_data_sample.json',
 'Complete_Data.json']

In [40]:
# df = pd.read_json(path_or_buf=os.path.join(labeled_data_path,"reddit_manual_labels_sample01.json"))







# with open(os.path.join(labeled_data_path,file_name), "r") as f:
#     jdata = f.read()
    
# data = json.loads(jdata)

# jdata

# jdata[937:]

# df = pd.read_json(path_or_buf=jdata)
                  

In [58]:
# df_man1


In [56]:
file_name = "reddit_manual_labels_sample01.json"
df_man1 = pd.read_json(path_or_buf=os.path.join(labeled_data_path,file_name), lines=True)
df_man1.head()

df_man1


mask1 = df_man1['manual_label'] == 1

print('Relevant')
for idx in df_man1[mask1].index[:3]:
    print(idx)
    print(df_man1.loc[idx, 'TitleText'])
    
mask0 = df_man1['manual_label'] == 0

print('Irrelevant')
for idx in df_man1[mask0].index[:3]:
    print(idx)
    print(df_man1.loc[idx, 'TitleText'])   

Relevant
61
The mobile shower paid 142 000 $ to clean the homeless does not show up.. Resident of Beacon Hill Park since June 2020 never had the opportunity to see or clean themselves in the [mobile shower](https://www.timescolonist.com/news/local/salvation-army-to-run-mobile-shower-service-for-people-without-homes-1.24250177) as it does not show up.

 

Away from the absurdity that 142 000 $ is the cost to run a 4 months mobile shower that does not appear, grace and rewards goes to "Food not bomb", "the Red Cedar Cafe" distributing food and care on a volunteer base shows once it for all the calumny of the societal service capitalized on the mentally damaged and handicapped that are named homeless.




The homeless community stepped out of the mud like a lotus flower and built a shower and a care tent in Beacon Hill Park to provide heat, clothing and moral support for free run 24/7 was taken down instead  of improving it to regulation.




The ethical dimension spirals down further on 

In [52]:
file_name = "reddit_data_sample.json"
df_man2 = pd.read_json(path_or_buf=os.path.join(labeled_data_path,file_name), lines=True)
df_man2.head()

df_man2


for idx in df_man2.index:
    print(df_man2.loc[idx, 'TitleText'])



Motorcycle stolen right out of driveway. [Stolen Cafe-racer style motorcycle](http://imgur.com/a/VrlnH)

~~My good friend's bike was taken last night, right out of his driveway near Elk Lake. It's reasonably unique looking, so if you happen to see this bike around, please notify Saanich PD, as they have a file open.~~


**Bike was recovered by Saanich PD and is back home with it's owner! Thank you all very much for your assistance. This is a great community.** 
Saanich’s famous Tuxedo Drive Christmas display for sale - Last year for famous Tuxedo Drive Christmas display, unless you want to buy it. 
Worst Uber Ever. 
Did anyone listen to MOVE 103.5 FM this morning, November 21st?. I usually catch the Nat and Drew show on a daily basis in hopes of winning the Swift tickets, but I missed it this morning. Did anyone catch who the winner was this morning? And did they have to redraw a name?

&#x200B;

My daughter is a huge swiftie and I would love to win it for this Christmas
BC Rugby - Vik

In [46]:
file_name = 'Complete_Data.json'
df2 = pd.read_json(path_or_buf=os.path.join(labeled_data_path, file_name), lines=True)
df2.head()


Unnamed: 0,index,Subreddit,Title,Text,TitleText,relevance_score,most_common_centroid_id,Score_model2,label_model2,label_model1,relevant_sentences,topic_num,Relevant_document,Relevant_topic,topic_label,Sentiment_Full,Sentence_Level_Sentiment_Compund,Relevent_Sentence_Sentiment_Compund
0,"{'0': '0', '1': '1', '2': '2', '3': '3', '4': ...","{'0': 'VictoriaBC', '1': 'VictoriaBC', '2': 'V...",{'0': 'True change around homelessness from th...,{'0': 'I've started my own organization to exp...,{'0': 'True change around homelessness from th...,"{'0': 0.6363636364, '1': 1.0, '2': 0.0, '3': 0...","{'0': [1], '1': [5], '2': None, '3': None, '4'...","{'0': 63.64, '1': 100.0, '2': 0.0, '3': 0.0, '...","{'0': 1, '1': 1, '2': 0, '3': 0, '4': 0, '5': ...","{'0': 1, '1': 1, '2': 0, '3': 0, '4': 0, '5': ...",{'0': 'True change around homelessness from th...,"{'0': 34, '1': 11, '2': 36, '3': 36, '4': 66, ...","{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...","{'0': 1, '1': 1, '2': 0, '3': 0, '4': 0, '5': ...",{'0': 'Topic Label: Homelessness and Housing S...,"{'0': 'Positive', '1': 'Neutral', '2': 'Neutra...","{'0': 'Positive', '1': 'Neutral', '2': 'Neutra...","{'0': 'Positive', '1': 'Neutral', '2': 'Neutra..."
