# Data augmentation

In [1]:
# Import packages

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

import random
random.seed(32)
from time import time, gmtime
from tqdm.notebook import tqdm

3000
8500


In [2]:
# Load the comment data (Big Five subset only).
df = pd.read_csv('/home/sophia/ma_py/pandora_bigfive.csv')

# Load the author profiles and keep just the author name plus the five trait scores.
authors = pd.read_csv('/home/sophia/ma_py/author_profiles.csv')
trait_columns = ['agreeableness', 'openness', 'conscientiousness', 'extraversion', 'neuroticism']
bigfive = authors[['author'] + trait_columns]
# Keep only the authors that have a value for every one of the five traits.
bigfive = bigfive.dropna(subset=trait_columns)
del authors  # the full profile table is not used again in this cell

# Restrict the comments to authors with complete Big Five labels.
authorlst = bigfive['author'].unique()
print(len(authorlst))
df = df[df['author'].isin(authorlst)]
df

1568


Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,gilded,id,subreddit,ups,word_count,word_count_quoteless,lang
0,-Areopagan-,,Your first and second question is the same que...,,1513882848,t5_32jqy,t3_72l671,t1_drizkiv,1.0,0,0,drkz7z6,JordanPeterson,,201,201,en
1,-Areopagan-,,"I have two friends. I alienate everyone, event...",,1513744846,t5_32jqy,t3_72l671,t1_drhpj7t,1.0,0,0,dri9x7l,JordanPeterson,,123,123,en
2,-Areopagan-,,I suggest the future. You aren't going back in...,,1522253427,t5_32jqy,t3_80q8vm,t1_dwf58q7,5.0,0,0,dwfb6pa,JordanPeterson,,123,123,en
3,-Areopagan-,,I am smarter than you and will work you into d...,,1513704382,t5_32jqy,t3_72l671,t3_72l671,2.0,0,0,drh9s6q,JordanPeterson,,70,55,en
4,-Areopagan-,,Yeah I wouldnt want to deal with someone like ...,,1515531740,t5_32jqy,t3_72l671,t1_dsfnzvi,1.0,0,0,dsfo2zk,JordanPeterson,,14,14,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3103203,yrelav_dnomyar,,What are your qualities and traits preferred i...,,1471556597,t5_2qowo,t3_4ycvdt,t3_4ycvdt,1.0,0,0,d6n7gwr,intj,1.0,915,896,en
3103204,yrelav_dnomyar,,"Thanks for your input, I too enjoy that the T ...",,1471984713,t5_2qowo,t3_4z35f6,t1_d6tjjmy,2.0,0,0,d6tr1ka,intj,2.0,62,61,en
3103205,yrelav_dnomyar,,WHYYYYYYY,,1515442412,t5_3db06,t3_7p0ihe,t1_dsdluzd,4.0,0,0,dsdqfai,GH5,,1,1,
3103206,yrelav_dnomyar,,"Yes, of course I do! As long as you are super ...",,1522258616,t5_2tm8b,t3_868uag,t1_dw5bdck,2.0,0,0,dwfh3ss,weddingvideography,,27,27,en


In [6]:
# Deterministic augmentation: sampling uses a fixed random_state, so reruns produce the same subsamples.

def augment_comments(df, min_comments=100, fractions=None, random_state=1):
    """Augment a comment table with pseudo-authors built from comment subsamples.

    For every author with more than ``min_comments`` rows, one subsample of
    that author's rows is drawn per entry in ``fractions`` (without
    replacement). Each subsample gets a fresh 0..k-1 index and its ``author``
    value is replaced by ``<author>_new<fraction>``. The original rows come
    first in the result, followed by the subsamples in order of first
    appearance of the author and then in order of ``fractions``.

    Parameters
    ----------
    df : pandas.DataFrame
        Comment table with an ``author`` column holding strings. Not modified.
    min_comments : int, default 100
        Only authors with strictly more rows than this are augmented.
    fractions : iterable of float, optional
        Fractions of an author's rows to sample. Defaults to
        ``np.arange(0.1, 1.0, 0.1, float)``. Labels use ``str()`` of each
        value, so some default labels carry floating-point noise
        (e.g. ``0.30000000000000004``); this is kept so existing labels
        do not change.
    random_state : int, default 1
        Seed passed to every ``DataFrame.sample`` call.
        NOTE(review): the same seed is reused for every fraction, so the
        subsamples of one author presumably overlap heavily - confirm this
        is intended.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the original rows plus all subsamples.
    """
    t0 = time()
    if fractions is None:
        fractions = np.arange(0.1, 1.0, 0.1, float)

    originalauthors = df['author'].unique()
    currentn = len(originalauthors)
    print("Number of authors at the beginning: ", currentn)

    # Collect the pieces and concatenate once at the end. Appending to the
    # growing frame inside the loop re-copies every accumulated row on each
    # step, and DataFrame.append no longer exists in pandas >= 2.0.
    pieces = [df]
    # One grouping pass instead of a full boolean scan of df per author;
    # sort=False keeps the authors in order of first appearance.
    grouped = df.groupby('author', sort=False)
    for person, oneauthordf in tqdm(grouped, total=grouped.ngroups):
        if len(oneauthordf) <= min_comments:
            continue
        for number in fractions:
            label = person + '_new' + str(number)
            newcomments = (
                oneauthordf
                .sample(frac=number, replace=False, random_state=random_state)
                .reset_index(drop=True)
                .assign(author=label)
            )
            pieces.append(newcomments)

    newdf = pd.concat(pieces)

    newn = len(newdf['author'].unique())
    # Guard against an empty input, which would otherwise divide by zero.
    mul = newn / currentn if currentn else float('nan')
    endtime = time() - t0
    printtime = endtime / 3600
    print("\n\nAugmentation done in  %0.1fs" % endtime, ", in hours %0.1fh" % printtime,
          "\nNew number of authors: ", newn,
          ", Multiplication factor: ", mul)
    return newdf

# Run the augmentation on the comment frame `df`.
aug_df = augment_comments(df)
# Persist the result; the path is relative, so the file is written to the current working directory.
aug_df.to_pickle("pandora_b5_deter.pkl")
# Last expression of the cell: display the augmented frame.
aug_df

Number of authors at the beginning:  1568


  0%|          | 0/1568 [00:00<?, ?it/s]



Augmentation done in  49203.7s , in hours 13.7h 
New number of authors:  11594 , Multiplication factor:  7.394132653061225


Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,gilded,id,subreddit,ups,word_count,word_count_quoteless,lang
0,-Areopagan-,,Your first and second question is the same que...,,1513882848,t5_32jqy,t3_72l671,t1_drizkiv,1.0,0,0,drkz7z6,JordanPeterson,,201,201,en
1,-Areopagan-,,"I have two friends. I alienate everyone, event...",,1513744846,t5_32jqy,t3_72l671,t1_drhpj7t,1.0,0,0,dri9x7l,JordanPeterson,,123,123,en
2,-Areopagan-,,I suggest the future. You aren't going back in...,,1522253427,t5_32jqy,t3_80q8vm,t1_dwf58q7,5.0,0,0,dwfb6pa,JordanPeterson,,123,123,en
3,-Areopagan-,,I am smarter than you and will work you into d...,,1513704382,t5_32jqy,t3_72l671,t3_72l671,2.0,0,0,drh9s6q,JordanPeterson,,70,55,en
4,-Areopagan-,,Yeah I wouldnt want to deal with someone like ...,,1515531740,t5_32jqy,t3_72l671,t1_dsfnzvi,1.0,0,0,dsfo2zk,JordanPeterson,,14,14,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,turncloak471_new0.9,,"Alright, I feel a consensus has been reached.",,1488723971,t5_2ugo7,t3_5xmwul,t1_dejatn3,1.0,0,0,dejaxo8,MechanicalKeyboards,,8,8,en
186,turncloak471_new0.9,,The first time I saw an attached tail was duri...,0.0,1497497156,t5_34o9s,t3_6hc0ji,t1_dix7pg0,17.0,0,0,dix9imr,starterpacks,0.0,28,28,en
187,turncloak471_new0.9,,"I walk by that fucking flier every day, thank ...",,1489875216,t5_35j1r,t3_603ceb,t1_df3dswg,7.0,0,0,df3vk14,CringeAnarchy,,18,18,en
188,turncloak471_new0.9,INTJ,But don't you think it's starting to become ac...,0.0,1499471808,t5_2qlr2,t3_6luzxt,t1_djwwzwt,1.0,0,0,djxdkcj,introvert,0.0,29,29,en
