# Data augmentation

In [1]:
# Import packages

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

import random
random.seed(32)
from time import time, gmtime
from tqdm.notebook import tqdm

import pickle

In [2]:
# Import data, only b5:
df = pd.read_csv('/home/sophia/ma_py/pandora_bigfive.csv')
# Import dataset authors and delete not needed columns (big five labels)
authors = pd.read_csv('/home/sophia/ma_py/author_profiles.csv')
bigfive = authors[['author','agreeableness','openness','conscientiousness','extraversion','neuroticism']]
bigfive = bigfive[bigfive['openness'].notna()]
bigfive = bigfive[bigfive['conscientiousness'].notna()]
bigfive = bigfive[bigfive['extraversion'].notna()]
bigfive = bigfive[bigfive['agreeableness'].notna()]
bigfive = bigfive[bigfive['neuroticism'].notna()]
del authors

authorlst = bigfive['author'].unique()
print(len(authorlst))
df = df[df.author.isin(authorlst)]
df

1568


Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,gilded,id,subreddit,ups,word_count,word_count_quoteless,lang
0,-Areopagan-,,Your first and second question is the same que...,,1513882848,t5_32jqy,t3_72l671,t1_drizkiv,1.0,0,0,drkz7z6,JordanPeterson,,201,201,en
1,-Areopagan-,,"I have two friends. I alienate everyone, event...",,1513744846,t5_32jqy,t3_72l671,t1_drhpj7t,1.0,0,0,dri9x7l,JordanPeterson,,123,123,en
2,-Areopagan-,,I suggest the future. You aren't going back in...,,1522253427,t5_32jqy,t3_80q8vm,t1_dwf58q7,5.0,0,0,dwfb6pa,JordanPeterson,,123,123,en
3,-Areopagan-,,I am smarter than you and will work you into d...,,1513704382,t5_32jqy,t3_72l671,t3_72l671,2.0,0,0,drh9s6q,JordanPeterson,,70,55,en
4,-Areopagan-,,Yeah I wouldnt want to deal with someone like ...,,1515531740,t5_32jqy,t3_72l671,t1_dsfnzvi,1.0,0,0,dsfo2zk,JordanPeterson,,14,14,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3103203,yrelav_dnomyar,,What are your qualities and traits preferred i...,,1471556597,t5_2qowo,t3_4ycvdt,t3_4ycvdt,1.0,0,0,d6n7gwr,intj,1.0,915,896,en
3103204,yrelav_dnomyar,,"Thanks for your input, I too enjoy that the T ...",,1471984713,t5_2qowo,t3_4z35f6,t1_d6tjjmy,2.0,0,0,d6tr1ka,intj,2.0,62,61,en
3103205,yrelav_dnomyar,,WHYYYYYYY,,1515442412,t5_3db06,t3_7p0ihe,t1_dsdluzd,4.0,0,0,dsdqfai,GH5,,1,1,
3103206,yrelav_dnomyar,,"Yes, of course I do! As long as you are super ...",,1522258616,t5_2tm8b,t3_868uag,t1_dw5bdck,2.0,0,0,dwfh3ss,weddingvideography,,27,27,en


In [6]:
# deterministic

def augment_comments(df):
    newdf = df
    t0 = time()
    originalauthors = df['author'].unique()
    currentn = len(originalauthors)
    values = np.arange(0.1, 1.0, 0.1, float)
    print("Number of authors at the beginning: ", currentn)
    for index, person in enumerate(tqdm(originalauthors)):
#         newtime = time() - t0
#         newtime = newtime/60
#         print("Author", index, ", time %0.1fmin" % newtime)
        oneauthordf = df.loc[df['author'] == person]
        if len(oneauthordf) > 100:
            for number in values:
                newcomments = oneauthordf.sample(frac=number, replace=False, random_state=1)
                newcomments.reset_index(drop=True, inplace=True)
                newcomments['author'] = person + '_new' + str(number)
                newdf = newdf.append(newcomments)
        
    mul = len(newdf['author'].unique())/len(originalauthors)
    endtime = time() - t0
    printtime = endtime/3600
    print("\n\nAugmentation done in  %0.1fs" % (time() - t0), ", in hours %0.1fh" % printtime,
          "\nNew number of authors: ", len(newdf['author'].unique()), 
          ", Multiplication factor: ", mul)
    return newdf

aug_df = augment_comments(df)
aug_df.to_pickle("pandora_b5_deter.pkl")
aug_df

Number of authors at the beginning:  1568


  0%|          | 0/1568 [00:00<?, ?it/s]



Augmentation done in  49203.7s , in hours 13.7h 
New number of authors:  11594 , Multiplication factor:  7.394132653061225


Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,gilded,id,subreddit,ups,word_count,word_count_quoteless,lang
0,-Areopagan-,,Your first and second question is the same que...,,1513882848,t5_32jqy,t3_72l671,t1_drizkiv,1.0,0,0,drkz7z6,JordanPeterson,,201,201,en
1,-Areopagan-,,"I have two friends. I alienate everyone, event...",,1513744846,t5_32jqy,t3_72l671,t1_drhpj7t,1.0,0,0,dri9x7l,JordanPeterson,,123,123,en
2,-Areopagan-,,I suggest the future. You aren't going back in...,,1522253427,t5_32jqy,t3_80q8vm,t1_dwf58q7,5.0,0,0,dwfb6pa,JordanPeterson,,123,123,en
3,-Areopagan-,,I am smarter than you and will work you into d...,,1513704382,t5_32jqy,t3_72l671,t3_72l671,2.0,0,0,drh9s6q,JordanPeterson,,70,55,en
4,-Areopagan-,,Yeah I wouldnt want to deal with someone like ...,,1515531740,t5_32jqy,t3_72l671,t1_dsfnzvi,1.0,0,0,dsfo2zk,JordanPeterson,,14,14,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,turncloak471_new0.9,,"Alright, I feel a consensus has been reached.",,1488723971,t5_2ugo7,t3_5xmwul,t1_dejatn3,1.0,0,0,dejaxo8,MechanicalKeyboards,,8,8,en
186,turncloak471_new0.9,,The first time I saw an attached tail was duri...,0.0,1497497156,t5_34o9s,t3_6hc0ji,t1_dix7pg0,17.0,0,0,dix9imr,starterpacks,0.0,28,28,en
187,turncloak471_new0.9,,"I walk by that fucking flier every day, thank ...",,1489875216,t5_35j1r,t3_603ceb,t1_df3dswg,7.0,0,0,df3vk14,CringeAnarchy,,18,18,en
188,turncloak471_new0.9,INTJ,But don't you think it's starting to become ac...,0.0,1499471808,t5_2qlr2,t3_6luzxt,t1_djwwzwt,1.0,0,0,djxdkcj,introvert,0.0,29,29,en


## Read in preprocessed augmented data and correct trait scores for the fake authors

In [2]:
# read in aug_df
filepath = "aug_b5feat.pkl"
with open(filepath, 'rb') as f:
    aug_df = pickle.load(f)
aug_df.name = 'augmented_df'

In [3]:
# read in authorlist
filepath = "originalauthors.pkl"
with open(filepath, 'rb') as f:
    authors = pickle.load(f)

del filepath
del f

In [4]:
for original in tqdm(authors):
    res = [idx for idx in aug_df.index if idx[0:(len(original))] == original]
    if len(res) >1:
        # create df with trait values of original author and multiindex
        r = aug_df.loc[original, 'trait']
        r = pd.DataFrame(r)
        r = r.transpose()
        head = 15*['trait']
        columns = r.columns.values
        arrays = [head] + [columns]
        r.columns=pd.MultiIndex.from_arrays(arrays)
        # delete original author from list
        res.pop(0)
        # copy the row of the original author as many times as fake authors exist
        rows = pd.concat([r]*(len(res)))
        # change index to fake authors' names
        rows.index = res
        # update aug_df such that all fake authors copy their trait values from originala uthor
        aug_df.update(rows)

  0%|          | 0/1568 [00:00<?, ?it/s]

In [22]:
aug_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11594 entries, -Areopagan- to zyzee
Columns: 21207 entries, ('post', 'score') to ('lda100', 99)
dtypes: float32(97), float64(5150), int16(15944), object(16)
memory usage: 814.2+ MB


In [5]:
print("Create pickle")
filepath = "aug_b5feat_label.pkl"
with open(filepath, "wb") as f:
    pickled = pickle.dumps(aug_df, protocol=-1)
    f.write(pickled)

del f
del filepath
del pickled

Create pickle


In [8]:
aug_df['trait']

Unnamed: 0_level_0,agreeableness,openness,conscientiousness,extraversion,neuroticism,big5_a,big5_o,big5_c,big5_e,big5_n,big5_a_multi,big5_o_multi,big5_c_multi,big5_e_multi,big5_n_multi
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
-Areopagan-,0.0,99.0,96.0,60.0,1.0,0,1,1,1,0,0,4,4,3,0
-BigSexy-,39.0,92.0,1.0,18.0,4.0,0,1,0,0,0,1,4,0,0,0
-BigSexy-_new0.1,39.0,92.0,1.0,18.0,4.0,0,1,0,0,0,1,4,0,0,0
-BigSexy-_new0.2,39.0,92.0,1.0,18.0,4.0,0,1,0,0,0,1,4,0,0,0
-BigSexy-_new0.30000000000000004,39.0,92.0,1.0,18.0,4.0,0,1,0,0,0,1,4,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zymmaster_new0.6,28.0,47.0,62.0,21.0,49.0,0,0,1,0,0,1,2,3,1,2
zymmaster_new0.7000000000000001,28.0,47.0,62.0,21.0,49.0,0,0,1,0,0,1,2,3,1,2
zymmaster_new0.8,28.0,47.0,62.0,21.0,49.0,0,0,1,0,0,1,2,3,1,2
zymmaster_new0.9,28.0,47.0,62.0,21.0,49.0,0,0,1,0,0,1,2,3,1,2


### Split df: one version with only original authors, one with only new authors

In [16]:
new_augdf = aug_df.copy()
new_augdf = new_augdf.query('index not in @authors')

In [21]:
print("Create pickle")
filepath = "aug_b5feat_label_new.pkl"
with open(filepath, "wb") as f:
    pickled = pickle.dumps(new_augdf, protocol=-1)
    f.write(pickled)

del f
del filepath
del pickled

Create pickle


In [24]:
original_augdf = aug_df.copy()
original_augdf = original_augdf.query('index in @authors')
original_augdf

Unnamed: 0_level_0,post,post,post,post,subtf,subtf,subtf,subtf,subtf,post,...,lda100,lda100,lda100,lda100,lda100,lda100,lda100,lda100,lda100,lda100
Unnamed: 0_level_1,score,controversiality,gilded,ratio_en,num_subreddit,entropy,mean_time,median_time,max_time,lang,...,90,91,92,93,94,95,96,97,98,99
author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
-Areopagan-,2.000000,0.000000,0.000000,1.000000,1,0.000000,2.137261e+06,893447.0,-28521,1,...,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000
-BigSexy-,4.266715,0.020737,0.000000,18.697184,147,4.811834,1.003843e+04,760.0,-18659,22,...,0.000007,0.000007,0.530073,0.000007,0.000007,0.000007,0.000007,0.000007,0.000007,0.000007
-BlitzN9ne,9.644956,0.014159,0.000000,7.883648,116,4.865813,4.830648e+04,793.5,25124,38,...,0.000004,0.000004,0.000004,0.000004,0.000004,0.000004,0.000004,0.000004,0.000004,0.000004
-CrestiaBell,24.890661,0.017687,0.000866,18.204275,149,3.784498,1.220542e+04,1365.0,4466,54,...,0.000004,0.000004,0.000004,0.000004,0.000004,0.000004,0.000004,0.000004,0.000004,0.000004
-dyad-,7.234043,0.000000,0.000000,32.571430,5,1.484707,3.799737e+05,57538.0,-32559,2,...,0.000044,0.000044,0.000044,0.000044,0.769480,0.000044,0.000044,0.000044,0.000044,0.000044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zugzwang_03,9.599347,0.011709,0.000291,205.831329,146,3.449831,6.396812e+03,785.0,32009,16,...,0.000054,0.000054,0.000054,0.000054,0.000054,0.000054,0.000054,0.000054,0.000054,0.000054
zuluthrone,12.150923,0.018458,0.000000,26.088236,46,4.125733,1.478935e+05,56420.5,-17435,8,...,0.000209,0.000209,0.021390,0.000209,0.000209,0.000209,0.000209,0.000209,0.000209,0.000209
zwelg,1.000000,0.000000,0.000000,1.000000,1,0.000000,-1.000000e+00,-1.0,-1,1,...,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000
zymmaster,5.640209,0.010444,0.000000,29.396826,99,4.455701,6.970164e+04,679.5,-800,13,...,0.000109,0.000109,0.000109,0.000109,0.000109,0.000109,0.000109,0.000109,0.000109,0.000109


In [25]:
print("Create pickle")
filepath = "aug_b5feat_label_original.pkl"
with open(filepath, "wb") as f:
    pickled = pickle.dumps(original_augdf, protocol=-1)
    f.write(pickled)

del f
del filepath
del pickled

Create pickle
