# Data augmentation

In [29]:
# Import packages

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
tqdm.pandas()
import datetime
import random
random.seed(32)
from time import time

In [2]:
# Import dataset with comments
df = pd.read_csv('/home/sophia/ma_py/pandora_bigfive.csv')

# Import dataset authors and delete not needed columns
authors = pd.read_csv('/home/sophia/ma_py/author_profiles.csv')
bigfive = authors[['author', 'mbti', 'agreeableness','openness','conscientiousness','extraversion','neuroticism']]
bigfive = bigfive[bigfive['agreeableness'].notna()]
uniondf = bigfive[bigfive['mbti'].notna()]

In [3]:
uniondf.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 402 entries, 7 to 10294
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   author             402 non-null    object 
 1   mbti               402 non-null    object 
 2   agreeableness      402 non-null    float64
 3   openness           397 non-null    float64
 4   conscientiousness  400 non-null    float64
 5   extraversion       401 non-null    float64
 6   neuroticism        401 non-null    float64
dtypes: float64(5), object(2)
memory usage: 25.1+ KB


In [4]:
uniondf = uniondf.reset_index(drop=True)
uniondf.head()

Unnamed: 0,author,mbti,agreeableness,openness,conscientiousness,extraversion,neuroticism
0,-BlitzN9ne,entp,50.0,85.0,15.0,50.0,30.0
1,-dyad-,infp,60.0,67.0,45.0,10.0,47.0
2,12345jk12345,estp,0.0,37.0,72.0,72.0,3.0
3,ACE_C0ND0R,intj,10.0,80.0,74.0,27.0,18.0
4,A_Bra_and_a_Ham,enfp,1.0,50.0,61.0,90.0,3.0


In [5]:
namelist = ['mbti_ei', 'mbti_ns', 'mbti_tf', 'mbti_jp']

for i in tqdm(range(len(namelist))):
    uniondf[namelist[i]] = uniondf['mbti'].apply(lambda x:[x[i]])
    uniondf[namelist[i]] = [item[0] for item in uniondf[namelist[i]]]

uniondf

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,author,mbti,agreeableness,openness,conscientiousness,extraversion,neuroticism,mbti_ei,mbti_ns,mbti_tf,mbti_jp
0,-BlitzN9ne,entp,50.0,85.0,15.0,50.0,30.0,e,n,t,p
1,-dyad-,infp,60.0,67.0,45.0,10.0,47.0,i,n,f,p
2,12345jk12345,estp,0.0,37.0,72.0,72.0,3.0,e,s,t,p
3,ACE_C0ND0R,intj,10.0,80.0,74.0,27.0,18.0,i,n,t,j
4,A_Bra_and_a_Ham,enfp,1.0,50.0,61.0,90.0,3.0,e,n,f,p
...,...,...,...,...,...,...,...,...,...,...,...
397,quakeroaks,enfp,3.0,85.0,19.0,50.0,92.0,e,n,f,p
398,rrgjl,intj,0.0,92.0,52.0,1.0,79.0,i,n,t,j
399,seldomvanilla,intj,21.0,96.0,86.0,32.0,0.0,i,n,t,j
400,turncloak471,intj,6.0,91.0,88.0,15.0,4.0,i,n,t,j


In [6]:
uniondf['mbti_ei'].replace('e', 1, inplace=True)
uniondf['mbti_ei'].replace('i', 0, inplace=True)

uniondf['mbti_ns'].replace('n', 1, inplace=True)
uniondf['mbti_ns'].replace('s', 0, inplace=True)

uniondf['mbti_tf'].replace('t', 1, inplace=True)
uniondf['mbti_tf'].replace('f', 0, inplace=True)

uniondf['mbti_jp'].replace('j', 1, inplace=True)
uniondf['mbti_jp'].replace('p', 0, inplace=True)

uniondf

Unnamed: 0,author,mbti,agreeableness,openness,conscientiousness,extraversion,neuroticism,mbti_ei,mbti_ns,mbti_tf,mbti_jp
0,-BlitzN9ne,entp,50.0,85.0,15.0,50.0,30.0,1,1,1,0
1,-dyad-,infp,60.0,67.0,45.0,10.0,47.0,0,1,0,0
2,12345jk12345,estp,0.0,37.0,72.0,72.0,3.0,1,0,1,0
3,ACE_C0ND0R,intj,10.0,80.0,74.0,27.0,18.0,0,1,1,1
4,A_Bra_and_a_Ham,enfp,1.0,50.0,61.0,90.0,3.0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
397,quakeroaks,enfp,3.0,85.0,19.0,50.0,92.0,1,1,0,0
398,rrgjl,intj,0.0,92.0,52.0,1.0,79.0,0,1,1,1
399,seldomvanilla,intj,21.0,96.0,86.0,32.0,0.0,0,1,1,1
400,turncloak471,intj,6.0,91.0,88.0,15.0,4.0,0,1,1,1


In [7]:
uniondf.to_pickle("uniondf.pkl")

In [8]:
authorlist = uniondf['author'].tolist()
authorlist.sort()
authorlist

['-BlitzN9ne',
 '-dyad-',
 '12345jk12345',
 '64BitCoffee',
 'ACE_C0ND0R',
 'A_Bra_and_a_Ham',
 'AbiRNormal',
 'Aiichai',
 'AmazonExplorer',
 'Ambedo_1',
 'AncientSwordRage',
 'Anon_Logic',
 'Aocast',
 'ArchPrime',
 'Astronomy1',
 'AuraofLight1',
 'Autodidact420',
 'Avosia',
 'BadgerKid96',
 'BigMcK_',
 'BlackBourgeoisBat',
 'BlackCombos',
 'Blehhh55',
 'Bobowo12',
 'Bonhand',
 'Brewer_Matt',
 'BubblesAndSass',
 'BubolKawaiiFace',
 'BurnedOutInAJar',
 'CLEMENTZ_',
 'CLENVENMETINS',
 'CaptainDudeGuy',
 'Captaindecius',
 'Cat-Nipped',
 'CaymanFifth',
 'Ciryher',
 'CleanDevelopment',
 'ClosetedAndQueer',
 'CmdBelial',
 'ColdOxygen',
 'ColinTheWicked',
 'Composer1989',
 'CoriOreo',
 'Corspin',
 'CoryTV',
 'Craig',
 'CrateredMoon',
 'Crups',
 'Crysicia',
 'Curt04',
 'Cynaria',
 'DSMan195276',
 'DakotaRayne',
 'Dangalf-',
 'DeeMI5I0',
 'DejWoSWK',
 'Denasmee',
 'Descending_',
 'DialMforMuffins',
 'Disrupturous',
 'Djzongreethesecond',
 'Dmanadatory',
 'Doc_Mharti',
 'DoctorMolotov',
 'Doeido'

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3103208 entries, 0 to 3103207
Data columns (total 17 columns):
 #   Column                Dtype  
---  ------                -----  
 0   author                object 
 1   author_flair_text     object 
 2   body                  object 
 3   downs                 float64
 4   created_utc           int64  
 5   subreddit_id          object 
 6   link_id               object 
 7   parent_id             object 
 8   score                 float64
 9   controversiality      int64  
 10  gilded                int64  
 11  id                    object 
 12  subreddit             object 
 13  ups                   float64
 14  word_count            int64  
 15  word_count_quoteless  int64  
 16  lang                  object 
dtypes: float64(3), int64(5), object(9)
memory usage: 402.5+ MB


In [10]:
# commentdf = df.apply(lambda x: [row for row in x if (x.author in authorlst)])
commentdf = df[df.author.isin(authorlist)]
commentdf.head()

Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,gilded,id,subreddit,ups,word_count,word_count_quoteless,lang
2802,-BlitzN9ne,[3w4... 4w3?] Genius billionaire playboy phila...,I'm currently in the middle of making a Payday...,0.0,1422166355,t5_2qoy3,t3_2thhzd,t1_cnz5rpc,3.0,0,0,co01mwf,entp,3.0,36,35,en
2803,-BlitzN9ne,[3w4] Genius billionaire playboy philanthropist.,"You are the hero we all need right now, thank ...",0.0,1423504286,t5_2qoy3,t3_2vam8x,t3_2vam8x,1.0,0,0,cog51p8,entp,1.0,13,13,en
2804,-BlitzN9ne,,The term you're looking for is 'virtue signall...,0.0,1449881503,t5_2s7yq,t3_3wd4k5,t1_cxvlqoo,3.0,0,0,cxvziyi,ImGoingToHellForThis,3.0,8,8,en
2805,-BlitzN9ne,Hi! How can I oppress you today?,I wish them luck with that one; I'm seriously ...,0.0,1455215679,t5_2vizz,t3_455vbj,t1_czw4rya,1.0,0,0,czwaf20,TumblrInAction,1.0,24,24,en
2806,-BlitzN9ne,DM,Hey man! I love the heck out of the work you d...,0.0,1468271510,t5_2r9ei,t3_4sc55k,t3_4sc55k,2.0,0,0,d58dn12,DnD,2.0,111,109,en


In [11]:
newauthorlist = commentdf['author'].tolist()
newauthorlist = list(set(newauthorlist))

newauthorlist.sort()
print(authorlist == newauthorlist)

True


In [12]:
commentdf.to_pickle("comments_uniondf.pkl")

In [65]:
counts = commentdf['author'].value_counts()
print(type(counts))
counts['-BlitzN9ne']

<class 'pandas.core.series.Series'>


2825

In [20]:
commentdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1098737 entries, 2802 to 3103207
Data columns (total 17 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   author                1098737 non-null  object 
 1   author_flair_text     524438 non-null   object 
 2   body                  1098734 non-null  object 
 3   downs                 382409 non-null   float64
 4   created_utc           1098737 non-null  int64  
 5   subreddit_id          1098737 non-null  object 
 6   link_id               1098737 non-null  object 
 7   parent_id             1098737 non-null  object 
 8   score                 1098729 non-null  float64
 9   controversiality      1098737 non-null  int64  
 10  gilded                1098737 non-null  int64  
 11  id                    1098737 non-null  object 
 12  subreddit             1098737 non-null  object 
 13  ups                   564756 non-null   float64
 14  word_count            1098737 n

In [23]:
# tests
newrows = commentdf.loc[commentdf['author'] == '-BlitzN9ne']
print(type(newrows))
newrows
newrows['author'] = '-BlitzN9ne' + '_new8'
newrows
testdf = commentdf
testdf =  testdf.append(newrows, ignore_index=True)
testdf.info()

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1101562 entries, 0 to 1101561
Data columns (total 17 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   author                1101562 non-null  object 
 1   author_flair_text     525702 non-null   object 
 2   body                  1101559 non-null  object 
 3   downs                 383923 non-null   float64
 4   created_utc           1101562 non-null  int64  
 5   subreddit_id          1101562 non-null  object 
 6   link_id               1101562 non-null  object 
 7   parent_id             1101562 non-null  object 
 8   score                 1101554 non-null  float64
 9   controversiality      1101562 non-null  int64  
 10  gilded                1101562 non-null  int64  
 11  id                    1101562 non-null  object 
 12  subreddit             1101562 non-null  object 
 13  ups                   567101 non-null   float64
 

In [55]:
len(set(commentdf['author']))

402

In [63]:
def augment_data(df, total):
    newdf = df
    lst = []
    i = 1
    t0 = time()
    currentn = len(set(df['author']))
    print("Number of authors at the beginning ", currentn)
    while currentn < total:
        number = random.randint(0, len(df))
        sampleauthor = df.iloc[number]['author']
        allrows = df.loc[df['author'] == sampleauthor]
        if len(allrows) > 100:
            fraction = round(random.uniform(0.1, 0.9), 1)
            newrows = allrows.sample(frac=fraction)
            # due to this new naming if the same author and fraction would be chosen again the former will be
            # overwritten hence there cannot be two identical authors
            newrows['author'] = sampleauthor + '_new' + str(fraction)
            newdf = newdf.append(newrows, ignore_index=True)
            currentn+=1
            if i == 1:
                print("First iteration done in %0.1fs" % (time() - t0))
            i+=1
            if i % 100 == 0:
                print("Iteration", i, ", time needed so far:  %0.1fs" % (time() - t0))
    print("Augmentation done in  %0.1fs" % (time() - t0), ", Total iterations: ", i, 
          "\nNew number of authors: ", len(set(newdf['author'])), 
          ", Multiplication factor: ", str(len(set(newdf['author']))/len(set(df['author']))))
    return newdf

aug_df = augment_data(commentdf, 10000)
aug_df.to_pickle("aug_comments_uniondf.pkl")
aug_df

Number of authors at the beginning  402
First iteration done in 0.8s
Iteration 100 time needed so far:  197.4s
Iteration 200 time needed so far:  511.2s
Iteration 300 time needed so far:  976.5s
Iteration 400 time needed so far:  1616.2s
Iteration 500 time needed so far:  2410.5s
Iteration 600 time needed so far:  3342.0s
Iteration 700 time needed so far:  4372.2s
Iteration 800 time needed so far:  5105.4s
Iteration 900 time needed so far:  5494.7s
Iteration 1000 time needed so far:  5921.8s
Iteration 1100 time needed so far:  6393.4s
Iteration 1200 time needed so far:  6913.3s
Iteration 1300 time needed so far:  7469.0s
Iteration 1400 time needed so far:  8062.8s
Iteration 1500 time needed so far:  8692.7s
Iteration 1600 time needed so far:  9347.5s
Iteration 1700 time needed so far:  10036.2s
Iteration 1800 time needed so far:  10766.3s
Iteration 1900 time needed so far:  11533.7s
Iteration 2000 time needed so far:  12344.1s
Iteration 2100 time needed so far:  13183.6s
Iteration 2200

Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,gilded,id,subreddit,ups,word_count,word_count_quoteless,lang
0,-BlitzN9ne,[3w4... 4w3?] Genius billionaire playboy phila...,I'm currently in the middle of making a Payday...,0.0,1422166355,t5_2qoy3,t3_2thhzd,t1_cnz5rpc,3.0,0,0,co01mwf,entp,3.0,36,35,en
1,-BlitzN9ne,[3w4] Genius billionaire playboy philanthropist.,"You are the hero we all need right now, thank ...",0.0,1423504286,t5_2qoy3,t3_2vam8x,t3_2vam8x,1.0,0,0,cog51p8,entp,1.0,13,13,en
2,-BlitzN9ne,,The term you're looking for is 'virtue signall...,0.0,1449881503,t5_2s7yq,t3_3wd4k5,t1_cxvlqoo,3.0,0,0,cxvziyi,ImGoingToHellForThis,3.0,8,8,en
3,-BlitzN9ne,Hi! How can I oppress you today?,I wish them luck with that one; I'm seriously ...,0.0,1455215679,t5_2vizz,t3_455vbj,t1_czw4rya,1.0,0,0,czwaf20,TumblrInAction,1.0,24,24,en
4,-BlitzN9ne,DM,Hey man! I love the heck out of the work you d...,0.0,1468271510,t5_2r9ei,t3_4sc55k,t3_4sc55k,2.0,0,0,d58dn12,DnD,2.0,111,109,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66946444,LaV-Man_new0.3,INTP,Yeah I kind of do that. I hate sleeping on fo...,0.0,1436964170,t5_2qhvl,t3_3d71ze,t1_ct3dzn9,2.0,0,0,ct42kqd,INTP,2.0,26,25,en
66946445,LaV-Man_new0.3,,In the world? Because that includes you.Are y...,,1542225982,t5_2qie6,t3_9rg318,t1_e9p64b5,1.0,0,0,e9p98c4,mensa,,40,38,en
66946446,LaV-Man_new0.3,,"It's not, ""I don't care about you enough to ju...",,1448940410,t5_2ve1u,t3_3uvfiv,t1_cxinfqv,2.0,0,0,cxiq1gt,TheRedPill,2.0,234,229,en
66946447,LaV-Man_new0.3,,Well shit... me too.,0.0,1432733215,t5_31llc,t3_37ckby,t1_crmg5qr,-1.0,0,0,crmg7qb,TexasCHL,-1.0,4,4,it


In [53]:
newlist = len(set(aug_df['author']))
print(newlist)

548


In [59]:
405/402

1.007462686567164

In [64]:
smallaug_df = augment_data(commentdf, 1000)
smallaug_df.to_pickle("smallaug_comments_uniondf.pkl")
smallaug_df

Number of authors at the beginning  402
First iteration done in 0.3s
Iteration 100 time needed so far:  72.0s
Iteration 200 time needed so far:  182.7s
Iteration 300 time needed so far:  331.5s
Iteration 400 time needed so far:  519.5s
Iteration 500 time needed so far:  753.1s
Augmentation done in  1021.1s , Total iterations:  599 
New number of authors:  838 , Multiplication factor:  2.084577114427861


Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,gilded,id,subreddit,ups,word_count,word_count_quoteless,lang
0,-BlitzN9ne,[3w4... 4w3?] Genius billionaire playboy phila...,I'm currently in the middle of making a Payday...,0.0,1422166355,t5_2qoy3,t3_2thhzd,t1_cnz5rpc,3.0,0,0,co01mwf,entp,3.0,36,35,en
1,-BlitzN9ne,[3w4] Genius billionaire playboy philanthropist.,"You are the hero we all need right now, thank ...",0.0,1423504286,t5_2qoy3,t3_2vam8x,t3_2vam8x,1.0,0,0,cog51p8,entp,1.0,13,13,en
2,-BlitzN9ne,,The term you're looking for is 'virtue signall...,0.0,1449881503,t5_2s7yq,t3_3wd4k5,t1_cxvlqoo,3.0,0,0,cxvziyi,ImGoingToHellForThis,3.0,8,8,en
3,-BlitzN9ne,Hi! How can I oppress you today?,I wish them luck with that one; I'm seriously ...,0.0,1455215679,t5_2vizz,t3_455vbj,t1_czw4rya,1.0,0,0,czwaf20,TumblrInAction,1.0,24,24,en
4,-BlitzN9ne,DM,Hey man! I love the heck out of the work you d...,0.0,1468271510,t5_2r9ei,t3_4sc55k,t3_4sc55k,2.0,0,0,d58dn12,DnD,2.0,111,109,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5296829,PM_ME_MY_JUNG_TYPE_new0.1,ENFP,"Yes, I know all about that. However, this ove...",,1548831308,t5_2s90r,t3_al073w,t1_efblgfu,2.0,0,0,efc4zkk,mbti,,163,162,en
5296830,PM_ME_MY_JUNG_TYPE_new0.1,,"Ahhh fuck, it's me. Fuck. Yes, and it's gross,...",,1554357835,t5_2r4yi,t3_b97jjf,t3_b97jjf,5.0,0,0,ek30zic,ENFP,,17,17,en
5296831,PM_ME_MY_JUNG_TYPE_new0.1,ENFP,It's hard. I can more say what he's not versus...,,1526791584,t5_2s90r,t3_8ko87h,t1_dz9ieoh,2.0,0,0,dz9qujx,mbti,,24,24,en
5296832,PM_ME_MY_JUNG_TYPE_new0.1,ENTP,"In case you haven't caught on, I only make fun...",,1509329916,t5_2s90r,t3_79hnnb,t1_dp2qxi7,1.0,0,0,dp2ripe,mbti,,159,159,en
