# Data augmentation

In [None]:
# Import packages

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

import random
random.seed(32)
from time import time, gmtime
from tqdm.notebook import tqdm

import pickle

In [None]:
# Load the comment data and the author profiles. The absolute paths point at
# the machine this notebook was written on.
df = pd.read_csv('/home/sophia/ma_py/pandora_bigfive.csv')
authors = pd.read_csv('/home/sophia/ma_py/author_profiles.csv')

# Keep only the author name and the five Big Five label columns, then drop
# every author that is missing a value for at least one of the five traits.
bigfive = authors[['author','agreeableness','openness','conscientiousness','extraversion','neuroticism']]
bigfive = bigfive.dropna(
    subset=['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
)
del authors

# Restrict the comment data to the authors that have all five labels.
authorlst = bigfive['author'].unique()
print(len(authorlst))
df = df[df.author.isin(authorlst)]
df

In [None]:
# deterministic

def augment_comments(df):
    """Create extra pseudo-authors by subsampling the rows of prolific authors.

    For every author in ``df`` with more than 100 rows, one new author is
    created per sampling fraction in ``np.arange(0.1, 1.0, 0.1)`` (nominally
    0.1 up to 0.9). Each new author gets a sample of the original author's
    rows, drawn without replacement with a fixed ``random_state=1``, and is
    named ``<author>_new<fraction>``.

    The fraction is formatted with ``str()``, so names may carry floating-point
    artefacts (e.g. ``_new0.30000000000000004``). This is kept as-is so that
    names stay compatible with data produced by earlier runs.

    Args:
        df: DataFrame with an ``author`` column. Presumably one row per
            comment; the rows are only sampled, never inspected.

    Returns:
        A new DataFrame holding the rows of ``df`` (original index) followed
        by all sampled rows. The index of each sample is reset, so index
        labels in the result are not unique.
    """
    t0 = time()
    originalauthors = df['author'].unique()
    currentn = len(originalauthors)
    values = np.arange(0.1, 1.0, 0.1, float)
    print("Number of authors at the beginning: ", currentn)

    # Collect all pieces and concatenate once at the end. DataFrame.append was
    # removed in pandas 2.0, and appending inside the loop copied the whole
    # (growing) frame on every iteration.
    frames = [df]
    for person in tqdm(originalauthors):
        oneauthordf = df.loc[df['author'] == person]
        if len(oneauthordf) <= 100:
            continue  # too few rows to subsample
        for number in values:
            newcomments = oneauthordf.sample(frac=number, replace=False, random_state=1)
            newcomments = newcomments.reset_index(drop=True)
            newcomments['author'] = person + '_new' + str(number)
            frames.append(newcomments)

    newdf = pd.concat(frames)

    newn = len(newdf['author'].unique())
    # Guard against an empty input frame (no authors at all).
    mul = newn / currentn if currentn else 0.0
    endtime = time() - t0
    printtime = endtime/3600
    print("\n\nAugmentation done in  %0.1fs" % endtime, ", in hours %0.1fh" % printtime,
          "\nNew number of authors: ", newn,
          ", Multiplication factor: ", mul)
    return newdf

# Run the augmentation on the filtered data, save the result as a pickle in
# the working directory, and show it as the cell output.
aug_df = augment_comments(df)
aug_df.to_pickle("pandora_b5_deter.pkl")
aug_df

## Read in preprocessed augmented data and correct trait scores for the new authors

In [None]:
# Read in the preprocessed, augmented data from its pickle file.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
filepath = "aug_b5feat.pkl"
with open(filepath, 'rb') as f:
    aug_df = pickle.load(f)
# Attach a label as a plain attribute.
# NOTE(review): presumably aug_df is a pandas DataFrame, in which case such an
# ad-hoc attribute is not carried over to copies or derived frames - confirm
# whether anything actually reads it.
aug_df.name = 'augmented_df'

In [None]:
# Load the collection of original author names from its pickle file.
filepath = "originalauthors.pkl"
with open(filepath, 'rb') as f:
    authors = pickle.load(f)

# Drop the temporary names from the notebook namespace.
del filepath, f

In [None]:
# Copy each original author's trait scores onto the fake (augmented) authors
# derived from that author.
#
# Fake authors are matched through the "<original>_new<suffix>" naming used by
# augment_comments. A bare prefix comparison is not enough: an author "bob"
# would also claim "bobby" and all of bobby's fake authors and overwrite their
# trait values.
# NOTE(review): assumes the index labels of aug_df are author-name strings that
# still follow this naming scheme after preprocessing - confirm.
originals = set(authors)

# Scan the index once and group the fake authors by their original author.
fakes_by_original = {}
for idx in aug_df.index:
    if idx in originals:
        continue  # an original author is never a fake, whatever its name contains
    base, marker, _ = idx.rpartition('_new')
    if marker and base in originals:
        fakes_by_original.setdefault(base, []).append(idx)

for original in tqdm(authors):
    res = fakes_by_original.get(original)
    if not res:
        continue  # no fake authors were generated for this author
    # Build a one-row frame with the original author's trait values whose
    # columns carry the same ('trait', <name>) MultiIndex as aug_df.
    # Raises KeyError if the original author is missing from aug_df.
    r = aug_df.loc[original, 'trait']
    r = pd.DataFrame(r)
    r = r.transpose()
    columns = r.columns.values
    # Use the real number of trait columns rather than a hard-coded 15.
    head = len(columns) * ['trait']
    r.columns = pd.MultiIndex.from_arrays([head, columns])
    # Copy the original author's row once per fake author ...
    rows = pd.concat([r] * len(res))
    # ... and label the copies with the fake authors' names.
    rows.index = res
    # Update aug_df in place so that every fake author takes over the trait
    # values of the original author (DataFrame.update aligns on index and
    # columns and only writes non-NA values).
    aug_df.update(rows)

In [None]:
# Persist the augmented data with corrected trait labels.
print("Create pickle")
filepath = "aug_b5feat_label.pkl"
# Stream straight into the file: pickle.dump avoids building a second, fully
# serialised copy of the object in memory, which dumps() + write() required.
# protocol=-1 selects the highest protocol available.
with open(filepath, "wb") as f:
    pickle.dump(aug_df, f, protocol=-1)

del f
del filepath

In [None]:
# Display the 'trait' columns of aug_df as the cell output.
aug_df['trait']

## Split df for cv: one version with only original authors, one with only new authors

In [None]:
# Keep only the rows whose index label is not one of the original authors,
# i.e. the newly generated authors.
new_augdf = aug_df.copy().query('index not in @authors')

In [None]:
# Persist the subset that contains only the newly generated authors.
print("Create pickle")
filepath = "aug_b5feat_label_new.pkl"
# Stream straight into the file: pickle.dump avoids building a second, fully
# serialised copy of the object in memory, which dumps() + write() required.
# protocol=-1 selects the highest protocol available.
with open(filepath, "wb") as f:
    pickle.dump(new_augdf, f, protocol=-1)

del f
del filepath

In [None]:
# Keep only the rows whose index label is one of the original authors, then
# show the result as the cell output.
original_augdf = aug_df.copy().query('index in @authors')
original_augdf

In [None]:
# Persist the subset that contains only the original authors.
print("Create pickle")
filepath = "aug_b5feat_label_original.pkl"
# Stream straight into the file: pickle.dump avoids building a second, fully
# serialised copy of the object in memory, which dumps() + write() required.
# protocol=-1 selects the highest protocol available.
with open(filepath, "wb") as f:
    pickle.dump(original_augdf, f, protocol=-1)

del f
del filepath