In [1]:
import numpy as np
import pandas as pd
import os
import sys
import random, string
import nltk
from sklearn.utils import shuffle

In [29]:
# Relative path five directory levels above the notebook's working directory.
root = '../' * 5

In [30]:
# Make modules under `root` importable. The membership check keeps re-runs of
# this cell from stacking duplicate entries on sys.path.
if root not in sys.path:
    sys.path.append(root)

In [31]:
# Folder that holds the pipe-separated train/val/test CSV files read below.
data = os.path.join(
    root,
    'resources', 'data',
    'TC', 'social_rom',
)

In [32]:
# Load the three pipe-separated splits in one pass; order matches the unpacking.
train, val, test = (
    pd.read_csv(os.path.join(data, csv_name), sep='|')
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

In [33]:
# (rows, columns) of each split, in train / val / test order.
tuple(frame.shape for frame in (train, val, test))

((8920, 13), (785, 13), (737, 13))

In [34]:
def create_dir(d):
    """Ensure that directory ``d`` exists, creating missing parents as needed.

    Calling it on a directory that already exists is a no-op, so the cell can
    be re-run safely.

    Parameters
    ----------
    d : str or os.PathLike
        Path of the directory to create.

    Raises
    ------
    FileExistsError
        If ``d`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) replaces the check-then-mkdir sequence: it has no
    # window between the existence test and the creation, and it also builds
    # intermediate directories that a plain os.mkdir would fail on.
    os.makedirs(d, exist_ok=True)

In [40]:
def process(df, out_dir, seed=None, mode='a'):
    """Filter, tokenize and relabel a sentiment split, then write it as text files.

    Rows are kept only when ``Domain_Relevance`` is not 0 and ``Sentiment`` is
    one of ``'n'``, ``'0'`` or ``'p'``; rows repeating an earlier ``Sentence``
    are dropped. Every remaining sentence is tokenized with
    ``nltk.word_tokenize`` and re-joined with single spaces, the sentiment is
    mapped ``'n' -> '0'``, ``'0' -> '1'``, ``'p' -> '2'``, and a random
    16-character alphanumeric ID is generated per row. The rows are then
    shuffled and written, one value per line and in the same row order, to
    ``sentences.txt``, ``labels.txt`` and ``ID.txt`` inside ``out_dir``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``Domain_Relevance``, ``Sentiment`` and
        ``Sentence``. The caller's frame is not modified.
    out_dir : str
        Existing directory that receives the three output files.
    seed : int or None, optional
        Seeds both the ID generator and the shuffle so a run can be
        reproduced. ``None`` (default) keeps the output non-deterministic.
    mode : {'a', 'w'}, optional
        File open mode. The default ``'a'`` appends, so calling the function
        twice for the same ``out_dir`` writes the rows twice; pass ``'w'`` to
        overwrite instead.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'a'`` nor ``'w'``.
    """
    if mode not in ('a', 'w'):
        raise ValueError("mode must be 'a' or 'w', got {!r}".format(mode))

    label_map = {'n': '0', '0': '1', 'p': '2'}
    id_alphabet = string.ascii_letters + string.digits
    # Private generator: seeding it does not disturb the global `random` state.
    rng = random.Random(seed)

    relevant = df['Domain_Relevance'] != 0
    labelled = df['Sentiment'].isin(list(label_map))
    df = df[relevant & labelled]
    # Explicit copy so the column assignments below act on an independent
    # frame instead of a slice (avoids pandas' SettingWithCopyWarning).
    df = df[~df.duplicated(['Sentence'])].copy()

    df['sentence_tokenized'] = df['Sentence'].apply(nltk.word_tokenize)
    df['sentence_tokenized_joined'] = df['sentence_tokenized'].apply(' '.join)
    df['label_new'] = df['Sentiment'].map(label_map)
    df['ID'] = [''.join(rng.choices(id_alphabet, k=16)) for _ in range(len(df))]
    df = shuffle(df, random_state=seed)

    # All three files are written from the same shuffled frame, so line i of
    # each file refers to the same row.
    outputs = (
        ('sentences.txt', 'sentence_tokenized_joined'),
        ('labels.txt', 'label_new'),
        ('ID.txt', 'ID'),
    )
    for file_name, column in outputs:
        with open(os.path.join(out_dir, file_name), mode, encoding='utf-8') as handle:
            np.savetxt(handle, df[column].values, fmt='%s')

In [41]:
# Parent directory for the per-split output folders created in the cells below.
data_dir = os.path.join(root, 'src', 'tc', 'data', 'organic')
# The directory must exist before the split sub-directories are created in it.
create_dir(data_dir)

In [42]:
# TRAIN: write the processed training split to <data_dir>/train.
# NOTE: process() opens its output files in append mode by default, so
# re-running this cell adds the rows to the existing files a second time.
out_dir = os.path.join(data_dir, 'train')
create_dir(out_dir)
process(train, out_dir)

In [43]:
# VAL: write the processed validation split to <data_dir>/val.
# NOTE: process() opens its output files in append mode by default, so
# re-running this cell adds the rows to the existing files a second time.
out_dir = os.path.join(data_dir, 'val')
create_dir(out_dir)
process(val, out_dir)

In [44]:
# TEST: write the processed test split to <data_dir>/test.
# NOTE: process() opens its output files in append mode by default, so
# re-running this cell adds the rows to the existing files a second time.
out_dir = os.path.join(data_dir, 'test')
create_dir(out_dir)
process(test, out_dir)