In [1]:
import pandas as pd
from tqdm import tqdm
import src.utils as utils

In [2]:
# Stream the CSV in fixed-size chunks so the whole file is never loaded at once,
# collecting the processed comment text and its rating into two parallel lists.
SEED = 42  # fixed seed so the per-chunk shuffle is reproducible on a fresh kernel
chunks = pd.read_csv("comments_trustpilot_v2.csv",
                     usecols=['comment', 'rating'],
                     chunksize=50000)
texts = []
labels = []
for chunk_index, df_chunk in enumerate(tqdm(chunks)):
    # `sample` already returns a new frame, so no defensive copy is needed.
    # The seed is offset by the chunk index so every chunk gets its own,
    # yet repeatable, permutation. Rows without a comment are dropped.
    aux_df = (df_chunk
              .sample(frac=1, random_state=SEED + chunk_index)
              .dropna(subset=['comment']))
    # Keep only comments longer than one character.
    aux_df = aux_df[aux_df['comment'].map(len) > 1]
    # `.assign` builds a new frame instead of writing a column into a filtered
    # slice, which is what triggers pandas' SettingWithCopyWarning.
    # NOTE(review): `utils.process_text(['lower'], text)` is project code not
    # visible here; presumably it lower-cases the text -- confirm.
    aux_df = aux_df.assign(
        processed_text=aux_df['comment'].map(
            lambda text: utils.process_text(['lower'], text)))
    texts.extend(aux_df['processed_text'].tolist())
    labels.extend(aux_df['rating'].tolist())

14it [00:04,  3.04it/s]


In [7]:
# Fold the five rating values into three class ids:
# 1 and 2 -> 0, 3 -> 1, 4 and 5 -> 2. Any other value raises KeyError.
# Note: this rebinds `labels`, so running the cell a second time fails
# (the already-mapped value 0 is not a key of the table).
rating_to_class = {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}
labels = [rating_to_class[rating] for rating in labels]

In [8]:
from collections import Counter
import numpy as np

In [9]:
# Balance the classes by down-sampling: every class keeps only its first
# `count_minority` examples (in current list order), where `count_minority`
# is the size of the least frequent class.
counter = Counter(labels)
keys = list(counter.keys())
values = list(counter.values())
count_minority = np.min(values)
per_class_limit = int(count_minority)

# One pass over the data: route each (text, label) pair to its class bucket
# and stop filling a bucket once it holds `per_class_limit` items. This avoids
# re-scanning the full corpus twice for every class.
kept_by_class = {key: ([], []) for key in keys}
for text, label in zip(texts, labels):
    kept_texts, kept_labels = kept_by_class[label]
    if len(kept_texts) < per_class_limit:
        kept_texts.append(text)
        kept_labels.append(label)

# Concatenate the buckets class by class, in the order classes first appeared.
balanced_labels = []
balanced_texts = []
for key in keys:
    kept_texts, kept_labels = kept_by_class[key]
    balanced_texts += kept_texts
    balanced_labels += kept_labels

# Rebind the working lists to the balanced versions used by later cells.
texts = balanced_texts
labels = balanced_labels

In [10]:
# Sanity check: display how many examples each class currently has in `labels`.
Counter(labels)

Counter({2: 24563, 0: 24563, 1: 24563})

In [11]:
# Per-sample weights: each example is weighted by the inverse of the number
# of examples sharing its label (1 / class count).
class_counts = Counter(labels)
# `counter` ends up as a plain dict mapping label -> inverse class frequency.
counter = {label: 1 / count for label, count in class_counts.items()}
sample_weights = np.array([counter[label] for label in labels])

In [12]:
# Display the number of per-sample weights.
len(sample_weights)

73689

In [13]:
# Display the weight array itself.
sample_weights

array([4.07116395e-05, 4.07116395e-05, 4.07116395e-05, ...,
       4.07116395e-05, 4.07116395e-05, 4.07116395e-05])

In [14]:
# Class names are the distinct label values, sorted, rendered as strings.
class_names = [str(label) for label in sorted(set(labels))]

In [15]:
# Display the list of class names.
class_names

['0', '1', '2']

In [16]:
# Character-level encoding configuration.
# `vocabulary` is the set of characters that receive a one-hot row;
# a character's position in this string is its index.
vocabulary="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}"
# Derived from the vocabulary rather than hard-coded, so the one-hot width
# can never drift out of sync if the vocabulary string is edited (currently 69).
number_of_characters=len(vocabulary)
max_length=150
# Row i of the identity matrix is the one-hot vector for vocabulary[i].
identity_mat=np.identity(number_of_characters)

In [19]:
# One-hot encode the first text, reading its characters in reverse order.
# Characters that are not in `vocabulary` are skipped, not encoded.
raw_text = texts[0]
one_hot_rows = []
for char in reversed(raw_text):
    position = vocabulary.find(char)  # -1 when the character is not in the vocabulary
    if position != -1:
        one_hot_rows.append(identity_mat[position])
data = np.array(one_hot_rows, dtype=np.float32)

In [29]:
# Display the characters of `raw_text` (in reverse order) that are NOT in
# `vocabulary`, i.e. the ones the one-hot encoding skips.
# NOTE(review): the saved output of this cell lists characters such as 'm'
# and ' ' that are in `vocabulary`, so it looks stale -- re-run to confirm.
np.array([char for char in reversed(raw_text) if char not in vocabulary])

array(['.', 'm', 'e', 'h', 't', ' ', 'f', 'o', ' ', 'y', 'n', 'a', ' ',
       'e', 's', 'o', 'l', ' ', 't', '’', 'n', 'o', 'd', ' ', 'i', ' ',
       'e', 'p', 'o', 'h', ' ', 'i', ' ', 'd', 'n', 'a', ' ', 'r', 'e',
       'v', 'e', 'w', 'o', 'h', ' ', 'p', 'i', 'r', 't', ' ', 'r', 'i',
       'e', 'h', 't', ' ', 'm', 'o', 'r', 'f', ' ', 'd', 'e', 's', 's',
       'e', 'r', 't', 's', ' ', 'y', 'r', 'e', 'v', ' ', 'e', 'r', 'e',
       'w', ' ', 'y', 'e', 'h', 't', ' ', '.', 'h', 's', 'i', 'f', ' ',
       's', 'u', 'c', 's', 'i', 'd', ' ', 'w', 'e', 'n', ' ', 'y', 'm',
       ' ', 'e', 'v', 'o', 'l', ' ', 'r', 'e', 'd', 'r', 'o', ' ', 'y',
       'm', ' ', 'h', 't', 'i', 'w', ' ', 'y', 'p', 'p', 'a', 'h', ' ',
       'y', 'l', 'l', 'a', 'e', 'r'], dtype='<U1')

In [28]:
# Display the identity matrix.
identity_mat

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])