In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so later
# cells can read and write files stored on the Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import locale


def getpreferredencoding(do_setlocale = True):
    """Stand-in for ``locale.getpreferredencoding`` that always answers "UTF-8".

    ``do_setlocale`` is accepted only so the signature matches the stdlib
    function; its value is ignored.
    """
    forced_encoding = "UTF-8"
    return forced_encoding


# Monkey-patch the stdlib hook so every later lookup reports UTF-8.
locale.getpreferredencoding = getpreferredencoding

In [3]:
# Switch the working directory to the thesis root on the mounted Drive.
# NOTE(review): the following cell changes to an absolute path beneath this
# one, so this step looks redundant — confirm before removing.
%cd "/content/drive/MyDrive/01 - Thesis/"

/content/drive/MyDrive/01 - Thesis


In [4]:
# Work from the data folder so the relative './...' paths used by the
# read_csv / to_csv cells below resolve against it.
%cd "/content/drive/MyDrive/01 - Thesis/1000 - Data"

/content/drive/MyDrive/01 - Thesis/1000 - Data


In [5]:
import string
import pandas as pd
import tqdm.notebook as tq
import numpy as np
from IPython.utils import io
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import re
import html

# Loading data

In [6]:
# Load the books / online-resources sentences (tab-separated, UTF-8)
# and report how many rows were read.
df_books_unclean = pd.read_csv(
    './01 - Books and Online Resources.tsv',
    sep='\t',
    encoding='utf-8',
)

print(f'Sentences count:               {len(df_books_unclean)}')

Sentences count:               4663


In [7]:
# Renumber the rows 0..n-1, discarding the previous index.
df_books_unclean = df_books_unclean.reset_index(drop=True)

In [8]:
# Stack the two sentence columns into one long list:
# every 'non-distorted' entry first, then every 'distorted' entry.
row_count = len(df_books_unclean)
sentences = [
    *df_books_unclean['non-distorted'].tolist(),
    *df_books_unclean['distorted'].tolist(),
]

# One row per sentence, with a text label and its 0/1 encoding
# (0 = non-distorted, 1 = distorted) in the same order as `sentences`.
df_books_online_resources = pd.DataFrame({
    'sentence': sentences,
    'label': ['non-distorted'] * row_count + ['distorted'] * row_count,
    'encoded_label': [0] * row_count + [1] * row_count,
})

In [9]:
# Discard every row that contains a NaN, then renumber the index from zero.
df_books_online_resources = (
    df_books_online_resources
    .dropna()
    .reset_index(drop=True)
)

In [10]:
# Class balance after dropping NaN rows (0 = non-distorted, 1 = distorted).
df_books_online_resources['encoded_label'].value_counts()

0    4663
1     991
Name: encoded_label, dtype: int64

In [11]:
# Load the labeled therapistQA sentences (tab-separated, UTF-8)
# and report how many rows were read.
df_therapistQA = pd.read_csv(
    './02 - therapistQA_labeled.tsv',
    sep='\t',
    encoding='utf-8',
)

print(f'Sentences count:               {len(df_therapistQA)}')

Sentences count:               2150


In [12]:
# Preview the first five rows to check the columns and the label encoding.
df_therapistQA.head(5)

Unnamed: 0,sentence,label,encoded_label
0,it was only within the past 2 years that my cr...,distorted,1
1,i feel it since i was kid and until now it's g...,distorted,1
2,probably because i hate myself for how arrogan...,distorted,1
3,is this simply my adhd causing memory problems...,distorted,1
4,"nothing matters to me, genocides, war, politic...",distorted,1


In [13]:
# Stack both labeled sources into a single corpus; ignore_index=True gives
# the combined frame a fresh 0..n-1 index.
df_corpus_labeled = pd.concat([df_books_online_resources, df_therapistQA], ignore_index=True)

# Clean the data

In [14]:
# Helpers and function that normalize quotes, casing and end punctuation.

# Entity spellings that can still be present after a single html.unescape()
# pass, e.g. when the source was escaped twice ("&amp;rsquo;" -> "&rsquo;").
_SINGLE_QUOTE_ENTITIES = (
    "&#8216;", "&#8217;", "&#8218;", "&#8219;", "&#8242;", "&#8245;",
    "&#0145;", "&#0146;", "&#39;", "&#039;",
    "&lsquo;", "&rsquo;", "&sbquo;", "&apos;", "&prime;",
    "&lsquor;", "&rsquor;",
)
_DOUBLE_QUOTE_ENTITIES = (
    "&#8220;", "&#8221;",
    "&ldquo;", "&rdquo;", "&bdquo;", "&quot;", "&ldquor;", "&rdquor;",
)

# Typographic quote characters (what html.unescape() turns the entities above
# into) mapped to their straight ASCII equivalents.
_QUOTE_TRANSLATION = str.maketrans({
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
    "\u201a": "'",  # single low-9 quotation mark
    "\u201b": "'",  # single high-reversed-9 quotation mark
    "\u2032": "'",  # prime
    "\u2035": "'",  # reversed prime
    "\u201c": '"',  # left double quotation mark
    "\u201d": '"',  # right double quotation mark
    "\u201e": '"',  # double low-9 quotation mark
})

# Punctuation marks that are accepted as the end of a sentence.
_SENTENCE_ENDINGS = (".", "?", "!")


def clean_text(text):
    """Normalize quotes, casing and final punctuation of one sentence.

    Steps, in order:
      1. Decode HTML entities with ``html.unescape``.
      2. Replace the ``<SQUOTE>`` / ``<DQUOTE>`` placeholders and any quote
         entities that survived step 1 with straight quotes.
      3. Replace typographic (curly, low-9, prime) quote characters with
         straight quotes.
      4. Lower-case the text.
      5. Make the sentence end with ``.``, ``?`` or ``!``: append ``.`` when
         the last character is not ASCII punctuation, or swap the last
         character for ``.`` when it is some other punctuation mark.

    Parameters
    ----------
    text : str
        The raw sentence. Non-string values (``None``, NaN, ...) are returned
        unchanged so a later ``dropna()`` can remove them instead of this
        function raising.

    Returns
    -------
    str
        The cleaned sentence. An empty input stays empty.
    """
    # Missing values reach this function as float NaN / None; leave them alone.
    if not isinstance(text, str):
        return text

    text = html.unescape(text)

    # Dataset-specific placeholders for quotes.
    text = text.replace('<SQUOTE>', "'").replace('<DQUOTE>', '"')

    # Entities left over after unescaping (double-escaped input).
    for entity in _SINGLE_QUOTE_ENTITIES:
        text = text.replace(entity, "'")
    for entity in _DOUBLE_QUOTE_ENTITIES:
        text = text.replace(entity, '"')

    # html.unescape() produced Unicode quote characters; flatten them to ASCII.
    text = text.translate(_QUOTE_TRANSLATION)

    text = text.lower()

    # Nothing to terminate; also avoids indexing into an empty string.
    if not text:
        return text

    last_char = text[-1]
    if last_char not in string.punctuation:
        # No closing punctuation at all: add a period.
        return text + "."
    if last_char not in _SENTENCE_ENDINGS:
        # Ends with punctuation that does not close a sentence: swap it.
        return text[:-1] + "."
    return text

In [15]:
# Run clean_text over every entry of the 'sentence' column to normalize
# quotes, casing and end-of-sentence punctuation.
df_corpus_labeled['sentence'] = df_corpus_labeled['sentence'].apply(clean_text)

In [16]:
# Discard every row that contains a NaN, then renumber the index from zero.
df_corpus_labeled = (
    df_corpus_labeled
    .dropna()
    .reset_index(drop=True)
)

In [17]:
# Class balance of the combined, cleaned corpus (encoded_label 0 vs 1).
df_corpus_labeled['encoded_label'].value_counts()

0    5366
1    2438
Name: encoded_label, dtype: int64

# Splitting Train and Test dataset

In [None]:
# Hold out 25% of the corpus as the test set. The split is stratified on the
# text label and seeded with random_state=42.
train_data, test_data = train_test_split(
    df_corpus_labeled,
    stratify=df_corpus_labeled["label"].values,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Rebuild the train and test splits as DataFrames with the corpus column layout.
# NOTE(review): train_test_split presumably already returns DataFrames when it
# is given one, which would make this re-wrap a no-op — confirm before simplifying.
train_df = pd.DataFrame(train_data, columns=df_corpus_labeled.columns)
test_df = pd.DataFrame(test_data, columns=df_corpus_labeled.columns)

In [None]:
# Report the number of rows that ended up in each split.
print(f'the sentences count of train dataset: {len(train_df)}')
print(f'the sentences count of test dataset:  {len(test_df)}')

the sentences count of train dataset: 5853
the sentences count of test dataset:  1951


In [None]:
# Class balance of the held-out test split, by text label.
test_df['label'].value_counts()

non-distorted    1342
distorted         609
Name: label, dtype: int64

# Storing Combined dataset

In [None]:
# Persist the full cleaned corpus as a tab-separated file; index=False keeps
# the row index out of the file.
df_corpus_labeled.to_csv('./03 - corpus_labeled.tsv',sep='\t',index=False)

# Storing training and testing dataset

In [None]:
# Persist the two splits as tab-separated files; index=False keeps the row
# index out of the files.
# NOTE(review): none of the visible cells create './Train and Test Data/';
# presumably the folder must already exist on the Drive — confirm.
train_df.to_csv('./Train and Test Data/train_sentences.tsv',sep='\t',index=False)
test_df.to_csv('./Train and Test Data/test_sentences.tsv',sep='\t',index=False)