In [1]:
import pandas as pd

In [2]:
# Mount Google Drive inside the Colab runtime; every dataset path used
# further down in this notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Integer class id assigned to each subreddit name.
# The ids follow the case-insensitive alphabetical order of the names
# (adhd=1, anxiety=2, ..., schizophrenia=8).
label = dict(
    ptsd=7,
    EDAnonymous=6,
    adhd=1,
    depression=5,
    schizophrenia=8,
    bpd=4,
    bipolarreddit=3,
    anxiety=2,
)

In [4]:
# Load the combined dataset from the mounted Drive.
# Later cells read its 'post' (text) and 'label' (class id) columns.
combined = pd.read_csv(
    '/content/drive/Shareddrives/DL + NLP/Proyecto DL + NLP/Entrega_Final/data/raw/combined_balanced.csv',
)

# Limpieza de todos los posts del dataset combinado

In [5]:
import re

In [6]:
# Patterns for the fragments that the cleaning step strips from each post.

# @username style mentions.
mention_regex = r"@[A-Za-z0-9_]+"

# Bare http(s) links, consumed up to the next whitespace character.
# The previous character class ([A-Za-z0-9./]) stopped at the first '?', '=',
# '-', '_', '&' or '%', so query strings and slugs were left behind in the text.
link_regex = r"https?://\S+"

# #hashtag tokens.
hashtag_regex = r"#[A-Za-z0-9_]+"

# Emoji and pictograph code-point ranges.
emoji_regex = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols and pictographs
         u"\U0001F680-\U0001F6FF"  # transport and miscellaneous symbols
         u"\U0001F1E0-\U0001F1FF"  # flags
         u"\U00002702-\U000027B0"  # miscellaneous symbols
         u"\U000024C2-\U0001F251"  # wide catch-all: contains the two ranges just above and also large non-emoji areas (e.g. CJK ideographs)
         "]+", flags=re.UNICODE)

# Anything that is NOT an ASCII letter, digit, whitespace or one of - . ? ! ,
special_chars_regex = r"[^a-zA-Z0-9\s\-\.\?\!\,]+"

# Markdown inline link "[label](target)": group 1 is the label, group 2 the target.
url_regex = r"\[(.*?)\]\((.*?)\)"

In [7]:
# Cleaning routine that applies the patterns defined in the previous cell
def clean_text(text):
    """Strip links, mentions, hashtags, emojis and special characters from a post.

    The substitutions run in this order:

    1. Markdown inline links ``[label](target)`` are collapsed to ``label``
       using ``url_regex``. This has to happen first: the later steps delete
       the brackets and the ``http...`` part, after which the link can no
       longer be recognised and pieces of the target stay in the text.
    2. ``@mentions`` (``mention_regex``).
    3. Bare ``http(s)://`` links (``link_regex``).
    4. ``#hashtags`` (``hashtag_regex``).
    5. Emojis (``emoji_regex``).
    6. Every run of characters matched by ``special_chars_regex``.

    Args:
        text: Raw post text. Anything that is not a ``str`` (for example the
            float NaN pandas uses for an empty CSV cell) is treated as an
            empty post instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Keep the visible label of Markdown links, drop the target
    text = re.sub(url_regex, r"\1", text)
    # Remove mentions
    text = re.sub(mention_regex, "", text)
    # Remove links
    text = re.sub(link_regex, "", text)
    # Remove hashtags
    text = re.sub(hashtag_regex, "", text)
    # Remove emojis
    text = emoji_regex.sub(r'', text)
    # Remove special characters
    text = re.sub(special_chars_regex, "", text)
    return text

In [8]:
# Run clean_text over every post and store the cleaned text back in the 'post' column
combined['post'] = combined['post'].map(clean_text)

In [9]:
# Extra cleaning step for the stray token "amp" that was spotted in the anxiety corpus
def remove_amp(text):
    """Delete the standalone word "amp" (any casing) and normalise whitespace.

    Only whole-word occurrences are removed, so words that merely contain
    "amp" (e.g. "ramp") are left alone. Afterwards every run of whitespace is
    collapsed to a single space and the ends are trimmed.

    NOTE(review): the token is presumably what is left of "&amp;" HTML
    entities once the special characters have been stripped - confirm.

    Args:
        text: Post text.

    Returns:
        The text without standalone "amp" tokens and with single spaces.
    """
    standalone_amp = re.compile(r'\bamp\b', flags=re.IGNORECASE)
    without_amp = standalone_amp.sub('', text)

    # split() with no argument discards leading/trailing whitespace and
    # treats any whitespace run as one separator
    return ' '.join(without_amp.split())

In [10]:
# Extra cleaning step for the "TLDR" marker that was spotted in the anxiety corpus
def remove_tldr(text):
    """Delete standalone "TLDR" / "TL,DR" markers (any casing) and normalise whitespace.

    Only whole-word matches are removed. Afterwards every run of whitespace is
    collapsed to a single space and the ends are trimmed.

    Args:
        text: Post text.

    Returns:
        The text without the markers and with single spaces.
    """
    tldr_marker = re.compile(r'\bTLDR\b|\bTL,DR\b', flags=re.IGNORECASE)
    without_marker = tldr_marker.sub('', text)

    # split() with no argument discards leading/trailing whitespace and
    # treats any whitespace run as one separator
    return ' '.join(without_marker.split())

In [11]:
# Apply both extra cleaning steps to every post (not only the anxiety rows):
# first drop the "amp" tokens, then the "TLDR" markers.
combined['post'] = combined['post'].apply(remove_amp).apply(remove_tldr)

# Anxiety

In [12]:
# Boolean mask for the anxiety rows; the class id comes from the `label`
# mapping defined at the top of the notebook instead of a hard-coded 2.
filt = combined['label'] == label['anxiety']

In [13]:
# Keep only the rows selected by the boolean mask `filt`
df_anxiety = combined.loc[filt]

In [14]:
# Text column of the anxiety subset, as a Series
corpus_anxiety = df_anxiety.loc[:, 'post']

In [15]:
# Show the anxiety posts as the cell output (pandas truncates the display)
corpus_anxiety

32000    Extreme anxiety after Road Rage incident Almos...
32001    Cant sleep because I cant breathe I am current...
32002    Is this anxiety? I need help. Hello. Not tryin...
32003    DAE get really anxious on lazy days? I just ha...
32004    Best way to deal with new job anxiety? I have ...
                               ...                        
47995    Anyone try TMS? I am starting TMS on Monday fr...
47996    So Ive been thinking of getting Propranolol fo...
47997    Organic, natural, vegan CBD Hey guys, youve pr...
47998    Anxiety or MS? 25M. Over the last four months ...
47999    Does anyone get numbess in there face? This ju...
Name: post, Length: 16000, dtype: object

In [16]:
# Build the corpus text straight from df_anxiety rather than from
# corpus_anxiety itself: the old form rebound the name from a Series to a str,
# so running this cell a second time failed on `.values`.
# Posts are joined with a newline separator.
corpus_anxiety = '\n'.join(df_anxiety['post'].values.astype(str))

In [17]:
# Save the anxiety corpus to Drive. The encoding is pinned so the bytes
# written do not depend on the runtime's locale default.
with open('/content/drive/Shareddrives/DL + NLP/Proyecto DL + NLP/Entrega_Final/data/generative/ANXIETY/corpus_anxiety.txt', 'w', encoding='utf-8') as archivo:
    archivo.write(corpus_anxiety)

# ADHD

In [None]:
# Boolean mask for the ADHD rows; the class id comes from the `label`
# mapping defined at the top of the notebook instead of a hard-coded 1.
filt = combined['label'] == label['adhd']

In [None]:
# Keep only the rows selected by the boolean mask `filt`
df_adhd = combined.loc[filt]

In [None]:
# Text column of the ADHD subset, as a Series
corpus_adhd = df_adhd.loc[:, 'post']

In [None]:
# Show the ADHD posts as the cell output (pandas truncates the display).
# NOTE(review): this cell has no execution count and its stored output still
# shows "\n\n" inside posts, which remove_amp/remove_tldr collapse to a single
# space - the output looks stale; re-run the notebook to refresh it.
corpus_adhd

48000    Has anyone tried the GeneSight genetic test fo...
48001    I was doing so well... I was diagnosed back in...
48002    Is it possible to have ADHD and not typically ...
48003    Is it possible to treat ADD, anxiety, and depr...
48004    Everything seems very complicated. I was diagn...
                               ...                        
63995    Cant get proper medication where I live rant I...
63996    Last night I poured myself a smoothie in the k...
63997    How do I bring up ADHD to my doctor? Hi,\n\nI ...
63998    I have TV addiction. What should I do? Not act...
63999    Why it feels so hard to study? Its like, I rea...
Name: post, Length: 16000, dtype: object

In [None]:
# Build the corpus text straight from df_adhd rather than from corpus_adhd
# itself: the old form rebound the name from a Series to a str, so running
# this cell a second time failed on `.values`.
# Posts are joined with a newline separator.
corpus_adhd = '\n'.join(df_adhd['post'].values.astype(str))

In [None]:
# Save the ADHD corpus to Drive. The encoding is pinned so the bytes written
# do not depend on the runtime's locale default.
with open('/content/drive/Shareddrives/DL + NLP/Proyecto DL + NLP/Entrega_Final/data/generative/ADHD/corpus_adhd.txt', 'w', encoding='utf-8') as archivo:
    archivo.write(corpus_adhd)

# Depression

In [None]:
# Boolean mask for the depression rows; the class id comes from the `label`
# mapping defined at the top of the notebook instead of a hard-coded 5.
filt = combined['label'] == label['depression']

In [None]:
# Keep only the rows selected by the boolean mask `filt`
df_depression = combined.loc[filt]

In [None]:
# Text column of the depression subset, as a Series
corpus_depression = df_depression.loc[:, 'post']

In [None]:
# Show the depression posts as the cell output (pandas truncates the display).
# NOTE(review): this cell has no execution count, so the stored output may be
# from an earlier run - re-run the notebook to refresh it.
corpus_depression

16000    I want to enjoy relationships with people but ...
16001    I didnt ask to be here. I didnt ask to be born...
16002    I dont know, If i cant handle anything anymore...
16003    I want to know what it feels like to feel genu...
16004    It doesnt even matter I know this will most li...
                               ...                        
31995    I cant even ask for help I hate the sound of m...
31996    Reflections Ive always felt like I didnt belon...
31997    Going to work tomorrow feels impossible If I w...
31998    Its 3 am in germany, in my town is currently t...
31999    Depressed, heartbroken and confused Im a 33 yr...
Name: post, Length: 16000, dtype: object

In [None]:
# Build the corpus text straight from df_depression rather than from
# corpus_depression itself: the old form rebound the name from a Series to a
# str, so running this cell a second time failed on `.values`.
# Posts are joined with a newline separator.
corpus_depression = '\n'.join(df_depression['post'].values.astype(str))

In [None]:
# Save the depression corpus to Drive. The encoding is pinned so the bytes
# written do not depend on the runtime's locale default.
with open('/content/drive/Shareddrives/DL + NLP/Proyecto DL + NLP/Entrega_Final/data/generative/DEPRESSION/corpus_depression.txt', 'w', encoding='utf-8') as archivo:
    archivo.write(corpus_depression)

# Schizophrenia

In [None]:
# Boolean mask for the schizophrenia rows; the class id comes from the `label`
# mapping defined at the top of the notebook instead of a hard-coded 8.
filt = combined['label'] == label['schizophrenia']

In [None]:
# Keep only the rows selected by the boolean mask `filt`
df_schizophrenia = combined.loc[filt]

In [None]:
# Text column of the schizophrenia subset, as a Series
corpus_schizophrenia = df_schizophrenia.loc[:, 'post']

In [None]:
# Show the schizophrenia posts as the cell output (pandas truncates the display).
# NOTE(review): this cell has no execution count and its stored output still
# shows "\n\n" inside posts, which remove_amp/remove_tldr collapse to a single
# space - the output looks stale; re-run the notebook to refresh it.
corpus_schizophrenia

96000     Doesdid anyone else believe that they arewere ...
96001     My illness has taken everything from me but my...
96002     How to get a better therapist in a shit hole s...
96003     Decided to quit my job today I decided to quit...
96004     why can i smoke weed all day and not get psych...
                                ...                        
111995    Careful, it says Brought the voice back today ...
111996    literally cannot deal with having thoughts i w...
111997    Drowning in my mind. Everything is a program n...
111998    PORTAL I KEEP SEEING WHEATLEY FROM PORTAL AND ...
111999    ......... \n\n.\n\nWhats the chances for anxie...
Name: post, Length: 16000, dtype: object

In [None]:
# Build the corpus text straight from df_schizophrenia rather than from
# corpus_schizophrenia itself: the old form rebound the name from a Series to
# a str, so running this cell a second time failed on `.values`.
# Posts are joined with a newline separator.
corpus_schizophrenia = '\n'.join(df_schizophrenia['post'].values.astype(str))

In [None]:
# Save the schizophrenia corpus to Drive. The encoding is pinned so the bytes
# written do not depend on the runtime's locale default.
with open('/content/drive/Shareddrives/DL + NLP/Proyecto DL + NLP/Entrega_Final/data/generative/SCHIZOPHRENIA/corpus_schizophrenia.txt', 'w', encoding='utf-8') as archivo:
    archivo.write(corpus_schizophrenia)