### Importing the Dataset

In [None]:
import numpy as np
import pandas as pd

# Load the raw posts and discard the "Unnamed: 0" column (presumably an index
# that was saved along with the CSV).
posts = pd.read_csv("../data/Suicide_Detection.csv").drop(columns=["Unnamed: 0"])
# Recode the label as a binary int16 target: 1 for "suicide", 0 for anything else.
posts["class"] = (posts["class"] == "suicide").astype("int16")

### Exploring the Dataset

In [None]:
# Preview the first rows to check the remaining columns and the recoded labels.
posts.head()

In [None]:
# Summary statistics; by default pandas reports only the numeric columns here.
posts.describe()

### Splitting the Dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the posts as the test set. The split is stratified on the
# label so that both parts keep the same class ratio, as the `strat_` names
# imply; the fixed seed keeps the split reproducible.
strat_train_set, strat_test_set = train_test_split(
    posts, test_size=0.1, random_state=42, stratify=posts["class"]
)

#### Analyzing the Dataset

In [None]:
# From here on `posts` is rebound to a copy of the training split, presumably
# so that the analysis below never looks at the held-out test rows.
posts = strat_train_set.copy()
posts.head()

In [None]:
# Word count of each post (whitespace split), computed once per post.
# Posts with 1500 words or more are left out, presumably so that a few very
# long outliers do not stretch the histogram plotted from this list.
word_counts = (len(post.split()) for post in posts["text"])
post_lengths = [count for count in word_counts if count < 1500]

In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-post word counts, using 100 bins.
_, axis = plt.subplots()
axis.hist(post_lengths, bins=100)
plt.show()

### Preprocessing

#### Creating validation data

In [None]:
# Carve the validation set out of the training posts. `posts` holds the 90%
# left after the test split, so 1/9 of it is 10% of the full dataset.
# Stratifying on the label keeps the class ratio equal in both parts, as the
# `strat_` names imply.
strat_train_set, strat_val_set = train_test_split(
    posts, test_size=1/9, random_state=1, stratify=posts["class"]
)

strat_train_set[:5]

#### Tokenizing the posts

In [None]:
import spacy

# The GPU has to be selected BEFORE the pipeline is loaded; spaCy allocates a
# pipeline's data on whichever device is active at load time. This raises if
# no GPU is available.
spacy.require_gpu()
# Only lemmas and token flags are needed downstream, so the parser and NER
# components are switched off.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
nlp.pipe_names

In [None]:
# Sanity check on one sentence: print the lemmas of every token that is
# neither a stop word nor punctuation.
doc = nlp('I was reading the paper.')
kept_lemmas = [tok.lemma_ for tok in doc if not (tok.is_stop or tok.is_punct)]
print(kept_lemmas)

In [None]:
import pickle
# from alive_progress import alive_it

def preprocess_set(set, directory, max_words=500):
    """Lemmatise the posts of one data split and pickle the result.

    Each post is cut to its first ``max_words`` whitespace-separated words,
    run through the module-level spaCy pipeline ``nlp``, and reduced to the
    space-joined lemmas of the tokens that are neither stop words nor
    punctuation. The processed texts and the labels are then written to
    ``<directory>/texts.pkl`` and ``<directory>/labels.pkl``.

    Args:
        set: Table with a ``'text'`` and a ``'class'`` column (the callers
            pass pandas DataFrames). The name shadows the builtin ``set``;
            it is kept so that existing callers keep working.
        directory: Output directory; created, including parents, if missing.
            It also appears in the progress message.
        max_words: Maximum number of words kept per post before the
            pipeline runs. Defaults to 500.

    Returns:
        A tuple ``(processed_texts, labels)``: a list with one lemma string
        per post and a NumPy array of the ``'class'`` values.
    """
    from pathlib import Path

    print(f'Preprocessing {directory} data')

    # Truncate before running spaCy so very long posts do not dominate runtime.
    truncated = [' '.join(text.split()[:max_words]) for text in set['text']]

    # nlp.pipe streams the texts through the pipeline in batches.
    processed_texts = [
        ' '.join(
            token.lemma_
            for token in doc
            if not token.is_stop and not token.is_punct
        )
        for doc in nlp.pipe(truncated)
    ]

    # np.array copies the column, so the caller's frame is never aliased.
    labels = np.array(set['class'])

    out_dir = Path(directory)
    out_dir.mkdir(parents=True, exist_ok=True)

    with open(out_dir / "texts.pkl", "wb") as fp:
        pickle.dump(processed_texts, fp)

    with open(out_dir / "labels.pkl", "wb") as fp:
        pickle.dump(labels, fp)

    return processed_texts, labels

In [None]:
# Number of posts in the training split (after the validation rows were removed).
len(strat_train_set)

In [None]:
# Preprocess and pickle all three splits. Every result is bound to a name so
# that the validation/test outputs stay available and the final call does not
# echo its whole return value as the cell output.
processed_texts, labels = preprocess_set(strat_train_set, 'train')
val_texts, val_labels = preprocess_set(strat_val_set, 'val')
test_texts, test_labels = preprocess_set(strat_test_set, 'test')