In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import  train_test_split

In [None]:
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is read from it in later cells.
nltk.download('stopwords')
#nltk.download('all')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Print the English stop-word list
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Load the raw posts CSV; the path is kept in one named constant so it is easy to retarget.
DATASET_CSV = '/content/drive/MyDrive/Thesis/suicide_data_main.csv'
suicide_data = pd.read_csv(DATASET_CSV)

In [None]:
# Preview the first 10 rows of the dataset
suicide_data.head(10)

Unnamed: 0.1,Unnamed: 0,post_id,user_id,timestamp,post_title,post_body,text,labels,suicide_class
0,0,wfimt,22002,1342075703,[real] motivation,This is my first post on reddit. Some time ago...,[real] motivation This is my first post on red...,0,Suicide
1,1,1bsqv3,22002,1365261010,simple question about transfering acc to anoth...,Hi.. What will happen with my ranked rating? I...,simple question about transfering acc to anoth...,1,Non_Suicide
2,2,1dr0xf,22002,1367787358,simple question: Did you get unnbaned?,Hi. Simple question. Did you get unban from a ...,simple question: Did you get unnbaned? Hi. Sim...,1,Non_Suicide
3,3,1e0noi,22002,1368125785,I can't win. Why... and it's noy my fault.,Hi... Am playing at Eu west... am diamond 5 ri...,I can't win. Why... and it's noy my fault. Hi....,1,Non_Suicide
4,4,1f0y6g,22002,1369483647,Diamond 5. Mrr rest,If i will switch server to EU and then go back...,Diamond 5. Mrr rest If i will switch server to...,1,Non_Suicide
5,5,1g4s52,22002,1370970723,Scared of next ban. What can i do?,Hi guys. I already got a one perm ban. I creat...,Scared of next ban. What can i do? Hi guys. I ...,1,Non_Suicide
6,6,1it5ts,22002,1374498763,Too all players who had huge impact on losing ...,"Instead of ""you are reported"" you should say t...",Too all players who had huge impact on losing ...,1,Non_Suicide
7,7,1sge5s,22002,1386584670,"Short question, short story, Pls help me decide.",Hi am LOL player.. I think i dont understand n...,"Short question, short story, Pls help me decid...",1,Non_Suicide
8,8,1sjbf9,22002,1386664896,Maybe it's the right time to fa...,Maybe it's the right moment TO **FACE** to fu...,Maybe it's the right time to fa... Maybe it's ...,0,Suicide
9,9,226mcd,22002,1396611845,My conclusion after thinking about suicide in ...,"Well, i want to share it with somebody... If i...",My conclusion after thinking about suicide in ...,0,Suicide


In [None]:
# (rows, columns) of the loaded frame
suicide_data.shape

(698997, 9)

In [None]:
# Count missing values per column
suicide_data.isnull().sum()

Unnamed: 0       0
post_id          0
user_id          0
timestamp        0
post_title       0
post_body        0
text             0
labels           0
suicide_class    0
dtype: int64

In [None]:
def cleanText(text):
    """Normalize one raw post into lowercase text without markup or punctuation.

    Steps, in order:
      1. lowercase;
      2. drop HTML-like tags, [bracketed] spans, URLs and @mentions;
      3. replace every remaining non-word character with a space;
      4. drop every token that contains a digit.

    The structured patterns in step 2 must run before step 3. Once the
    punctuation has been turned into spaces, the characters those patterns
    key on ('<', '>', '://', '.', '@') no longer exist and they can never match.

    Args:
        text: Raw post text as a str.

    Returns:
        The cleaned string. Runs of spaces left by removed spans are not
        collapsed; later tokenization is expected to handle them.
    """
    text = text.lower()
    # Tags go first so a URL inside an attribute cannot swallow the tag's
    # closing '>' and the text that follows it.
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    # Generic punctuation/whitespace pass. Newlines are non-word characters,
    # so they become spaces here and need no separate handling.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Clean every post; this overwrites the 'text' column with the cleaned strings
suicide_data['text'] = suicide_data['text'].apply(cleanText)

In [None]:
#tokenization
def tokenize(txt):
  """Return the word tokens of `txt` as a list of strings.

  A token is a maximal run of word characters. Unlike splitting on
  separators, this never yields empty-string tokens when the text starts or
  ends with a separator, and an empty input gives an empty list.
  """
  return re.findall(r'\w+', txt)

# Replace each string in 'text' with its token list.
# The extra .lower() is a no-op when cleanText (which lowercases) has already been applied.
suicide_data['text'] = suicide_data['text'].apply(lambda x : tokenize(x.lower()))

In [None]:
# Stop-word removal: English stop words held in a set for fast membership checks
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
  """Drop stop words from a token sequence and join the rest with single spaces.

  `text` is iterated token by token (it receives the token lists produced by
  the tokenization step); tokens found in STOPWORDS are skipped.
  """
  kept = []
  for token in text:
    if token in STOPWORDS:
      continue
    kept.append(token)
  return " ".join(kept)

# Filter stop words row by row; 'text' becomes a space-joined string again
suicide_data['text'] = suicide_data['text'].apply(remove_stopwords)

In [None]:
from nltk.stem.snowball import SnowballStemmer
# Download NLTK resources for the stemming step.
# 'punkt' is presumably what nltk.word_tokenize (used in stem_text / lemmatize_text) relies on.
# NOTE(review): SnowballStemmer may not actually require 'snowball_data' — confirm and drop if unused.
nltk.download('snowball_data')
nltk.download('punkt')

[nltk_data] Downloading package snowball_data to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Initialize the English Snowball stemmer; stem_text reads this module-level instance
stemmer = SnowballStemmer('english')

In [None]:
#Stemming
def stem_text(text):
    """Tokenize `text` with nltk.word_tokenize and return the stems joined by single spaces."""
    return ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(text))

In [None]:
# Apply stemming to the 'text' column (overwrites it with the stemmed strings)
suicide_data['text'] = suicide_data['text'].apply(stem_text)

In [None]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [None]:
# Download the WordNet corpus (for WordNetLemmatizer) and the perceptron
# POS-tagger model (presumably what nltk.pos_tag loads in lemmatize_text)
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Initialize the WordNet lemmatizer; lemmatize_text reads this module-level instance
lemmatizer = WordNetLemmatizer()

In [None]:
# POS-aware lemmatization of a single text
def lemmatize_text(text):
    """Lemmatize `text` word by word using WordNet and part-of-speech tags.

    The text is tokenized with nltk.word_tokenize, tagged with nltk.pos_tag,
    and each token is passed to the module-level `lemmatizer` together with
    the WordNet POS derived from its Penn Treebank tag.

    Returns:
        The lemmas joined by single spaces.
    """
    def to_wordnet_pos(penn_tag):
        # Only the first letter of the Penn Treebank tag decides the WordNet class.
        initial = penn_tag[:1]
        if initial == 'J':
            return wordnet.ADJ
        if initial == 'V':
            return wordnet.VERB
        if initial == 'R':
            return wordnet.ADV
        # 'N' tags and anything unrecognized are treated as nouns.
        return wordnet.NOUN

    lemmas = []
    for token, penn_tag in nltk.pos_tag(nltk.word_tokenize(text)):
        lemmas.append(lemmatizer.lemmatize(token, to_wordnet_pos(penn_tag)))
    return ' '.join(lemmas)

In [None]:
# Apply lemmatization to the 'text' column (overwrites it again).
# NOTE(review): at this point 'text' has already been stemmed; stems such as
# 'motiv' or 'simpl' (see the displayed output) are unlikely to be WordNet
# entries, so this pass may change very little. Consider lemmatizing instead
# of, or before, stemming.
suicide_data['text'] = suicide_data['text'].apply(lemmatize_text)

In [None]:
# Display the processed frame to inspect the rewritten 'text' column
suicide_data

Unnamed: 0.1,Unnamed: 0,post_id,user_id,timestamp,post_title,post_body,text,labels,suicide_class
0,0,wfimt,22002,1342075703,[real] motivation,This is my first post on reddit. Some time ago...,motiv first post reddit time ago want chang li...,0,Suicide
1,1,1bsqv3,22002,1365261010,simple question about transfering acc to anoth...,Hi.. What will happen with my ranked rating? I...,simpl question transfer acc anoth server hi ha...,1,Non_Suicide
2,2,1dr0xf,22002,1367787358,simple question: Did you get unnbaned?,Hi. Simple question. Did you get unban from a ...,simpl question get unnban hi simpl question ge...,1,Non_Suicide
3,3,1e0noi,22002,1368125785,I can't win. Why... and it's noy my fault.,Hi... Am playing at Eu west... am diamond 5 ri...,win noy fault hi play eu west diamond right dr...,1,Non_Suicide
4,4,1f0y6g,22002,1369483647,Diamond 5. Mrr rest,If i will switch server to EU and then go back...,diamond mrr rest switch server eu go back west...,1,Non_Suicide
...,...,...,...,...,...,...,...,...,...
698992,698992,2rglgg,-39973,1420505138,"[Offline] San Fernando Valley, _PERSON_. Looki...",I DM a group of 3 at the moment. A Tiefling fi...,san fernando valley look player dm group mome...,1,Non_Suicide
698993,698993,2uqxiv,-39973,1423052014,Exhaustion.,Just wondering how other DMs play with it. Whe...,exhaust wonder dm play pc gain level tell play...,1,Non_Suicide
698994,698994,21hlxs,-39990,1395905138,I had this crazy idea. Pick 2 pictures and I w...,"So if you've seen my latest posts, I like to t...",crazi idea pick pictur combin see late post li...,1,Non_Suicide
698995,698995,25311d,-39990,1399591174,There's not enough animated futa. I plan to fi...,"Fellow futanari fanatics, as I'm sure you woul...",enough anim futa plan fix need help fellow fut...,1,Non_Suicide


In [None]:
# Keep only the post identifier, the processed text and the two label columns for export
columns_to_export = suicide_data[['post_id','text','labels','suicide_class']]

In [None]:
from google.colab import files
# Write the selected columns to CSV and hand the file to files.download.
# index=False keeps the row index out of the file. Without it the index is
# written as an unnamed first column and reappears as a spurious 'Unnamed: 0'
# column when the CSV is read back, like the one already present in the input.
columns_to_export.to_csv('suicide_data_cleaned.csv', encoding = 'utf-8-sig', index=False)
files.download('suicide_data_cleaned.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>