# Preprocessing Text to be used in models

# Importing the required libraries

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from nltk.stem.snowball import SnowballStemmer
import tqdm


# Read the csv and extract the comments and the labels

In [None]:
# Relative path to the sarcasm CSV that the rest of the notebook works on.
dir = "../input/sarcastic-comments-on-reddit/train-balanced-sarcasm.csv"
# Load the full CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=dir)
# Preview the first five rows (rendered as the cell output).
data.head(n=5)

In [None]:
# Pull the raw comment text and its label out of the DataFrame as arrays;
# the two arrays stay index-aligned (row i of one matches row i of the other).
comments, labels = data['comment'].values, data['label'].values

# Cleaning the text: removing mentions, links, punctuation and other non-alphanumeric characters

In [None]:
# Regex of everything that gets replaced by a single space during cleaning.
# Alternatives, tried left to right:
#   @\S+            -> @-mentions
#   https?:\S+      -> http:/https: links up to the next whitespace
#   http?:\S        -> kept from the original pattern
#   [^A-Za-z0-9]+   -> any run of non-alphanumeric characters
# Written as a raw string so that `\S` reaches the regex engine verbatim
# instead of being an invalid string escape (a SyntaxWarning on recent Pythons).
# NOTE(review): the third alternative looks like a leftover — `https?:\S+`
# already covers every "http:" link; confirm before removing it.
text_cleaning = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
# Shared English Snowball stemmer; stopwords are stemmed like any other word.
stemmer = SnowballStemmer(language='english', ignore_stopwords=False)

In [None]:
def preprocess_data(text):
    """Clean a raw comment and stem every word in it.

    The input is coerced to ``str`` and lower-cased, every match of the
    module-level ``text_cleaning`` pattern is replaced by a space, and the
    result is stripped. Each remaining whitespace-separated word is then
    stemmed individually with the module-level ``stemmer``.

    Args:
        text: The raw comment. Non-string values are converted with
            ``str()`` first.

    Returns:
        The cleaned comment as a single string of stemmed words separated
        by single spaces (empty string if nothing is left after cleaning).
    """
    # NOTE(review): because of str(), a missing comment (e.g. NaN) is kept
    # as the literal word "nan" — confirm that this is intended.
    cleaned = re.sub(text_cleaning, ' ', str(text).lower()).strip()
    # The stemmer works on one word at a time: handing it the whole sentence
    # would treat the sentence as a single "word" and leave almost all of it
    # unstemmed, so split on whitespace and stem token by token.
    return ' '.join(stemmer.stem(word) for word in cleaned.split())

# Preprocess every comment in order (with a progress bar), so that X[i]
# is the cleaned version of comments[i].
X = [preprocess_data(comment) for comment in tqdm.tqdm(comments)]

# Tokenize the cleaned text and pad the resulting sequences to a fixed length

In [None]:
# Build the word index from the cleaned comments; '<OOV>' is the token
# used for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(texts=X)

In [None]:
# Turn each cleaned comment into its list of integer word ids.
sequences = tokenizer.texts_to_sequences(texts=X)
# Bring every sequence to exactly 20 ids, padding at the end where needed.
padded = pad_sequences(
    sequences,
    maxlen=20,
    padding='post',
)

# Split the data into train and test sets

In [None]:
# Split the padded sequences and their labels into train and test parts
# using train_test_split's default settings (no fixed random_state, so the
# split differs from run to run).
xtrain, xtest, ytrain, ytest = train_test_split(
    np.array(padded),
    np.array(labels),
)

# Now the data is ready to be used as inputs in a neural network

In [None]:
# Show one example at every stage of the pipeline: raw comment, cleaned
# text, token ids, and the padded id sequence.
sample_idx = 860
print(
    f"Actual Sentence: {comments[sample_idx]}\n"
    f"Stemmed Sentence: {X[sample_idx]}\n"
    f"Tokenized: {sequences[sample_idx]}\n"
    f"Padded: {padded[sample_idx]}"
)