In [None]:
!pip install transformers

In [None]:
# Standard library
import re
import warnings

# Third-party
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Embedding,
    GlobalAveragePooling1D,
    Input,
    Layer,
    LayerNormalization,
    MultiHeadAttention,
)
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers.legacy import Adam
from transformers import pipeline

# NLTK corpora used by the stop-word filtering and lemmatisation steps.
nltk.download('stopwords')
nltk.download('wordnet')

# NumPy 2 no longer exposes VisibleDeprecationWarning in the top-level
# namespace; it lives in `np.exceptions` (NumPy >= 1.25). Fall back to `np`
# itself on older releases that have no `exceptions` submodule.
_numpy_exceptions = getattr(np, "exceptions", np)
warnings.filterwarnings("ignore", category=_numpy_exceptions.VisibleDeprecationWarning)

# importing the Dataset
DATA_PATH = "/content/drive/MyDrive/INDOML Dataset/Datasets/INDOMLNLPTEXT.csv"
messages = pd.read_csv(DATA_PATH)

In [None]:
class TransformerBlock(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalisation.

    Args:
        embed_dim: Size of the last axis of the inputs. Also used as the
            per-head key size of the attention layer and as the output width
            of the feed-forward network, so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) layer of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation="relu"),
             Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs` and return a tensor of the same shape.

        Args:
            inputs: Tensor whose last axis has size `embed_dim`.
            training: Forwarded to the dropout layers. It defaults to None so
                the layer can still be called when Keras does not inject the
                argument (e.g. while building a functional model); the dropout
                layers then resolve the mode themselves.
        """
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position embedding table.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Output size of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the values in `x` and add the embedding of each position."""
        # Positions run over the last axis of the input.
        sequence_length = tf.shape(x)[-1]
        # The position lookup stays ahead of the token lookup so both
        # sub-layers are first called in the same order as before.
        position_vectors = self.pos_emb(tf.range(sequence_length))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
import pandas as pd
import nltk
from collections import Counter

# download the required nltk resources
# Newer NLTK releases load the tokenizer tables for word_tokenize from
# 'punkt_tab'; 'punkt' is still requested for older releases. A failed download
# is reported by nltk.download rather than raised, so asking for both is safe.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

# define a function to preprocess the text data
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words, reading the corpus only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """
    Preprocesses a given text string by tokenizing it into words,
    removing stop words, and lowercasing all words.

    Args:
        text: The message to tokenize. Anything that is not a string
            (for example the NaN pandas uses for an empty CSV cell) is
            treated as an empty message.

    Returns:
        A list of lowercased tokens with English stop words removed.
    """
    if not isinstance(text, str):
        return []
    # Loaded once and reused: this function runs for every row of the dataset.
    stop_words = _english_stop_words()
    lowered_tokens = (token.lower() for token in nltk.word_tokenize(text))
    return [token for token in lowered_tokens if token not in stop_words]

# define a function to compute the word frequencies and their corresponding indices
def compute_word_frequencies(messages):
    """
    Builds a word -> index lookup from the 'Text' column of the given dataframe.

    Every message is passed through `preprocess_text` and the resulting tokens
    are counted across the whole column. The most common word is given index 1,
    the next one index 2, and so on.
    """
    token_stream = (
        token
        for message in messages['Text']
        for token in preprocess_text(message)
    )
    ranked_words = Counter(token_stream).most_common()

    word_freqs = {}
    for position, (word, _count) in enumerate(ranked_words, start=1):
        word_freqs[word] = position
    return word_freqs

# compute the word frequencies and their indices
word_freqs = compute_word_frequencies(messages)


# replace the words in the original dataset with their corresponding indices
def _encode_message(message):
    """Render a message as space-separated word indices (0 for a word not in `word_freqs`)."""
    indices = [word_freqs.get(word.lower(), 0) for word in preprocess_text(message)]
    return ' '.join(str(index) for index in indices)


# NOTE(review): this overwrites the raw 'Text' column, so running the cell a
# second time builds the vocabulary from the already-encoded text.
messages['Text'] = messages['Text'].apply(_encode_message)


In [None]:
# Display the frame after the 'Text' column has been replaced by word indices.
messages

In [None]:
# Write the current frame to 'nlp_vector.csv' in the working directory, leaving out the index.
messages.to_csv('nlp_vector.csv', index=False)

In [None]:
lemmatizer = WordNetLemmatizer()

# Built once instead of once per row: matches any single delimiter character.
_LAMENT_DELIMITERS = '.',' ',',',';','!',':','?','\t','\n','\0'
_LAMENT_SPLITTER = re.compile('|'.join(map(re.escape, _LAMENT_DELIMITERS)))


def lament(a):
    """Lemmatize every delimiter-separated piece of `a` and re-join the pieces with spaces.

    Args:
        a: The text to process.

    Returns:
        A single string of the lemmatized pieces separated by one space each.
        Empty pieces (from adjacent delimiters) are kept, so the output can
        contain runs of spaces.
    """
    pieces = _LAMENT_SPLITTER.split(a)
    return ' '.join(lemmatizer.lemmatize(piece) for piece in pieces)


# Column-wise apply gives the same values as iterating over whole rows.
messages['Lemmatized'] = messages['Text'].apply(lament)
messages

In [None]:
str=" "
for index, row in messages.iterrows():
    str+=row['Lemmatized']

In [None]:
from nltk import FreqDist
words = str.split()
fdist1 = FreqDist(words)
print(fdist1)
print(fdist1.most_common())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
maxlen = 2500
# 'Lemmatized' holds word indices written as digits. CountVectorizer's default
# token_pattern only keeps tokens of two or more characters, which would drop
# the single-digit indices (the most frequent words), so one-character tokens
# are accepted explicitly.
cv = CountVectorizer(max_features=maxlen, token_pattern=r"(?u)\b\w+\b")
# NOTE(review): each row of X is a vector of per-term counts, not an ordered
# sequence of token ids, yet the model built later embeds these values as
# token ids. Confirm this is intended.
X = pd.DataFrame(cv.fit_transform(messages['Lemmatized']).toarray())
y = messages['Label']
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
# Display the training feature matrix.
X_train

In [None]:
# Display the training labels.
y_train

In [None]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
num_classes = 16  # Width of the softmax output layer

# The token embedding needs a row for every value that appears in the inputs.
# 1278 is kept as the minimum table size and is only enlarged when the data
# contains a larger value, which would otherwise be an out-of-range lookup.
largest_input_value = int(max(np.asarray(X_train).max(), np.asarray(X_test).max()))
vocab_size = max(1278, largest_input_value + 1)

# Take the input width from the feature matrix itself: CountVectorizer's
# max_features is only an upper bound, so the matrix can be narrower than
# `maxlen`, and a fixed Input(shape=(maxlen,)) would then reject the data.
sequence_length = X_train.shape[1]

inputs = Input(shape=(sequence_length,))
embedding_layer = TokenAndPositionEmbedding(sequence_length, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# Average over the sequence axis to get one vector per example.
x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.1)(x)
outputs = Dense(num_classes, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)

In [None]:
# Configure training: Adam optimiser, sparse categorical cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
    optimizer="adam",
)

# Train for two epochs in batches of 64, passing the held-out split as the
# validation data.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=64,
)