In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,f1_score
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding,LSTM

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

import os

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
print(tf.__version__)
print(np.__version__)

2.17.0
1.26.4


In [None]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
train_path = '/content/drive/MyDrive/Analytics/NLP/Cyber_Crime/data/train.csv'
test_path='/content/drive/MyDrive/Analytics/NLP/Cyber_Crime/data/test.csv'
train=pd.read_csv(train_path)
test=pd.read_csv(test_path)
print(train.shape,test.shape)
train.head()

(93686, 3) (31229, 3)


Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [2]:
all_labels=list(set(list(test["category"].unique())+list(train["category"].unique())))

In [None]:
from nltk.corpus import stopwords
import re
import spacy
nlp=spacy.load('en_core_web_sm')
stopwords_list=stopwords.words('english')
def preprocess_text(text):
    # Lower Casing
    if not isinstance(text, str):
      text = str(text)
    text = text.lower()
    text=re.sub("[^a-zA-Z]"," ",text)
    # Tokenization
    #tokens = word_tokenize(text)
    tokens = text.split()

    # Remove stop words and numbers
    tokens = [word for word in tokens if word not in stopwords_list]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Stemming
    #stemmer = PorterStemmer()
    #tokens = [stemmer.stem(word) for word in tokens]

    return " ".join(tokens)

#Second implemenation is For better speed using spacy lib
"""
def preprocess_text2(text):
    # Lower Casing
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()

    # Remove non-alphabetic characters
    text = re.sub("[^a-zA-Z]", " ", text)

    # Tokenization and lemmatization using spaCy
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]

    return " ".join(tokens)
"""
from tqdm import tqdm
# Apply preprocessing to the 'text' column
train['preprocessed_text'] = train["crimeaditionalinfo"].apply(preprocess_text)
test['preprocessed_text'] = test["crimeaditionalinfo"].apply(preprocess_text)


In [None]:
all_labels

['Sexually Obscene material',
 'Online and Social Media Related Crime',
 'Any Other Cyber Crime',
 'Cryptocurrency Crime',
 'RapeGang Rape RGRSexually Abusive Content',
 'Online Financial Fraud',
 'Cyber Terrorism',
 'Crime Against Women & Children',
 'Hacking  Damage to computercomputer system etc',
 'Cyber Attack/ Dependent Crimes',
 'Report Unlawful Content',
 'Online Cyber Trafficking',
 'Online Gambling  Betting',
 'Ransomware',
 'Child Pornography CPChild Sexual Abuse Material CSAM',
 'Sexually Explicit Act']

In [None]:
from sklearn.preprocessing import LabelEncoder
Encoder = LabelEncoder()
Encoder.fit(pd.Series(all_labels))
train['category'] = Encoder.transform(train['category'] )


In [None]:
test['category'] = Encoder.transform(test['category'] )

In [None]:
import pickle
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(Encoder, f)
with open('label_encoder.pkl', 'rb') as f:
  loaded_encoder = pickle.load(f)

In [None]:
for category in train['category'].unique():
       if len(train[train['category'] == category]) == 1:
           print(f"Category '{category}' has only 1 sample.")

Category '13' has only 1 sample.


In [None]:
X_train, X_val, y_train, y_val = train_test_split(train['preprocessed_text'],
                                                  train['category'],
                                                  test_size=0.15,
                                                  random_state=42,)
                                                  #stratify=train['category'].values)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dropout

In [None]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train['preprocessed_text'])
word_index = tokenizer.word_index

# Convert text to sequences of integers
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(test['preprocessed_text']) # This line was missing



# Pad sequences to a fixed length
max_len = 100  # Adjust as needed
X_train = pad_sequences(X_train, maxlen=max_len)
X_val =  pad_sequences(X_val, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

X_train = np.array(X_train)
X_val = np.array(X_val)  # Assuming X_val is also preprocessed
X_test = np.array(X_test)

In [None]:
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
with open('tokenizer.pkl', 'rb') as f:
  loaded_tokenizer = pickle.load(f)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping

checkpoint_filepath = "/content/drive/MyDrive/Analytics/NLP/Cyber_Crime/LSTM_checkpoints/best_model.keras" #'best_model.hdf5'
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True
)
# Add Early Stopping Callback
early_stopping_callback = EarlyStopping(
    monitor='val_accuracy',  # Monitor validation accuracy
    patience=10,              # Number of epochs with no improvement after which training will be stopped
    mode='max',              # Maximize validation accuracy
    restore_best_weights=True # Restore model weights from the epoch with the best value of the monitored quantity.
)


In [None]:
model = Sequential()
model.add(Embedding(10000, 100))
model.add(LSTM(64, return_sequences=True, dropout=0.2))  # First LSTM layer with return_sequences=True
model.add(LSTM(32, dropout=0.2))  # Second LSTM layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(Encoder.classes_), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, LeakyReLU, SpatialDropout1D
from keras.initializers import GlorotUniform
from keras.regularizers import l2
from keras.layers import Bidirectional

model = Sequential()

# Embedding Layer
model.add(Embedding(10000, 100, input_length=max_len, embeddings_initializer=GlorotUniform()))

# First LSTM Layer with Dropout and Regularization
# Pass the GlorotUniform initializer to kernel_initializer
model.add(Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=l2(l2=1e-4), kernel_initializer=GlorotUniform())))

# Second Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=l2(l2=1e-4), kernel_initializer=GlorotUniform()))) #

# Dense Layer with LeakyReLU Activation and Regularization
model.add(Dense(64, activation='leaky_relu', kernel_regularizer=l2(l2=1e-4)))

# Dropout Layer
model.add(Dropout(0.2))

# Output Layer
model.add(Dense(len(Encoder.classes_), activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [None]:
model.fit(X_train, y_train, epochs=25, batch_size=32,
          validation_data=(X_val, y_val),callbacks=[checkpoint_callback,early_stopping_callback])

Epoch 1/25
[1m2489/2489[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m649s[0m 257ms/step - accuracy: 0.7174 - loss: 0.9873 - val_accuracy: 0.7546 - val_loss: 0.7756
Epoch 2/25
[1m2489/2489[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m675s[0m 254ms/step - accuracy: 0.7556 - loss: 0.7502 - val_accuracy: 0.7561 - val_loss: 0.7368
Epoch 3/25
[1m2489/2489[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m683s[0m 255ms/step - accuracy: 0.7674 - loss: 0.6958 - val_accuracy: 0.7584 - val_loss: 0.7537
Epoch 4/25
[1m2489/2489[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m679s[0m 254ms/step - accuracy: 0.7796 - loss: 0.6614 - val_accuracy: 0.7573 - val_loss: 0.7404
Epoch 5/25
[1m2489/2489[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m686s[0m 255ms/step - accuracy: 0.7949 - loss: 0.6230 - val_accuracy: 0.7578 - val_loss: 0.7382
Epoch 6/25
[1m2489/2489[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m680s[0m 255ms/step - accuracy: 0.8043 - loss: 0.5918 - val_accuracy: 0.7482 - val_loss:

<keras.src.callbacks.history.History at 0x78da658b5c90>

In [None]:
# Evaluate the model on the test set
y_test=test['category']

from sklearn.metrics import accuracy_score, f1_score

# Make predictions on the test set
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_classes)

f1 = f1_score(y_test, y_pred_classes, average='weighted')

print('Test Accuracy:', accuracy)
print('Test F1-score:', f1)

[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 55ms/step
Test Accuracy: 0.7479586281981492
Test F1-score: 0.7096525882303913


In [None]:
#Code for loaded the saved model and predicting with it


from tensorflow.keras.models import load_model

# Load the saved model
model = load_model(checkpoint_filepath)

# Make predictions on new data
new_data =  """
I discovered that my personal information, including my [specific personal information, e.g., Social Security number,
 credit card numbers, or bank account information] had been compromised. I believe this information was used to
  [describe how the information was misused, e.g., open fraudulent accounts, make unauthorized purchases, or file fraudulent tax returns].
  I have attached [evidence, such as credit reports, police reports, or fraudulent transaction statements] to support my claim. I am requesting a
  thorough investigation into this matter and appropriate legal action against the individuals responsible.
"""
new_data = tokenizer.texts_to_sequences(new_data)
new_data = pad_sequences(new_data, maxlen=max_len)
predictions = model.predict(new_data)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 84ms/step


In [None]:
classs=np.argmax(np.argmax(predictions,axis=0))
Encoder.inverse_transform([classs])

array(['Online and Social Media Related Crime'], dtype=object)

### Our model is Severely Overfitting

**Addressing Overfitting in Text Classification**

**Overfitting** occurs when a model becomes too complex and learns the training data too well, leading to poor generalization on unseen data.

**Strategies to Mitigate Overfitting:**

1. **Data Augmentation:**
   - Increase data diversity by creating new samples through techniques like back-translation, synonym replacement, and text generation.

2. **Regularization:**
   - **L1/L2 Regularization:** Penalizes large weights, reducing model complexity.
   - **Dropout:** Randomly drops units during training, preventing co-adaptation.

3. **Early Stopping:**
   - Monitors a validation set and stops training when performance starts to degrade.

4. *Model Architecture:**
   - **Pre-trained Models:** Use pre-trained language models like BERT, RoBERTa, or ALBERT as a strong foundation.

5. **Hyperparameter Tuning:**
   - Experiment with different hyperparameters like learning rate, batch size, and number of epochs.
6. **Effective Weight Initialization:**
  - Initial Weights impacts the performace of our model experimenting with different initialization methods will possibly contribute to a better generalization

By implementing these techniques, you can effectively mitigate overfitting and improve the generalization performance of your text classification model.


## Naive Bayes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
import pickle



X_train, X_val, y_train, y_val = train_test_split(train['preprocessed_text'],
                                                  train['category'],
                                                  test_size=0.15,
                                                  random_state=42,)

#train['preprocessed_text']
#test['preprocessed_text']

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)
X_test=vectorizer.transform(test['preprocessed_text'])


# Naive Bayes (MultinomialNB)
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)

# Make predictions
y_pred_nb = naive_bayes_model.predict(X_val)

# Evaluate Naive Bayes Model
accuracy_nb = accuracy_score(y_val, y_pred_nb)
f1_nb = f1_score(y_val, y_pred_nb, average='weighted')
print("Accuracy (Naive Bayes):", accuracy_nb)
print("F1-score (Naive Bayes):", f1_nb)

# Save the Naive Bayes model with pickle
with open('naive_bayes_model.pkl', 'wb') as f:
    pickle.dump(naive_bayes_model, f)


Accuracy (Naive Bayes): 0.7199886145307052
F1-score (Naive Bayes): 0.631301443007262
