In [66]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier  # Import LightGBM classifier

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [67]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import KeyedVectors
from tqdm import tqdm
from nltk.tokenize import word_tokenize

# Load your processed data
df_train = pd.read_csv('train_processed_data.csv')
df_test = pd.read_csv('test_processed_data.csv')

# Check for NaN values in training and testing sets
print("Training data NaN values:", df_train.isnull().sum())
print("Testing data NaN values:", df_test.isnull().sum())

# Fill NaN values with an empty string or drop them
df_train['crimeaditionalinfo_preprocessed'] = df_train['crimeaditionalinfo_preprocessed'].fillna('')
df_test['crimeaditionalinfo_preprocessed'] = df_test['crimeaditionalinfo_preprocessed'].fillna('')

# Define the target and features
X_text_train = df_train['crimeaditionalinfo_preprocessed']
y_train = df_train['category']  # Assuming 'category' column is the label

X_text_test = df_test['crimeaditionalinfo_preprocessed']  # Fixed to use test set
y_test = df_test['category']  # Assuming 'category' column is the label

# Combine training and testing labels to ensure all categories are captured
all_categories = pd.concat([y_train, y_test]).unique()

# Get unique categories from training data
train_categories = y_train.unique()

# Filter test data to only include categories present in training data
df_test= df_test[df_test['category'].isin(train_categories)]
X_text_test = df_test['crimeaditionalinfo_preprocessed']
y_test = df_test['category']


# Load pre-trained GloVe and Word2Vec embeddings
glove_path = 'glove.6B.300d.txt'
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'

# Load GloVe embeddings
def load_glove_embeddings(glove_path):
    embeddings = {}
    with open(glove_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Load Word2Vec embeddings
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Generate embeddings for a given text
def get_embedding(text, embeddings, dim=300):
    tokens = word_tokenize(text)
    vectors = [embeddings[word] for word in tokens if word in embeddings]
    if len(vectors) == 0:
        return np.zeros(dim)
    else:
        return np.mean(vectors, axis=0)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_train = tfidf_vectorizer.fit_transform(X_text_train)
tfidf_test = tfidf_vectorizer.transform(X_text_test)

# Convert text to GloVe and Word2Vec embeddings
glove_embeddings = load_glove_embeddings(glove_path)
X_train_glove = np.array([get_embedding(text, glove_embeddings) for text in tqdm(X_text_train)])
X_test_glove = np.array([get_embedding(text, glove_embeddings) for text in tqdm(X_text_test)])

X_train_w2v = np.array([get_embedding(text, word2vec) for text in tqdm(X_text_train)])
X_test_w2v = np.array([get_embedding(text, word2vec) for text in tqdm(X_text_test)])

# Concatenate TF-IDF and Word2Vec embeddings (GloVe can be added similarly)
X_train_combined =  np.hstack((tfidf_train.toarray(), X_train_w2v,X_train_glove))  # X_train_glove, 
X_test_combined = np.hstack((tfidf_test.toarray(), X_test_w2v,X_test_glove))  # X_test_glove,

from sklearn.decomposition import PCA
import numpy as np

# Apply PCA without limiting components initially
pca = PCA()
y_train_pca_full = pca.fit_transform(X_train_combined)

# Calculate cumulative variance
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance >= 0.95) + 1  # Selects the smallest number of components for 95% variance

print(f"Selected number of components for 95% variance: {n_components}")

# Train and evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{model.__class__.__name__} Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))


Training data NaN values: category                              0
sub_category                       6591
crimeaditionalinfo                   21
crimeaditionalinfo_preprocessed    1245
dtype: int64
Testing data NaN values: category                              0
sub_category                       2236
crimeaditionalinfo                    7
crimeaditionalinfo_preprocessed     393
dtype: int64


100%|██████████| 93686/93686 [00:17<00:00, 5224.78it/s]
100%|██████████| 31225/31225 [00:06<00:00, 5168.06it/s]
100%|██████████| 93686/93686 [00:20<00:00, 4536.68it/s]
100%|██████████| 31225/31225 [00:07<00:00, 4380.77it/s]


Selected number of components for 95% variance: 639


In [68]:
# Re-initialize PCA with selected components for 95% variance
#pca = PCA(n_components=n_components)
#X_train_combined = pca.fit_transform(X_train_combined)
#X_test_combined = pca.transform(X_test_combined)

In [69]:
print("Training RF")
# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42) # class_weight='balanced'
evaluate_model(rf, X_train_combined, y_train, X_test_combined, y_test)

Training RF
RandomForestClassifier Accuracy: 0.7431


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.58      0.05      0.09      3670
Child Pornography CPChild Sexual Abuse Material CSAM       0.79      0.22      0.34       123
                                Cryptocurrency Crime       1.00      0.10      0.18       166
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00      1261
                                     Cyber Terrorism       0.00      0.00      0.00        52
      Hacking  Damage to computercomputer system etc       0.91      0.02      0.03       592
                            Online Cyber Trafficking       0.00      0.00      0.00        61
                              Online Financial Fraud       0.74      0.99      0.85     18896
                            Online Gambling  Betting       0.00      0.00      0.00       134
               Online and Social Media Related Crime       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [70]:
print("Training XG Boost")
from sklearn.preprocessing import LabelEncoder

# Label Encoding for y_train and y_test
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# XGBoost
xgb = XGBClassifier(device='cuda', use_label_encoder=False, eval_metric='mlogloss', random_state=42,learning_rate=0.05 )
evaluate_model(xgb, X_train_combined, y_train_encoded , X_test_combined, y_test_encoded)

Training XG Boost


Parameters: { "use_label_encoder" } are not used.



XGBClassifier Accuracy: 0.7523
              precision    recall  f1-score   support

           0       0.54      0.10      0.17      3670
           1       0.79      0.24      0.37       123
           2       0.76      0.37      0.50       166
           3       1.00      1.00      1.00      1261
           4       0.00      0.00      0.00        52
           5       0.47      0.22      0.30       592
           6       0.00      0.00      0.00        61
           7       0.77      0.98      0.86     18896
           8       0.77      0.07      0.14       134
           9       0.57      0.55      0.56      4139
          10       0.50      0.22      0.31        18
          11       0.99      0.91      0.95       912
          13       0.61      0.03      0.05       535
          14       0.64      0.06      0.11       666

    accuracy                           0.75     31225
   macro avg       0.60      0.34      0.38     31225
weighted avg       0.72      0.75      0.70     3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [71]:
print("Training LGBM")
# LightGBM
lgbm = LGBMClassifier(random_state=42,learning_rate=0.05)  # Initialize LightGBM classifier class_weight='balanced'
evaluate_model(lgbm, X_train_combined, y_train, X_test_combined, y_test)

Training LGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.117604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 492153
[LightGBM] [Info] Number of data points in the train set: 93686, number of used features: 5529
[LightGBM] [Info] Start training from score -2.153206
[LightGBM] [Info] Start training from score -5.510168
[LightGBM] [Info] Start training from score -5.273918
[LightGBM] [Info] Start training from score -3.256795
[LightGBM] [Info] Start training from score -6.366300
[LightGBM] [Info] Start training from score -4.003455
[LightGBM] [Info] Start training from score -6.238218
[LightGBM] [Info] Start training from score -0.489312
[LightGBM] [Info] Start training from score -5.351879
[LightGBM] [Info] Start training from score -2.043443
[LightGBM] [Info] Start training from score -7.422352
[LightGBM] [Info] Start training from score -3.502503
[LightGBM] [Info] Start training from score -

- Using Word2Vec
  - XGBOOst: 0.74
  - LGBM=RF=0.72

- Tf-IDF + Word2vec
  - RF: 0.73
  - XGBOOST: 0.77  OVR: 0.75
  - LGBM: 0.76
- Glove
  - Rf: 0.741
  - boost: 0.74
  - lgbm: 0.70


- Tf + Glove + Word2vec + PCA
  - RF:0.70
  - XGBoost:
  - LGBM:

### Build model each for all classes

In [72]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelBinarizer
import pandas as pd


# Initialize a label binarizer
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_train)

# Store individual models and predictions
models = {}
predictions = pd.DataFrame()

for class_label in label_binarizer.classes_:
    # Binarize the target variable for the current class
    y_train_bin = (y_train == class_label).astype(int)
    y_test_bin = (y_test == class_label).astype(int)
    
    # Initialize and train the model for the current class
    model = XGBClassifier(device='cuda',use_label_encoder=False, eval_metric='mlogloss', random_state=42, learning_rate=0.05)
    model.fit(X_train_combined, y_train_bin)
    
    # Store the model
    models[class_label] = model
    
    # Make predictions and store them in the DataFrame
    predictions[class_label] = model.predict_proba(X_test_combined)[:, 1]  # Probability of being in the class


# Choose the class with the highest probability
final_predictions = predictions.idxmax(axis=1)

# Evaluate the overall accuracy
print(f"Overall Accuracy: {accuracy_score(y_test, final_predictions):.4f}")
print(classification_report(y_test, final_predictions))


for class_label, model in models.items():
    y_test_bin = (y_test == class_label).astype(int)
    y_pred = (predictions[class_label] >= 0.5).astype(int)
    print(f"Evaluation for class '{class_label}':")
    print(classification_report(y_test_bin, y_pred))


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Overall Accuracy: 0.7516


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.58      0.09      0.15      3670
Child Pornography CPChild Sexual Abuse Material CSAM       0.80      0.27      0.40       123
                                Cryptocurrency Crime       0.78      0.28      0.42       166
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00      1261
                                     Cyber Terrorism       0.00      0.00      0.00        52
      Hacking  Damage to computercomputer system etc       0.45      0.18      0.25       592
                            Online Cyber Trafficking       0.00      0.00      0.00        61
                              Online Financial Fraud       0.77      0.98      0.86     18896
                            Online Gambling  Betting       0.89      0.06      0.11       134
               Online and Social Media Related Crime       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     31164
           1       0.00      0.00      0.00        61

    accuracy                           1.00     31225
   macro avg       0.50      0.50      0.50     31225
weighted avg       1.00      1.00      1.00     31225

Evaluation for class 'Online Financial Fraud':
              precision    recall  f1-score   support

           0       0.83      0.72      0.78     12329
           1       0.83      0.91      0.87     18896

    accuracy                           0.83     31225
   macro avg       0.83      0.81      0.82     31225
weighted avg       0.83      0.83      0.83     31225

Evaluation for class 'Online Gambling  Betting':
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     31091
           1       0.82      0.07      0.12       134

    accuracy                           1.00     31225
   macro avg       0.91      0.5

In [73]:
# Evaluate the overall accuracy
print(f"Overall Accuracy: {accuracy_score(y_test, final_predictions):.4f}")
print(classification_report(y_test, final_predictions))

Overall Accuracy: 0.7516


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.58      0.09      0.15      3670
Child Pornography CPChild Sexual Abuse Material CSAM       0.80      0.27      0.40       123
                                Cryptocurrency Crime       0.78      0.28      0.42       166
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00      1261
                                     Cyber Terrorism       0.00      0.00      0.00        52
      Hacking  Damage to computercomputer system etc       0.45      0.18      0.25       592
                            Online Cyber Trafficking       0.00      0.00      0.00        61
                              Online Financial Fraud       0.77      0.98      0.86     18896
                            Online Gambling  Betting       0.89      0.06      0.11       134
               Online and Social Media Related Crime       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
