Preprocessing the dataset

In [None]:
import pandas as pd
import numpy as np
import string
import re
import matplotlib.pyplot as plt
import nltk
nltk.download('all')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
data = pd.read_csv("train.csv", encoding="ISO-8859-1")
test = pd.read_csv("test.csv")
data = data.rename(columns={'crimeaditionalinfo': 'text'})
data = data.dropna(subset=['text'])
test = test.rename(columns={'crimeaditionalinfo': 'text'}) # renaming the crime info column
test = test.dropna(subset=['text']) # dropping all entries with no information on the crime

In [None]:
null_count = data['sub_category'].isnull().sum()
data['sub_category'].fillna(data['category'], inplace=True)
test['sub_category'].fillna(data['category'], inplace=True) # replacing the null entries in sub_category column with the category of the complaint
print('null count: ', null_count)
data.groupby('category')['sub_category'].value_counts() # number of entries and sub categories under each categories

In [None]:
mapping = data.set_index('sub_category')['category'].to_dict()
def get_category(sub_category):
    return mapping.get(sub_category)
subcategories = data['sub_category'].unique().tolist()


In [None]:
unique_data_subcategories = set(data['sub_category'].unique())
unique_test_subcategories = set(test['sub_category'].unique())

exclusive_in_data_subcategories = unique_data_subcategories - unique_test_subcategories
exclusive_in_test_subcategories = unique_test_subcategories - unique_data_subcategories

print("Subcategories exclusive to 'train' dataframe:")
print(list(exclusive_in_data_subcategories))
print("\nSubcategories exclusive to 'test' dataframe:")
print(list(exclusive_in_test_subcategories))
exclusive_in_test_subcategories

In [None]:
unique_data_categories = set(data['category'].unique())
unique_test_categories = set(test['category'].unique())

exclusive_in_data = unique_data_categories - unique_test_categories
exclusive_in_test = unique_test_categories - unique_data_categories

exclusive_in_data, exclusive_in_test


In [None]:
data = data[data['word_count'] >= 4]
stop_words = set(stopwords.words('english'))
data['text'] = data['text'].str.lower().apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
test['text'] = test['text'].str.lower().apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))

In [None]:
# Removing all instances which are not of type string
data = data[data['text'].apply(lambda x: isinstance(x, str))]
test = test[test['text'].apply(lambda x: isinstance(x, str))]
total_char_count = data['text'].str.len().sum()
total_char_count

In [None]:
def Text_Cleaning(Text):
  Text = Text.lower()
  punc = str.maketrans(string.punctuation, ' '*len(string.punctuation))
  Text = Text.translate(punc)
  Text = re.sub(r'\d+', '', Text)
  Text = re.sub('https?://\S+|www\.\S+', '', Text)
  Text = re.sub('\n', '', Text)
  return Text
Stopwords = set(nltk.corpus.stopwords.words("english")) - set(["not"])

def Text_Processing(Text):
    Processed_Text = list()
    Lemmatizer = WordNetLemmatizer()
    Tokens = nltk.word_tokenize(Text)

    for word in Tokens:
        if word not in Stopwords:
            Processed_Text.append(Lemmatizer.lemmatize(word))

    return " ".join(Processed_Text)

In [None]:
data["text"] = data["text"].apply(lambda Text: Text_Cleaning(Text))
test["text"] = test["text"].apply(lambda Text: Text_Cleaning(Text))
data["text"] = data["text"].apply(lambda Text: Text_Processing(Text))
test["text"] = test["text"].apply(lambda Text: Text_Processing(Text))
total_char_count = data['text'].str.len().sum()
total_char_count

In [None]:
stemmer = PorterStemmer()
data['text'] = data['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in nltk.word_tokenize(x)]))
test['text'] = test['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in nltk.word_tokenize(x)]))

In [None]:
data.to_csv('cleaned_train.csv', index=False)
test.to_csv('cleaned_test.csv', index=False)

Logistic Regression

In [5]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


df_train = pd.read_csv('cleaned_train.csv')
df_test = pd.read_csv('cleaned_test.csv')
df_train['text'] = df_train['text'].astype(str).fillna("")
df_test['text'] = df_test['text'].astype(str).fillna("")


# Encoding categories and sub-categories
category_encoder = LabelEncoder()
sub_category_encoder = LabelEncoder()

# Custom function for safe encoding
def safe_transform(encoder, data):
    # Get the unique classes seen by the encoder
    classes = set(encoder.classes_)

    # Replace unseen labels with -1
    return [encoder.transform([label])[0] if label in classes else -1 for label in data]

category_encoder.fit_transform(df_train['category'])

# Transform the train data
df_train['category_label'] = category_encoder.transform(df_train['category'])

# Transform the test data (with safe handling for unseen labels)
df_test['category_label'] = safe_transform(category_encoder, df_test['category'])




# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features as needed

# Fit on training text data and transform both train and test
X_train_tfidf = vectorizer.fit_transform(df_train['text'])
X_test_tfidf = vectorizer.transform(df_test['text'])



# Split the data into train and test sets
X_train, X_test, y_train_category, y_test_category = X_train_tfidf, X_test_tfidf, df_train['category_label'], df_test['category_label']


# Initialize Logistic Regression model
category_model = LogisticRegression(max_iter=1000)  # Increase `max_iter` if convergence issues occur

# Train the Logistic Regression model on the category labels
category_model.fit(X_train, y_train_category)

# Make predictions
y_pred_category = category_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test_category, y_pred_category)
print(f"Category Model Accuracy (Logistic Regression): {accuracy:.2f}")


prediction = category_model.predict(X_test_tfidf[0])
print(df_test['text'][0])
print(category_encoder.inverse_transform(prediction))


sub_category_models = {}

category_groups = df_train.groupby('category_label')

for category, group in category_groups:
    # Extract features and labels for this category
    X_category = X_train_tfidf[group.index]
    y_sub_category = group['sub_category']

    # Initialize and fit a LabelEncoder
    sub_category_encoder = LabelEncoder()
    y_sub_category_encoded = sub_category_encoder.fit_transform(y_sub_category)

    # Check if there's only one unique sub-category
    if len(set(y_sub_category_encoded)) == 1:
        # Save the constant prediction (always the single label)
        sub_category_models[category] = {
            "model": None,
            "encoder": sub_category_encoder,
            "constant_label": y_sub_category.iloc[0]
        }
        continue

    # Train a Logistic Regression model for sub-category
    sub_category_model = LogisticRegression(max_iter=1000)
    sub_category_model.fit(X_category, y_sub_category_encoded)

    # Save the trained model and encoder in the dictionary
    sub_category_models[category] = {
        "model": sub_category_model,
        "encoder": sub_category_encoder
    }


def predict_single_sample(X_sample):
    # Ensure the input sample is reshaped correctly for prediction
    if len(X_sample.shape) == 1:
        X_sample = X_sample.reshape(1, -1)

    # Step 1: Predict the category
    predicted_category = category_model.predict(X_sample)[0]

    # Step 2: Retrieve the sub-category model and encoder for the predicted category
    sub_category_data = sub_category_models.get(predicted_category)

    if sub_category_data is None:
        raise ValueError(f"No sub-category model found for category {predicted_category}.")

    sub_category_model = sub_category_data.get('model')  # Retrieve the model
    sub_category_encoder = sub_category_data.get('encoder')  # Retrieve the encoder

    # Step 3: Handle single sub-category case (model not trained)
    if sub_category_model is None:
        # Directly decode the only possible sub-category
        single_sub_category = sub_category_encoder.inverse_transform([0])[0]
        return predicted_category, single_sub_category

    # Step 4: Predict the sub-category using the trained model
    predicted_sub_category_encoded = sub_category_model.predict(X_sample)[0]
    predicted_sub_category = sub_category_encoder.inverse_transform([predicted_sub_category_encoded])[0]

    return predicted_category, predicted_sub_category


def evaluate_combined_model(X_test_tfidf, y_test_category, df_test):
    category_accuracy = 0
    combined_accuracy = 0

    # Convert X_test_tfidf to dense format for prediction
    X_test_dense = X_test_tfidf.toarray()
    total_samples = len(X_test_dense)
    # Track correct predictions
    correct_category_predictions = 0
    correct_combined_predictions = 0

    for i in range(total_samples):
        # Get the true category and sub-category for the current sample
        true_category = y_test_category.iloc[i]
        true_sub_category = df_test.iloc[i]['sub_category']

        # Step 1: Predict the category
        predicted_category = category_model.predict([X_test_dense[i]])[0]

        # Check if the category prediction is correct
        if predicted_category == true_category:
            correct_category_predictions += 1

            # Step 2: Predict the sub-category for the correct category
            sub_category_model_info = sub_category_models.get(true_category, None)
            if sub_category_model_info:
                sub_category_model = sub_category_model_info.get("model", None)
                if sub_category_model:
                    # Predict sub-category using the model
                    predicted_sub_category_encoded = sub_category_model.predict([X_test_dense[i]])[0]
                    predicted_sub_category = sub_category_model_info['encoder'].inverse_transform([predicted_sub_category_encoded])[0]
                else:
                    # No model (only one sub-category), use the constant label
                    predicted_sub_category = sub_category_model_info['constant_label']
            else:
                # No model (missing sub-category model), predict None or fallback
                predicted_sub_category = None

            # Check if sub-category prediction is correct
            if predicted_sub_category == true_sub_category:
                correct_combined_predictions += 1

    # Calculate accuracies
    category_accuracy = correct_category_predictions / total_samples
    combined_accuracy = correct_combined_predictions / total_samples

    return {
        "category_accuracy": category_accuracy,
        "combined_accuracy": combined_accuracy
    }



# Now, let's evaluate the model:
results = evaluate_combined_model(X_test_tfidf, y_test_category, df_test)

# Display the results
print(f"Category Accuracy: {results['category_accuracy']:.2f}")
print(f"Combined Accuracy: {results['combined_accuracy']:.2f}")

# Category Classification Report
# True labels and predictions for categories
y_true_category = df_test['category_label']
y_pred_category = category_model.predict(X_test_tfidf)

# Generate classification report for categories
category_report = classification_report(
    y_true_category,
    y_pred_category,
    target_names=category_encoder.classes_,
    zero_division=0
)
print("Category Classification Report:\n")
print(category_report)

# Sub-Category Classification Report
# Initialize variables to store true and predicted sub-categories
y_true_sub_category = []
y_pred_sub_category = []

# Convert X_test_tfidf to dense format for prediction
X_test_dense = X_test_tfidf.toarray()

# Iterate through each test sample
for i in range(len(X_test_dense)):
    true_category = y_true_category.iloc[i]
    true_sub_category = df_test.iloc[i]['sub_category']

    # Predict the category first
    predicted_category = category_model.predict([X_test_dense[i]])[0]

    # Retrieve the sub-category model and encoder for the predicted category
    sub_category_model_info = sub_category_models.get(predicted_category, None)
    if sub_category_model_info:
        sub_category_model = sub_category_model_info.get("model", None)
        sub_category_encoder = sub_category_model_info.get("encoder", None)

        if sub_category_model:
            # Predict sub-category using the model
            predicted_sub_category_encoded = sub_category_model.predict([X_test_dense[i]])[0]
            predicted_sub_category = sub_category_encoder.inverse_transform([predicted_sub_category_encoded])[0]
        else:
            # Use constant label if no model is available
            predicted_sub_category = sub_category_model_info['constant_label']
    else:
        predicted_sub_category = None

    # Append true and predicted sub-categories for the classification report
    y_true_sub_category.append(true_sub_category)
    y_pred_sub_category.append(predicted_sub_category)

# Generate classification report for sub-categories
sub_category_report = classification_report(
    y_true_sub_category,
    y_pred_sub_category,
    zero_division=0
)
print("\nSub-Category Classification Report:\n")
print(sub_category_report)

Category Model Accuracy (Logistic Regression): 0.76
sir namaskar mein ranjit kumar patrapais nehi tho sir kuch din pehel onlin loan aap credit pearl loan aap se money loan kiya thalekin sir loan bolk jub loan diy tho mein turant return kar diya thalekin din baad what app pe messag aya payment karomein bola diy aap mein wo de diyawo gali diy tho v return kar diyafir v messag kark bolt hai full payment karo half payment nehi chalegarap case mein daldeng etcfak illig se contact number v hack kar dete haibol rahehai sab ko messag kareng ye rapist hai bolk sirpl sir small ammount ke liy goggl play store se loan appli kiya thafak loan aap v hai socha nehi thapl sir request kar rahahun action lo sir mera number hai jo v proof chahiy dunga sir
['Online Financial Fraud']
Category Accuracy: 0.76
Combined Accuracy: 0.51
Category Classification Report:

                                                      precision    recall  f1-score   support

                               Any Other Cyber Crim

LSTM


In [10]:
import warnings
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
data = pd.read_csv('cleaned_train.csv')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
max_words = 10000  # Limit the number of unique words to 10,000
max_sequence_length = 100


# Preprocess text
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text
    else:
        return ""


data['text'] = data['text'].apply(preprocess_text)

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['text'])
X = tokenizer.texts_to_sequences(data['text'])
X = pad_sequences(X, maxlen=max_sequence_length)
# Label Encoding the categories (You can also encode subcategories in a similar way if needed)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['category'])
y = to_categorical(y)  # Convert labels to one-hot encoding
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Build the LSTM Model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))  # Output layer for multi-class classification
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")
# Save the model
model.save('lstm_model.h5')
import pandas as pd
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# 1. Load the Pre-Trained Model
model = load_model('lstm_model.h5')  # Load your pre-trained model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 2. Load the Dataset (cleaned_test.csv)
df = pd.read_csv("cleaned_test.csv")

# Ensure the 'text' column contains strings
df['text'] = df['text'].astype(str).fillna("")
X_test_data = df['text']

# Ensure the 'category' column has no NaN values
df['category'] = df['category'].fillna("Unknown")
y_test = df['category']

# 3. Tokenization and Padding
# Initialize the Tokenizer (same parameters as during training)
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_test_data)  # Use the same tokenizer as during training

# Tokenize and pad test data
X_test_sequences = tokenizer.texts_to_sequences(X_test_data)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, padding='post', truncating='post')

# 4. Predict Using the Pre-Trained Model
y_pred_prob = model.predict(X_test_padded)  # Get predicted probabilities

# Convert predicted probabilities to class labels
y_pred = np.argmax(y_pred_prob, axis=1)

# 5. Encode the True Labels (if LabelEncoder was used during training)
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)  # Encode the true labels
warnings.filterwarnings("ignore", category=UserWarning, module="absl")
# 6. Calculate the Evaluation Metrics
accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, average='weighted')
recall = recall_score(y_test_encoded, y_pred, average='weighted')
f1 = f1_score(y_test_encoded, y_pred, average='weighted')

# 7. Print the Metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pradeesh11/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pradeesh11/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/5
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 88ms/step - accuracy: 0.6885 - loss: 1.0531 - val_accuracy: 0.7430 - val_loss: 0.7732
Epoch 2/5
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 89ms/step - accuracy: 0.7478 - loss: 0.7593 - val_accuracy: 0.7466 - val_loss: 0.7431
Epoch 3/5
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 89ms/step - accuracy: 0.7631 - loss: 0.6993 - val_accuracy: 0.7481 - val_loss: 0.7259
Epoch 4/5
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 89ms/step - accuracy: 0.7793 - loss: 0.6368 - val_accuracy: 0.7523 - val_loss: 0.7259
Epoch 5/5
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 91ms/step - accuracy: 0.7961 - loss: 0.5901 - val_accuracy: 0.7504 - val_loss: 0.7555
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - accuracy: 0.7470 - loss: 0.7707
Test Accuracy: 75.04%




[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step
Accuracy: 0.0031
Precision: 0.0288
Recall: 0.0031
F1 Score: 0.0022


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Classifier

In [8]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

df = pd.read_csv("cleaned_train.csv")
df['text'] = df['text'].fillna('')

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the entire dataset
X_vec = vectorizer.fit_transform(df['text'])

# Remove classes with only one sample
counts = df['category'].value_counts()
low_count_classes = counts[counts <= 1].index
df_filtered = df[~df['category'].isin(low_count_classes)]

# Assuming 'df' is your DataFrame and 'category' is the column with categories
df = df_filtered['category'].value_counts()
df = df_filtered


df.to_csv("unique_categories.csv", index=False)


# Define your features and target
X = df['text']
y = df['category']  # Replace with actual category column

# Split the data while keeping the indices
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# After splitting, we will also need to transform the text data
vectorizer = TfidfVectorizer(max_features=8000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Prepare features and target
X = df['text']
y = df['category']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vectorized, y_train)

# Make predictions
y_pred = rf_model.predict(X_test_vectorized)



# Print accuracy score
print("Accuracy:", accuracy_score(y_test, y_pred))

# Create a DataFrame for the results
results_df = pd.DataFrame({
    'cleaned_text': X_test,  # Use original text from X_test
    'predicted_category': y_pred,
    'actual_category': y_test
})


# Save the results to a CSV file
results_df.to_csv('main_category_classification_results.csv', index=False)

# Save the Random Forest model
# joblib.dump(rf_model_model, 'random_forest_model.pkl')

# Save the TF-IDF vectorizer
# joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


# Load the saved Random Forest model
rf_model = joblib.load('random_forest_model.pkl')

# Load the saved TF-IDF vectorizer
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Print classification report
print("Main Category Classification Report:\n", classification_report(y_test, y_pred,zero_division=0))

Accuracy: 0.7525396644218697
Main Category Classification Report:
                                                       precision    recall  f1-score   support

                               Any Other Cyber Crime       0.66      0.09      0.16      2116
Child Pornography CPChild Sexual Abuse Material CSAM       0.87      0.21      0.33        63
                                Cryptocurrency Crime       0.89      0.07      0.13       113
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00       741
                                     Cyber Terrorism       0.00      0.00      0.00        33
      Hacking  Damage to computercomputer system etc       0.85      0.09      0.16       322
                            Online Cyber Trafficking       0.00      0.00      0.00        31
                              Online Financial Fraud       0.76      0.98      0.86     10440
                            Online Gambling  Betting       0.00      0.00      0.00   

XGBOOST

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm

# Step 1: Load the Dataset
data = pd.read_csv('cleaned_train.csv')

# Step 2: Preprocess the Data
data['text'] = data['text'].astype(str).fillna("")  # Ensure text data is clean

# Encode labels
label_encoder = LabelEncoder()
data['category_encoded'] = label_encoder.fit_transform(data['category'])

# Step 3: Split the Data
X = data['text']  # Input text
y = data['category_encoded']  # Encoded labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Convert Text to Features Using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 5: Train the XGBoost Model with Progress Bar
xgb_model = XGBClassifier(random_state=42, eval_metric='mlogloss', objective='multi:softmax', num_class=len(label_encoder.classes_))

# Wrap the training process with tqdm for the progress bar
with tqdm(total=100, desc="Training Progress", unit="step") as pbar:
    for _ in range(1):  # Dummy loop to simulate steps; XGBoost handles training internally
        xgb_model.fit(
            X_train_tfidf,
            y_train,
            eval_set=[(X_test_tfidf, y_test)],
            verbose=False
        )
        pbar.update(100)  # Update progress bar to 100% since training is one complete step

# Step 6: Predict on the Test Data
y_pred = xgb_model.predict(X_test_tfidf)
unique_classes = sorted(set(y_test))  # Ensure all classes in y_test are covered
class_names = label_encoder.inverse_transform(unique_classes)
# Step 7: Generate the Classification Report
report = classification_report(y_test, y_pred,
                               labels=unique_classes,
                               target_names=class_names,
                               zero_division=0)
print(report)


Training Progress: 100%|██████████| 100/100 [04:33<00:00,  2.73s/step]


                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.48      0.23      0.31      2064
Child Pornography CPChild Sexual Abuse Material CSAM       0.71      0.27      0.39        63
                                Cryptocurrency Crime       0.67      0.39      0.49       102
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00       715
                                     Cyber Terrorism       0.00      0.00      0.00        33
      Hacking  Damage to computercomputer system etc       0.49      0.33      0.40       349
                            Online Cyber Trafficking       0.00      0.00      0.00        29
                              Online Financial Fraud       0.80      0.95      0.87     10497
                            Online Gambling  Betting       0.50      0.05      0.09        85
               Online and Social Media Related Crime       

DistilBERT

In [None]:
import torch
from tqdm.auto import tqdm
import torch.nn as nn
import torch.optim as optim
from transformers import DistilBertTokenizer, DistilBertModel as DistilBert
import tensorflow as tf
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
import torch

from transformers import DistilBertForSequenceClassification
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from tqdm import tqdm
import torch
import torch_xla
import torch_xla.core.xla_model as xm
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import csv


import pandas as pd
data= pd.read_csv('/kaggle/input/data-file/cleaned_train.csv')
tokenizer=DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

encodings=tokenizer.batch_encode_plus(
    data['text'].astype(str).tolist(),
    padding=True,
    truncation=True,
    return_tensors='pt'
)

encodings.input_ids

import torch
from tqdm.auto import tqdm
import torch.nn as nn
import torch.optim as optim
from transformers import DistilBertTokenizer, 
from sklearn.preprocessing import LabelEncoder  


class TextDataset():
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(self.labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize the text
        encoding = self.tokenizer.encode_plus(
            self.texts[idx],
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        # Encode the label using label_encoder
        label = self.label_encoder.transform([self.labels[idx]])[0]

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
max_len = 128 

# Create Dataset instances
train_dataset = TextDataset(data['text'].tolist(), data['sub_category'].tolist(), tokenizer, max_len)

label_encoder = LabelEncoder()
label_encoder.fit(data['sub_category'])

train_dataset.label_encoder = label_encoder


# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Converting the pandas DataFrame to a Hugging Face Dataset
data['text'] = data['text'].astype(str)
train_dataset = Dataset.from_dict(data)

# Initializing the label encoder and fitting it to the 'sub_category' column
label_encoder = LabelEncoder()
label_encoder.fit(data['sub_category'])

# Label encoding function
def label_encode(examples):
    examples['labels'] = label_encoder.transform(examples['sub_category'])
    return examples

# Preprocessing function
def preprocess_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# Applying label encoding and preprocessing
train_dataset = train_dataset.map(label_encode, batched=True)
train_dataset = train_dataset.map(preprocess_function, batched=True)
train_dataset = train_dataset.remove_columns(['sub_category'])
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Load model
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_encoder.classes_)
)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# TPU device
device = xm.xla_device()
model.to(device)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    model.train()
    epoch_loss = 0

    # Wrap DataLoader for TPU parallelization
    para_loader = pl.ParallelLoader(train_dataloader, [device])
    train_device_loader = para_loader.per_device_loader(device)
    
    progress_bar = tqdm(train_device_loader, desc=f"Training Epoch {epoch + 1}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        xm.optimizer_step(optimizer) 
        
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_device_loader)}")
    
    # Save model after each epoch
    xm.save(model.state_dict(), f"model_epoch_{epoch + 1}.pth")

print("Training complete!")

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=39)

# Load the weights from the .pth file
model.load_state_dict(torch.load("/kaggle/input/distil/transformers/default/1/Model Epoch 2.pth", map_location="cpu"))

from transformers import DistilBertTokenizer

# Save the model
model.save_pretrained("model_dir")

# Save the tokenizer (use the same tokenizer as during training)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer.save_pretrained("model_dir")


# Set the TPU device
device = xm.xla_device()

# Load model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained("/kaggle/working/model_dir")
tokenizer = DistilBertTokenizer.from_pretrained("/kaggle/working/model_dir")

# Move model to TPU
model.to(device)

# Function to load the category map from CSV
def create_category_map(csv_file_path):
    category_map = {}
    with open(csv_file_path, mode='r', encoding='latin-1') as file:
        reader = csv.DictReader(file)
        for row in reader:
            category = row['category']
            sub_category = row['sub_category']
            category_map[sub_category] = category
    return category_map

# Function to retrieve the main category for a sub-category
def get_category_by_sub_category(sub_category, category_map):
    return category_map.get(sub_category, "Sub-category not found")

# Prediction function
def predict(text):
    # Tokenize and move inputs to the TPU device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    
    # Ensure the model is in evaluation mode
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probs = outputs.logits.softmax(dim=1)
        predicted_class = label_encoder.inverse_transform([probs.argmax().item()])
    return predicted_class

# Example: Predict and get category
text = """Identity theft   Fake Customer Care Service Fraud   Face Book
victim got call from suspect that he conntacted on facebook to victim whatsapp number while calling he catured his photo and make a nude vedio call and demanding money if not he will upload in his contacts so victim sent money   to suspect"""
prediction = predict(text)
print("Predicted sub-category:", prediction[0])

# Load the category map from your CSV file
category_map = create_category_map('/kaggle/input/data-file/train.csv')

# Find the main category for the predicted sub-category
sub_category_to_lookup = prediction[0]
category = get_category_by_sub_category(sub_category_to_lookup, category_map)
print(f"Category for sub-category '{sub_category_to_lookup}': {category}")
