### NLP Model for NCRP 


#### Basic data exploration

In [1]:
import pandas as pd

# Load the training data from 'train.csv' (path is relative to the working directory).
df = pd.read_csv('train.csv')

# Last expression of the cell, so the first rows are rendered as the cell output.
df.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [2]:
# (row count, distinct categories, distinct sub-categories).
# dropna=False keeps a missing value counted as one distinct entry, as unique() does.
df.shape[0], df['category'].nunique(dropna=False), df['sub_category'].nunique(dropna=False)

(93686, 15, 36)

In [3]:
# Treat empty / whitespace-only strings as missing, then count all missing cells.
category_blank_as_na = df['category'].replace(r"^\s*$", pd.NA, regex=True)
empty_cells_count = category_blank_as_na.isna().sum()
print(f"Total number of empty cells in 'category': {empty_cells_count}")

Total number of empty cells in 'category': 0


In [4]:
# Treat empty / whitespace-only strings as missing, then count all missing cells.
sub_category_blank_as_na = df['sub_category'].replace(r"^\s*$", pd.NA, regex=True)
empty_cells_count = sub_category_blank_as_na.isna().sum()
print(f"Total number of empty cells in 'sub_category': {empty_cells_count}")

Total number of empty cells in 'sub_category': 6591


In [5]:
# Number of rows per `category` value (value_counts sorts by count, largest first).
df['category'].value_counts()

category
Online Financial Fraud                                  57434
Online and Social Media Related Crime                   12140
Any Other Cyber Crime                                   10878
Cyber Attack/ Dependent Crimes                           3608
RapeGang Rape RGRSexually Abusive Content                2822
Sexually Obscene material                                1838
Hacking  Damage to computercomputer system etc           1710
Sexually Explicit Act                                    1552
Cryptocurrency Crime                                      480
Online Gambling  Betting                                  444
Child Pornography CPChild Sexual Abuse Material CSAM      379
Online Cyber Trafficking                                  183
Cyber Terrorism                                           161
Ransomware                                                 56
Report Unlawful Content                                     1
Name: count, dtype: int64

In [6]:
# Number of rows per `sub_category` value; missing values are excluded by default.
df['sub_category'].value_counts()

sub_category
UPI Related Frauds                                                      26856
Other                                                                   10878
DebitCredit Card FraudSim Swap Fraud                                    10805
Internet Banking Related Fraud                                           8872
Fraud CallVishing                                                        5803
Cyber Bullying  Stalking  Sexting                                        4089
EWallet Related Fraud                                                    4047
FakeImpersonating Profile                                                2299
Profile Hacking Identity Theft                                           2073
Cheating by Impersonation                                                1988
Unauthorised AccessData Breach                                           1114
Online Job Fraud                                                          912
DematDepository Fraud                              

In [7]:
# Map every category to the list of distinct sub-category values seen with it
# (a missing sub-category shows up as nan in the list).
category_dict = {
    category: list(sub_categories)
    for category, sub_categories in df.groupby('category')['sub_category'].unique().items()
}
category_dict

{'Any Other Cyber Crime': ['Other'],
 'Child Pornography CPChild Sexual Abuse Material CSAM': [nan],
 'Cryptocurrency Crime': ['Cryptocurrency Fraud'],
 'Cyber Attack/ Dependent Crimes': ['Data Breach/Theft',
  'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks',
  'Malware Attack',
  'Hacking/Defacement',
  'SQL Injection',
  'Ransomware Attack',
  'Tampering with computer source documents'],
 'Cyber Terrorism': ['Cyber Terrorism'],
 'Hacking  Damage to computercomputer system etc': ['Email Hacking',
  'Unauthorised AccessData Breach',
  'Website DefacementHacking',
  'Damage to computer computer systems etc',
  'Tampering with computer source documents'],
 'Online Cyber Trafficking': ['Online Trafficking'],
 'Online Financial Fraud': ['Fraud CallVishing',
  'UPI Related Frauds',
  'Internet Banking Related Fraud',
  'DebitCredit Card FraudSim Swap Fraud',
  'EWallet Related Fraud',
  'Business Email CompromiseEmail Takeover',
  'DematDepository Fraud'],
 'Online Ga

#### Data cleaning

In [8]:
# Replace missing values in the label and text columns with the literal string 'nan'.
for column in ('category', 'sub_category', 'crimeaditionalinfo'):
    df[column] = df[column].fillna('nan')

In [9]:
# Per-column count of missing values still present in the frame.
df.isnull().sum()

category              0
sub_category          0
crimeaditionalinfo    0
dtype: int64

In [10]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Set of English stopwords from NLTK.
# NOTE(review): clean_text() in this cell does not reference stop_words, so no
# stopword removal is applied here — confirm whether it is used elsewhere.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a free-text description before tokenisation.

    The text is lowercased, HTML-like tags and punctuation are replaced by a
    single space (replacing rather than deleting keeps neighbouring words
    apart, e.g. "money.Please" -> "money please"), and runs of whitespace are
    collapsed. No stopword removal is performed.

    Args:
        text: The raw text. ``None`` and float NaN are treated as "no text";
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string; ``''`` when there is nothing left to keep.
    """
    # Missing values have no text to clean (NaN is the only float != itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # 1. Lowercase
    text = text.lower()

    # 2. Replace HTML tags with a space so the words around a tag do not fuse
    text = re.sub(r'<.*?>', ' ', text)

    # 3. Replace punctuation with a space (underscore counts as \w and is kept)
    text = re.sub(r'[^\w\s]', ' ', text)

    # 4. Collapse runs of whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text


# Clean every description element-wise, overwriting the raw text column.
df['crimeaditionalinfo'] = df['crimeaditionalinfo'].map(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vigne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from transformers import BertTokenizer

# Load the mBERT tokenizer
# NOTE(review): clean_text() lowercases the text, but this is the *cased*
# checkpoint — confirm that the combination is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

max_length = 284  # 95th percentile length


def _pad_or_truncate(ids, length, pad_id, sep_id):
    """Return `ids` as a new list of exactly `length` token IDs.

    Sequences shorter than `length` are right-padded with `pad_id`.
    Sequences longer than `length` are cut, and the last kept slot is set to
    `sep_id` so a truncated sequence still ends with the [SEP] token instead
    of silently losing it.
    """
    if len(ids) > length:
        return ids[:length - 1] + [sep_id]
    return ids + [pad_id] * (length - len(ids))


# Tokenize the text and add special tokens (taken from the tokenizer rather
# than hard-coded so they always match the loaded vocabulary)
df['tokens'] = df['crimeaditionalinfo'].apply(
    lambda text: [tokenizer.cls_token] + tokenizer.tokenize(text) + [tokenizer.sep_token]
)

# Convert tokens to input IDs
df['input_ids'] = df['tokens'].apply(tokenizer.convert_tokens_to_ids)

# Padding input_ids: truncate longer sequences (keeping [SEP]) and pad shorter ones
df['padded_input_ids'] = df['input_ids'].apply(
    lambda ids: _pad_or_truncate(ids, max_length, tokenizer.pad_token_id, tokenizer.sep_token_id)
)

# Create attention masks: 1 for real tokens and 0 for padding.
# Built from the real sequence length instead of testing `id != 0`, so the
# mask does not depend on which ID the padding token happens to have.
df['attention_mask'] = df['input_ids'].apply(
    lambda ids: [1] * min(len(ids), max_length) + [0] * max(max_length - len(ids), 0)
)

In [17]:
# Check padded input IDs and attention mask.
# Only the last expression of a cell is rendered, so the checks that matter are
# written as assertions over the whole column instead of a discarded head().
padded_lengths = df['padded_input_ids'].map(len)
mask_lengths = df['attention_mask'].map(len)
assert padded_lengths.eq(max_length).all(), "padded_input_ids must all have length max_length"
assert mask_lengths.eq(max_length).all(), "attention_mask must all have length max_length"

# Positional access (.iloc) so this works regardless of the index labels.
len(df['padded_input_ids'].iloc[0]), len(df['attention_mask'].iloc[0])

(284, 284)

In [18]:
from sklearn.preprocessing import LabelEncoder

# One encoder per label column; each is fitted first and kept so the integer
# labels can later be mapped back to their original strings.
category_encoder = LabelEncoder().fit(df['category'])
sub_category_encoder = LabelEncoder().fit(df['sub_category'])

# Integer-encoded targets stored next to the original string labels.
df['category_label'] = category_encoder.transform(df['category'])
df['sub_category_label'] = sub_category_encoder.transform(df['sub_category'])