In [32]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification

In [33]:
df = pd.read_csv(r'\Users\AIDL-LAB\Desktop\Dashan\dataset\data.csv', encoding='utf-8')
df

Unnamed: 0,category,sub_category,crimeaditionalinfo,new_category,cleaned_text
0,online and social media related crime,cyber bullying stalking sexting,I had continue received random calls and abusi...,Other Cyber Crime,continue received random calls abusive message...
1,online financial fraud,fraud callvishing,The above fraudster is continuously messaging ...,Financial Fraud Crimes,fraudster continuously messaging asking pay mo...
2,online gambling betting,online gambling betting,He is acting like a police and demanding for m...,Other Cyber Crime,acting police demanding money adding sections ...
3,online and social media related crime,online job fraud,In apna Job I have applied for job interview f...,Other Cyber Crime,job applied job interview telecalling resource...
4,online financial fraud,fraud callvishing,I received a call from lady stating that she w...,Financial Fraud Crimes,received call lady stating send new phone vivo...
...,...,...,...,...,...
114942,online and social media related crime,online matrimonial fraud,A lady named Rashmi probably a fake name had c...,Other Cyber Crime,lady named rashmi probably fake name called da...
114943,online financial fraud,internet banking related fraud,I am Mr Chokhe Ram Two pers mobile number wer...,Financial Fraud Crimes,mr chokhe ram two pers mobile number found goo...
114944,any other cyber crime,other,Mai Bibekbraj maine pahle ki complain kar chuk...,Other Cyber Crime,mai bibekbraj maine pahle ki complain kar chuk...
114945,online financial fraud,internet banking related fraud,received URL link for updating KYC from mobile...,Financial Fraud Crimes,received url link updating kyc mobile opening ...


In [34]:
df.groupby('new_category').size()

new_category
Financial Fraud Crimes       70114
Other Cyber Crime            39592
Women/Child Related Crime     5241
dtype: int64

In [35]:
df = df.dropna(subset=['crimeaditionalinfo'])
df = df.dropna(subset=['cleaned_text'])

In [36]:
df.groupby('new_category').size()

new_category
Financial Fraud Crimes       69472
Other Cyber Crime            39052
Women/Child Related Crime     5175
dtype: int64

In [37]:
df = df.drop_duplicates()

In [38]:
df.groupby('new_category').size()

new_category
Financial Fraud Crimes       68736
Other Cyber Crime            39020
Women/Child Related Crime     5072
dtype: int64

In [39]:
df['crimeaditionalinfo'].isna().sum()

0

In [40]:
df.groupby('new_category').size()

new_category
Financial Fraud Crimes       68736
Other Cyber Crime            39020
Women/Child Related Crime     5072
dtype: int64

In [41]:
from functions import resample_data
df = resample_data(df,48000,48000) 

In [42]:
df.groupby('new_category').size()

new_category
Financial Fraud Crimes       48000
Other Cyber Crime            48000
Women/Child Related Crime    48000
dtype: int64

In [43]:
df.loc[:, 'label'] = df['new_category'].astype('category').cat.codes
df.groupby('new_category').describe()

Unnamed: 0_level_0,label,label,label,label,label,label,label,label
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
new_category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Financial Fraud Crimes,48000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Other Cyber Crime,48000.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Women/Child Related Crime,48000.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0


In [44]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['label'].unique()))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
MAX_LEN = 256
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 3

In [46]:
texts = df['cleaned_text'].tolist() 
labels = df['label'].tolist()

In [47]:
from classes import TextDataset
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.optim import AdamW as TorchAdamM
from transformers import get_linear_schedule_with_warmup
import torch
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.25, random_state=42
)
train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=BATCH_SIZE)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps)

In [48]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [49]:
from methods import evaluate_model
from functions import train_epoch

best_accuracy = 0

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
 
    train_metrics = train_epoch(model, train_loader, optimizer, scheduler, device)
    train_loss = train_metrics['loss']
    train_accuracy = train_metrics['accuracy']
   
    val_metrics = evaluate_model(model, val_loader, device)
    val_loss = val_metrics['loss']
    val_accuracy = val_metrics['accuracy']
    val_f1 = val_metrics['f1_score']
    val_precision = val_metrics['precision']
    val_recall = val_metrics['recall']

    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1 Score: {val_f1:.4f}, Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}")
    
    if val_accuracy > best_accuracy:
        torch.save(model.state_dict(),'best1_category.bin')
        best_accuracy = val_accuracy

print("Training complete.")


Epoch 1/3


Training: 100%|██████████| 6750/6750 [59:17<00:00,  1.90it/s] 
Evaluating: 100%|██████████| 2250/2250 [07:34<00:00,  4.95it/s]


Evaluation Metrics:
Loss: 0.3866
Accuracy: 0.8350
Precision: 0.8342
Recall: 0.8350
F1 Score: 0.8345
Train Loss: 0.5662, Train Accuracy: 0.7483
Val Loss: 0.3866, Val Accuracy: 0.8350, Val F1 Score: 0.8345, Val Precision: 0.8342, Val Recall: 0.8350

Epoch 2/3


Training: 100%|██████████| 6750/6750 [1:05:28<00:00,  1.72it/s]   
Evaluating: 100%|██████████| 2250/2250 [07:34<00:00,  4.95it/s]


Evaluation Metrics:
Loss: 0.2969
Accuracy: 0.8846
Precision: 0.8844
Recall: 0.8846
F1 Score: 0.8841
Train Loss: 0.3151, Train Accuracy: 0.8691
Val Loss: 0.2969, Val Accuracy: 0.8846, Val F1 Score: 0.8841, Val Precision: 0.8844, Val Recall: 0.8846

Epoch 3/3


Training: 100%|██████████| 6750/6750 [57:57<00:00,  1.94it/s]
Evaluating: 100%|██████████| 2250/2250 [07:34<00:00,  4.95it/s]


Evaluation Metrics:
Loss: 0.3227
Accuracy: 0.8943
Precision: 0.8931
Recall: 0.8943
F1 Score: 0.8934
Train Loss: 0.2032, Train Accuracy: 0.9200
Val Loss: 0.3227, Val Accuracy: 0.8943, Val F1 Score: 0.8934, Val Precision: 0.8931, Val Recall: 0.8943
Training complete.


In [None]:
from huggingface_hub import login
login(token = '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')

In [101]:
model.push_to_hub('Darshankochar022/cyberguard_BERT_Category1')

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Darshankochar022/cyberguard_BERT_Category1/commit/ae8efe6870ab5778ddb49b98b40ffa90d896aa46', commit_message='Upload BertForSequenceClassification', commit_description='', oid='ae8efe6870ab5778ddb49b98b40ffa90d896aa46', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Darshankochar022/cyberguard_BERT_Category1', endpoint='https://huggingface.co', repo_type='model', repo_id='Darshankochar022/cyberguard_BERT_Category1'), pr_revision=None, pr_num=None)