In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
train_df = pd.read_csv(r'\Users\Greesma\Desktop\NLP\train.csv', encoding='ISO-8859-1', low_memory=False)
test_df = pd.read_csv(r'\Users\Greesma\Desktop\NLP\test.csv')

In [7]:
train_df.head(5)

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [8]:
X_train = train_df[['category', 'crimeaditionalinfo']]
y_train = train_df['sub_category']

X_test = test_df[['category', 'crimeaditionalinfo']]
y_test = test_df['sub_category']

In [9]:
y_train.unique()

array(['Cyber Bullying  Stalking  Sexting', 'Fraud CallVishing',
       'Online Gambling  Betting', 'Online Job Fraud',
       'UPI Related Frauds', 'Internet Banking Related Fraud', nan,
       'Other', 'Profile Hacking Identity Theft',
       'DebitCredit Card FraudSim Swap Fraud', 'EWallet Related Fraud',
       'Data Breach/Theft', 'Cheating by Impersonation',
       'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks',
       'FakeImpersonating Profile', 'Cryptocurrency Fraud',
       'Malware Attack', 'Business Email CompromiseEmail Takeover',
       'Email Hacking', 'Hacking/Defacement',
       'Unauthorised AccessData Breach', 'SQL Injection',
       'Provocative Speech for unlawful acts', 'Ransomware Attack',
       'Cyber Terrorism', 'Tampering with computer source documents',
       'DematDepository Fraud', 'Online Trafficking',
       'Online Matrimonial Fraud', 'Website DefacementHacking',
       'Damage to computer computer systems etc', 'Impersonating E

In [10]:
def connotation_to_int(string):
    dictionary = {'Cyber Bullying  Stalking  Sexting': 0, 'Fraud CallVishing': 1,
       'Online Gambling  Betting': 2, 'Online Job Fraud': 3,
       'UPI Related Frauds': 4, 'Internet Banking Related Fraud': 5, None: 6,
       'Other': 7, 'Profile Hacking Identity Theft': 8,
       'DebitCredit Card FraudSim Swap Fraud': 9, 'EWallet Related Fraud': 10,
       'Data Breach/Theft': 11, 'Cheating by Impersonation': 12,
       'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks': 13,
       'FakeImpersonating Profile': 14, 'Cryptocurrency Fraud': 15,
       'Malware Attack': 16, 'Business Email CompromiseEmail Takeover': 17,
       'Email Hacking': 18, 'Hacking/Defacement': 19,
       'Unauthorised AccessData Breach': 20, 'SQL Injection': 21,
       'Provocative Speech for unlawful acts': 22, 'Ransomware Attack': 23,
       'Cyber Terrorism': 24, 'Tampering with computer source documents': 25,
       'DematDepository Fraud': 26, 'Online Trafficking': 27,
       'Online Matrimonial Fraud': 28, 'Website DefacementHacking': 29,
       'Damage to computer computer systems etc': 30, 'Impersonating Email': 31,
       'EMail Phishing': 32, 'Ransomware': 33, 'Intimidating Email': 34,
       'Against Interest of sovereignty or integrity of India': 35               
                 }
    
    return dictionary.get(string,-1)


In [11]:
y_train = y_train.apply(lambda y: connotation_to_int(y))
y_test = y_test.apply(lambda y: connotation_to_int(y))

In [12]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, classification_report

# Define the model pipeline
model = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 10), min_df=5, max_df=0.70)),
    ('tfidf', TfidfTransformer(norm='l1')),
    ('clf', CalibratedClassifierCV(estimator=SGDClassifier(penalty='elasticnet', alpha=0.001, max_iter=500, l1_ratio=0.1, random_state=45, class_weight="balanced"), 
        method='isotonic'
    )),
])

# Handle NaN values in 'crime_additional_info' by filling with empty strings
X_train.loc[:, 'crimeaditionalinfo'] = X_train['crimeaditionalinfo'].fillna('')
X_test.loc[:, 'crimeaditionalinfo'] = X_test['crimeaditionalinfo'].fillna('')

# Fit the model
model.fit(X_train['crimeaditionalinfo'], y_train)

# Predictions
train_y_pred = model.predict(X_train['crimeaditionalinfo'])
test_y_pred = model.predict(X_test['crimeaditionalinfo'])

# Accuracy and classification reports
print('Train accuracy: %s' % accuracy_score(y_train, train_y_pred))
print('Test accuracy: %s' % accuracy_score(y_test, test_y_pred))

print('\nTrain Report\n')
print(classification_report(y_train, train_y_pred))

print('\nTest Report\n')
print(classification_report(y_test, test_y_pred))




Train accuracy: 0.3063531370749098
Test accuracy: 0.30471676966921774

Train Report

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      6591
           0       0.45      0.48      0.46      4089
           1       0.00      0.00      0.00      5803
           2       0.00      0.00      0.00       444
           3       0.00      0.00      0.00       912
           4       0.30      0.99      0.46     26856
           5       0.00      0.00      0.00      8872
           7       0.00      0.00      0.00     10878
           8       0.00      0.00      0.00      2073
           9       0.00      0.00      0.00     10805
          10       0.00      0.00      0.00      4047
          11       0.00      0.00      0.00       484
          12       0.00      0.00      0.00      1988
          13       0.00      0.00      0.00       504
          14       0.00      0.00      0.00      2299
          15       0.00      0.00      0.00       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
