In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

import re
import datetime as dt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import f1_score, precision_score, recall_score, hamming_loss
from sklearn.metrics import multilabel_confusion_matrix,classification_report, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV

In [2]:
train_File = pd.read_csv('train.csv')
test_File = pd.read_csv('test.csv')

In [3]:
train_File.head()

Unnamed: 0,Description,Commenting,Ogling/Facial Expressions/Staring,Touching /Groping
0,"Was walking along crowded street, holding mums...",0,0,1
1,This incident took place in the evening.I was ...,0,1,0
2,I WAS WAITING FOR THE BUS. A MAN CAME ON A BIK...,1,0,0
3,Incident happened inside the train,0,0,0
4,I witnessed an incident when a chain was bruta...,0,0,0


In [4]:
test_File.head()

Unnamed: 0,Description,Commenting,Ogling/Facial Expressions/Staring,Touching /Groping
0,"During morning, a woman was walking by and thi...",1,1,0
1,A man tried to brush his penis off of a woman'...,0,0,1
2,This happened to a fellow passenger of mine tr...,0,1,0
3,ogling,0,1,0
4,When I was returning my home after finishing m...,0,0,1


In [5]:
train_File.shape

(7201, 4)

In [6]:

def text_splitter(text):

    return text.split()



tfidf = TfidfVectorizer(
                        min_df=0.00009,
                        max_features=2000,
                        smooth_idf=True,
                        norm="l2",
                        tokenizer = text_splitter,
                        sublinear_tf=False,
                        ngram_range=(1,3)
                       )

tfidf_train = tfidf.fit_transform(train_File.Description.apply(lambda x: np.str_(x)))
tfidf_test = tfidf.transform(test_File.Description)
print("Train and Test shape : ",tfidf_train.shape, tfidf_test.shape)



Train and Test shape :  (7201, 2000) (1701, 2000)


In [7]:
train_label = train_File[['Commenting','Ogling/Facial Expressions/Staring','Touching /Groping']]
test_label = test_File[['Commenting','Ogling/Facial Expressions/Staring','Touching /Groping']]

In [8]:
train_label.head()

Unnamed: 0,Commenting,Ogling/Facial Expressions/Staring,Touching /Groping
0,0,0,1
1,0,1,0
2,1,0,0
3,0,0,0
4,0,0,0


In [9]:
test_label.head()

Unnamed: 0,Commenting,Ogling/Facial Expressions/Staring,Touching /Groping
0,1,1,0
1,0,0,1
2,0,1,0
3,0,1,0
4,0,0,1


In [10]:
set(np.where(test_label.iloc[0])[0])

{0, 1}

In [11]:


def hamming_score(y_true, y_pred):

    acc_list = []


    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    for i in range(y_true.shape[0]):
        set_true = set(np.where(y_true[i])[0])
        set_pred = set(np.where(y_pred[i])[0])

        temp_a = None

        if len(set_true)==0 and len(set_pred) ==0:

            temp_a =1
        else:
            temp_a = len(set_true.intersection(set_pred))/float(len(set_true.union(set_pred)))
        acc_list.append(temp_a)
    return np.mean(acc_list)

In [12]:
x_train=tfidf_train
y_train=train_label

In [13]:
x_test=tfidf_test
y_test=test_label

In [14]:
from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier()
model.fit(x_train,y_train)
y_pred=model.predict(x_test)

In [15]:
hamming_score(y_test,y_pred)

0.5742700372330002

In [16]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()
model.fit(x_train,y_train)
y_pred=model.predict(x_test)

In [17]:
hamming_score(y_test,y_pred)

0.6289437585733881

In [18]:
y_pred

array([[1, 0, 0],
       [0, 0, 1],
       [0, 0, 0],
       ...,
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 0]])

In [19]:
!pip install shap



In [21]:
!pip install shap



In [22]:
import joblib
joblib.dump(model, 'trained_model.joblib')

['trained_model.joblib']

In [None]:
# explainer = shap.Explainer(model)
# shap_values = explainer(X_test)

# # Visualize the explanations
# shap.summary_plot(shap_values, X_test, feature_names=tfidf_vectorizer.get_feature_names_out())

# # Evaluate XAI model using Identity, Separability, Similarity, Stability metrics
# # Implement evaluation functions for these metrics

# # Example function for Identity
# # def calculate_identity(shap_values, y_pred):
# #     # Implement identity calculation
# #     pass


# # # Calculate Identity metric
# # identity_score = calculate_identity(shap_values, y_pred)
# # print("Identity Score:", identity_score)

In [None]:
# import shap

# # Assuming you have already trained your multi-label classification model
# # and stored it in the variable 'model'

# # Create a SHAP explainer object
# explainer = shap.Explainer(model, x_train)

# # Compute SHAP values for a sample of data points (e.g., x_test)
# shap_values = explainer.shap_values(x_test)

# # SHAP values will have shape (num_samples, num_features, num_classes)
# # If you're interested in the SHAP values for a specific class label, you can access it like this:
# # shap_values_class_0 = shap_values[0]  # SHAP values for class label 0
# # shap_values_class_1 = shap_values[1]  # SHAP values for class label 1
# # shap_values_class_2 = shap_values[2]  # SHAP values for class label 2
# # ...

# # You can then use these SHAP values to create visualizations or interpret your model's predictions
