In [3]:
# Importing stuff

import pandas as pd  # For handling data in DataFrames
import numpy as np  # For numerical operations

# Importing TensorFlow and Keras for building and training neural networks
import tensorflow as tf
from tensorflow.keras.models import Sequential  # Sequential model for stacking layers
from keras.layers import (  # Importing various layers for the model
    Dense, 
    Dropout, 
    Flatten, 
    Conv1D, 
    MaxPooling1D, 
)

# Importing libraries for Natural Language Processing (NLP)
import nltk  # Natural Language Toolkit for text processing
from nltk.corpus import stopwords  # To filter out common words
from string import punctuation  # To handle punctuation marks
from nltk.stem import WordNetLemmatizer  # For reducing words to their base forms
from nltk.tokenize import word_tokenize  # For splitting text into words

# Importing TfidfVectorizer for converting text data into numerical format
from sklearn.feature_extraction.text import TfidfVectorizer

# Importing processing utilities for model training
from sklearn.model_selection import train_test_split  # For splitting the dataset into training and testing sets
from keras.utils import to_categorical  # For converting labels to a categorical format
from sklearn.preprocessing import LabelEncoder  # For encoding categorical labels into integers


In [33]:
df = pd.read_csv("processedData.csv")

In [5]:
# Preprocessing the comments. Lemmatizing, removing stopwords, etc.s

stopwords_list = stopwords.words('english')

def tokenization(message):
    return word_tokenize(message)

lemmatizer = WordNetLemmatizer()
def lemmatize(message):
    return [lemmatizer.lemmatize(word) for word in message]

def remove_stopwords(message):
  res = []
  for word in message:
    if word not in stopwords_list:
      res.append(word)

  return res

def remove_stopwords(message):
    return [word for word in message if word not in stopwords_list]

def processing(message):
  message = message.lower()
  message = tokenization(message)
  message = remove_stopwords(message)
  message = lemmatize(message)
  cleaned_msg = ' '.join(message)
  return cleaned_msg

df['comments'] = df['comments'].apply(processing)

In [6]:
# Vectorizing the comments.

vectorizer = TfidfVectorizer(ngram_range=(1,4),max_features=64)
tfidf_X = vectorizer.fit_transform(df['comments'])

y = df['Betrayal']

In [7]:
# Building the model. It is a 1-D CNN.

model_CNN = Sequential()
model_CNN.add(Conv1D(filters=32, kernel_size=3, activation='leaky_relu', input_shape=(tfidf_X.shape[1],1)))
model_CNN.add(MaxPooling1D(pool_size=3))
model_CNN.add(Flatten())
model_CNN.add(Dense(units=32, activation = 'leaky_relu'))
model_CNN.add(Dense(units = 16, activation = 'leaky_relu'))
model_CNN.add(Dropout(0.2))
model_CNN.add(Dense(units=1, activation='softmax'))

model_CNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_CNN.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [8]:
# Training
print(tfidf_X.shape, y.shape)
model_CNN.fit(tfidf_X, y, epochs=100)

(84, 64) (84,)
Epoch 1/100




[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.2617 - loss: 0.6945
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2656 - loss: 0.6795  
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2578 - loss: 0.6644  
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2539 - loss: 0.6492  
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2578 - loss: 0.6230  
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.2656 - loss: 0.6216  
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2461 - loss: 0.5916  
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2500 - loss: 0.5751 
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x21f936b76b0>

In [28]:
# Testing

y_pred_msg = model_CNN.predict(tfidf_X)

y_pred_msg = np.log(y_pred_msg)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step  


In [10]:
df

Unnamed: 0.1,Unnamed: 0,Age,Gender,Clan,family_mental_history,Support Received,Mission Interference,Clan Size,knowledge_about_facilities,seeken_help,frequency_of_leaves,job_physical_consequence,job_mental_consequence,comrade_relationships,superior_relationship,physical_assessment,mental_assessment,comments,Betrayal
0,24,33,1.0,Zyphorians,1,1,2,63.0,1,2,0,1,1,2,2,1,2,relatively new job . ask later,0
1,25,35,1.0,Zyphorians,1,1,3,1200.0,2,0,4,2,1,1,2,1,2,sometimes think using drug mental health issue...,1
2,33,42,1.0,Zyphorians,1,1,3,63.0,2,2,4,0,1,1,1,0,2,selected current employer based policy self ca...,1
3,45,38,0.0,Zyphorians,1,1,3,63.0,2,2,2,1,1,1,2,1,1,health plan covered psychotherapy antidepressa...,0
4,49,30,1.0,Zyphorians,1,1,2,63.0,0,0,0,0,1,1,2,1,1,started new job last week hence lot n't know 's,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,1186,56,0.0,Zyphorians,0,0,2,3.0,1,0,0,0,0,0,0,1,1,'m self-employed contract small start-up . cov...,0
82,1190,44,1.0,Zyphorians,1,1,3,300.0,2,2,0,0,1,1,1,1,1,mental health issue direct result trauma child...,0
83,1219,38,1.0,Zyphorians,0,1,3,1200.0,0,2,4,1,1,2,2,1,1,openly discus mental health struggle . found e...,0
84,1221,46,1.0,Zyphorians,1,1,3,300.0,1,0,0,1,1,1,2,2,2,starting new job hence numerous n't know selec...,0


# Gaussian Classification

Here, we generate the probability arrays for various different features, and at the end, we apply Bayes' theorem.

In [11]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

In [12]:
# Age probability array

gnb_age = GaussianNB()

X_age = np.array(df['Age']).reshape(-1, 1)

gnb_age.fit(X_age, np.array(y))
y_pred_age = gnb_age.predict_log_proba(X_age)

In [13]:
# Gender probability array

mnb_gender = MultinomialNB()

X_gender = np.array(df['Gender']).reshape(-1, 1)
mnb_gender.fit(X_gender, np.array(y))
y_pred_gender = gnb_age.predict_log_proba(X_gender)

In [14]:
# family_mental_history probability array

mnb_fmh = MultinomialNB()

X_fmh = np.array(df['family_mental_history']).reshape(-1, 1)
mnb_fmh.fit(X_fmh, np.array(y))
y_pred_fmh = gnb_age.predict_log_proba(X_fmh)

In [15]:
# Support Received probability array

mnb_sr = MultinomialNB()

X_sr = np.array(df['Support Received']).reshape(-1,1)
mnb_sr.fit(X_sr,np.array(y))
y_pred_sr = mnb_sr.predict_log_proba(X_sr)

In [16]:
# Mission Interference probability array

gnb_mi = GaussianNB()

X_mi = np.array(df['Mission Interference']).reshape(-1,1)
gnb_mi.fit(X_mi, np.array(y))
y_pred_mi = gnb_mi.predict_log_proba(X_mi)

In [17]:
# Knowledge About Facilities
gnb_knowledge = GaussianNB()
X_knowledge = np.array(df['knowledge_about_facilities']).reshape(-1, 1)
gnb_knowledge.fit(X_knowledge, np.array(y))
y_pred_knowledge = gnb_knowledge.predict_log_proba(X_knowledge)

In [18]:
# Sought Help
gnb_help = GaussianNB()
X_help = np.array(df['seeken_help']).reshape(-1, 1)
gnb_help.fit(X_help, np.array(y))
y_pred_help = gnb_help.predict_log_proba(X_help)

In [19]:
# Frequency of Leaves
gnb_leaves = GaussianNB()
X_leaves = np.array(df['frequency_of_leaves']).reshape(-1, 1)
gnb_leaves.fit(X_leaves, np.array(y))
y_pred_leaves = gnb_leaves.predict_log_proba(X_leaves)


In [22]:
# Job Physical Consequence
gnb_physical_job = GaussianNB()
X_physical_job = np.array(df['job_physical_consequence']).reshape(-1, 1)
gnb_physical_job.fit(X_physical_job, np.array(y))
y_pred_physical_job = gnb_physical_job.predict_log_proba(X_physical_job)


In [23]:
# Job Mental Consequence
gnb_mental_job = GaussianNB()
X_mental_job = np.array(df['job_mental_consequence']).reshape(-1, 1)
gnb_mental_job.fit(X_mental_job, np.array(y))
y_pred_mental_job = gnb_mental_job.predict_log_proba(X_mental_job)

In [24]:
# Comrade Relationships
gnb_comrade = GaussianNB()
X_comrade = np.array(df['comrade_relationships']).reshape(-1, 1)
gnb_comrade.fit(X_comrade, np.array(y))
y_pred_comrade = gnb_comrade.predict_log_proba(X_comrade)

In [25]:
# Superior Relationships
gnb_superior = GaussianNB()
X_superior = np.array(df['superior_relationship']).reshape(-1, 1)
gnb_superior.fit(X_superior, np.array(y))
y_pred_superior = gnb_superior.predict_log_proba(X_superior)

In [26]:
# Physical Assessment
gnb_physical_assess = GaussianNB()
X_physical_assess = np.array(df['physical_assessment']).reshape(-1, 1)
gnb_physical_assess.fit(X_physical_assess, np.array(y))
y_pred_physical_assess = gnb_physical_assess.predict_log_proba(X_physical_assess)

In [27]:
# Mental Assessment
gnb_mental_assess = GaussianNB()
X_mental_assess = np.array(df['mental_assessment']).reshape(-1, 1)
gnb_mental_assess.fit(X_mental_assess, np.array(y))
y_pred_mental_assess = gnb_mental_assess.predict_log_proba(X_mental_assess)

Implementing Bayes' Theorem

In [29]:
final_probabilities = (y_pred_mental_assess+
                       y_pred_physical_assess+
                       y_pred_superior+
                       y_pred_comrade+
                       y_pred_mental_job+
                       y_pred_physical_job+
                       y_pred_leaves+
                       y_pred_help+
                       y_pred_knowledge+
                       y_pred_mi+
                       y_pred_sr+
                       y_pred_fmh+
                       y_pred_gender+
                       y_pred_age+
                       y_pred_msg)

The maximum arguement here refers to the class chosen, after fitting our Gaussian model.

In [32]:
final_probabilities = np.argmax(final_probabilities, axis=1)
final_probabilities

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)