To accurately predict the pre-existence of mental health disorders, we performed this data analysis.

In [1]:
#Importing the necessary libraries here
import os
import pandas as pd
import numpy as np
from pprint import pprint

%matplotlib inline

In [2]:
#Loading the 'Mother-Jones-Mass-Shootings-Database-1982-2019-xlsx.csv' dataset into a dataframe
DATAFRAME = pd.read_csv('Mother-Jones-Mass-Shootings-Database-1982-2019-xlsx.csv' , encoding = "utf8")

In [3]:
#Exploring the columns
DATAFRAME.columns

Index(['case', 'location', 'date', 'summary', 'fatalities', 'injured',
       'total_victims', 'location2', 'age_of_shooter',
       'prior_signs_mental_health_issues', 'mental_health_details',
       'weapons_obtained_legally', 'where_obtained', 'weapon_type',
       'weapon_details', 'race', 'gender', 'sources', 'mental_health_sources',
       'sources_additional_age', 'latitude', 'longitude', 'type', 'year'],
      dtype='object')

In [4]:
#Dropping the columns that are not useful to train our machine learning models
DATAFRAME.drop(['location', 'date', 'summary', 'location2', 'age_of_shooter', 
                'mental_health_details','weapons_obtained_legally','where_obtained',
                'weapon_type','weapon_details', 'race', 'sources', 'mental_health_sources',
                'sources_additional_age', 'latitude', 'longitude' ], axis = 1, inplace = True)

DATAFRAME

Unnamed: 0,case,fatalities,injured,total_victims,prior_signs_mental_health_issues,gender,type,year
0,Molson Coors shooting,5,0,5,0,M,Mass,2020
1,Jersey City kosher market shooting,4,3,7,0,0,Spree,2019
2,Pensacola Naval base shooting,3,8,11,0,M,Mass,2019
3,Odessa-Midland shooting spree,7,25,32,Yes,M,Spree,2019
4,Dayton entertainment district shooting,9,27,36,0,M,Mass,2019
...,...,...,...,...,...,...,...,...
113,Shopping centers spree killings,6,14,20,Yes,Male,Spree,1987
114,United States Postal Service shooting,15,6,21,Unclear,Male,Mass,1986
115,San Ysidro McDonald's massacre,22,19,41,Yes,Male,Mass,1984
116,Dallas nightclub shooting,6,1,7,Yes,Male,Mass,1984


In [5]:
# Importing the LabelEncoder
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
#Creating labelEncoder
labelEncoder = preprocessing.LabelEncoder();
print (labelEncoder.fit_transform(DATAFRAME['type']))
    

[0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0
 0 0 1 0 0 0 0]


In [6]:
#Label Encoding 'gender' column
print(labelEncoder.fit_transform(DATAFRAME['gender']))

[3 0 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 5 3 3 4 4 4 3 4 3 3 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4]


In [7]:
#Label Encoding 'prior_signs_mental_health_issues' column
print(labelEncoder.fit_transform(DATAFRAME['prior_signs_mental_health_issues']))

[0 0 0 6 0 0 2 2 6 0 6 0 6 0 0 0 6 0 0 6 6 6 2 2 6 3 3 2 6 3 3 6 3 6 6 6 3
 3 3 3 3 3 3 3 3 3 6 3 6 3 5 6 3 6 1 1 6 6 6 6 6 6 6 6 6 6 1 6 3 6 6 1 6 1
 6 3 6 3 1 1 6 6 6 6 6 1 6 6 6 6 6 6 6 1 6 1 1 6 1 6 3 6 1 1 6 1 6 3 1 1 6
 6 6 6 4 6 6 6]


In [8]:
#Label Encoding 'total_victims' column
print(labelEncoder.fit_transform(DATAFRAME['total_victims']))

[ 2  4  8 25 28 33 12 13  8  1  2  0 26 14  3  2  2  4 20  5  0 26  2 12
 32  0  3 37  2  0  2  0  0  8  2  3 13 36 14  5 27  9  0 15  4  7  1  3
 16 12  3 17  4  6  2  4 24  5  7 35  4  7  2  6  9 16  8  2 31 15  8  4
 22  5 10  4 34  7  8  6  5 12  8  9 12  6  4  5  4 12 19 29 24 12  3  4
  4  4  3 23  2 21  9 12  2 11  7  4 31 11 18 27  8 17 18 30  4  8]


In [9]:
#Naming the label encoder
labelEncoder = preprocessing.LabelEncoder()
    
# Encoding the features and the labels
type_encoded = labelEncoder.fit_transform(DATAFRAME['type'])
gender_encoded = labelEncoder.fit_transform(DATAFRAME['gender'])
victims_encoded = labelEncoder.fit_transform(DATAFRAME['total_victims'])
label = labelEncoder.fit_transform(DATAFRAME['prior_signs_mental_health_issues'])

    

In [10]:
# Building the features with victims_encoded, gender_encoded, and type_encoded columns
features = []
for i in range(len(type_encoded)):
    features.append([victims_encoded[i], gender_encoded[i], type_encoded[i]])
print(features)

[[2, 3, 0], [4, 0, 1], [8, 3, 0], [25, 3, 1], [28, 3, 0], [33, 3, 0], [12, 3, 0], [13, 3, 0], [8, 3, 0], [1, 3, 1], [2, 3, 0], [0, 3, 0], [26, 3, 0], [14, 3, 0], [3, 1, 0], [2, 3, 1], [2, 3, 0], [4, 3, 0], [20, 3, 0], [5, 3, 0], [0, 3, 0], [26, 3, 0], [2, 3, 0], [12, 3, 1], [32, 3, 0], [0, 3, 0], [3, 3, 0], [37, 3, 0], [2, 3, 0], [0, 3, 0], [2, 3, 0], [0, 3, 0], [0, 3, 0], [8, 3, 0], [2, 3, 0], [3, 3, 1], [13, 3, 0], [36, 3, 0], [14, 3, 0], [5, 3, 0], [27, 5, 0], [9, 3, 0], [0, 3, 0], [15, 4, 0], [4, 4, 0], [7, 4, 0], [1, 3, 0], [3, 4, 0], [16, 3, 0], [12, 3, 0], [3, 2, 1], [17, 4, 0], [4, 4, 0], [6, 4, 0], [2, 4, 0], [4, 4, 0], [24, 4, 0], [5, 4, 0], [7, 4, 0], [35, 4, 1], [4, 4, 0], [7, 4, 0], [2, 4, 0], [6, 4, 0], [9, 4, 0], [16, 4, 0], [8, 4, 0], [2, 4, 0], [31, 4, 0], [15, 4, 0], [8, 4, 0], [4, 4, 1], [22, 4, 0], [5, 4, 0], [10, 4, 1], [4, 4, 0], [34, 4, 0], [7, 4, 0], [8, 4, 0], [6, 4, 0], [5, 2, 0], [12, 4, 0], [8, 4, 1], [9, 4, 0], [12, 4, 0], [6, 4, 0], [4, 4, 0], [5, 4, 1], [

In [11]:
model = GaussianNB()

In [12]:
# Training the GaussianNB model
model.fit(features, label)

GaussianNB()

In [13]:
#Testing for predictions
print(model.predict([[4, 0, 1]])) #victims #gender #type

[0]


In [14]:
print(model.predict([[28, 3, 0]]))

[2]


In [15]:
print(model.predict([[35, 4, 1]]))

[6]


In [16]:
print(model.predict([[2, 3, 1]]))

[2]


In [17]:
#Because we wanted to test the performance of MultinomialNB on this dataset, 
#We load the dataset "Mother-Jones-Mass-Shootings-Database-1982-2019-xlsx.csv" here and convert it into a dataframe. 
df = pd.read_csv('Mother-Jones-Mass-Shootings-Database-1982-2019-xlsx.csv' , encoding = "utf8")


# Importing the label encoder 
from sklearn import preprocessing 

# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 

# Encode labels in column. 
df['type_encoded']= label_encoder.fit_transform(df['type']) 
df['gender_encoded']= label_encoder.fit_transform(df['gender']) 
df['victims_encoded'] = label_encoder.fit_transform(df['total_victims'])
df['label'] = labelEncoder.fit_transform(df['prior_signs_mental_health_issues'])

##Dropping the columns that are not useful to train our machine learning models
df.drop(['location', 'date', 'summary', 'location2', 'age_of_shooter', 
                'mental_health_details','weapons_obtained_legally','where_obtained',
                'weapon_type','weapon_details', 'race', 'sources', 'mental_health_sources',
                'sources_additional_age', 'latitude', 'longitude' ], axis = 1, inplace = True)
df



Unnamed: 0,case,fatalities,injured,total_victims,prior_signs_mental_health_issues,gender,type,year,type_encoded,gender_encoded,victims_encoded,label
0,Molson Coors shooting,5,0,5,0,M,Mass,2020,0,3,2,0
1,Jersey City kosher market shooting,4,3,7,0,0,Spree,2019,1,0,4,0
2,Pensacola Naval base shooting,3,8,11,0,M,Mass,2019,0,3,8,0
3,Odessa-Midland shooting spree,7,25,32,Yes,M,Spree,2019,1,3,25,6
4,Dayton entertainment district shooting,9,27,36,0,M,Mass,2019,0,3,28,0
...,...,...,...,...,...,...,...,...,...,...,...,...
113,Shopping centers spree killings,6,14,20,Yes,Male,Spree,1987,1,4,17,6
114,United States Postal Service shooting,15,6,21,Unclear,Male,Mass,1986,0,4,18,4
115,San Ysidro McDonald's massacre,22,19,41,Yes,Male,Mass,1984,0,4,30,6
116,Dallas nightclub shooting,6,1,7,Yes,Male,Mass,1984,0,4,4,6


In [18]:
df.head()

Unnamed: 0,case,fatalities,injured,total_victims,prior_signs_mental_health_issues,gender,type,year,type_encoded,gender_encoded,victims_encoded,label
0,Molson Coors shooting,5,0,5,0,M,Mass,2020,0,3,2,0
1,Jersey City kosher market shooting,4,3,7,0,0,Spree,2019,1,0,4,0
2,Pensacola Naval base shooting,3,8,11,0,M,Mass,2019,0,3,8,0
3,Odessa-Midland shooting spree,7,25,32,Yes,M,Spree,2019,1,3,25,6
4,Dayton entertainment district shooting,9,27,36,0,M,Mass,2019,0,3,28,0


In [19]:
#Checking for nulll values
df.isnull().sum()

case                                0
fatalities                          0
injured                             0
total_victims                       0
prior_signs_mental_health_issues    0
gender                              0
type                                0
year                                0
type_encoded                        0
gender_encoded                      0
victims_encoded                     0
label                               0
dtype: int64

In [20]:
#Before we instantiate our ML model, we divide the dataset into two smaller training and testing datasets.
#We also set up our Features (X) and Labels (y) where X is 'total_victims', 'type_encoded', and 'gender_encoded'.
#y is 'prior_signs_mental_health_issues'
from sklearn.model_selection import train_test_split

# test size represents the proportion of training and testing data split. 
X = df[['total_victims', 'type_encoded','gender_encoded']]
y = df['prior_signs_mental_health_issues']

#Random_state is set to 1 to define the "randomness" of data randomisation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)

In [21]:
# First import  MultinomialNB from sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


# Create an instance of the model
nb_model = MultinomialNB()

# Fit model to training data
nb_model.fit(X_train, y_train)

MultinomialNB()

In [22]:
# Predict answers to data from the X_text dataset
nb_model_predictions = nb_model.predict(X_test)

# Show results in a confusion matrix
print(metrics.confusion_matrix(y_test,nb_model_predictions))

[[ 0  0  0  2]
 [ 0  0  0  4]
 [ 0  0  0  0]
 [ 0  0  1 17]]


In [23]:
print(metrics.classification_report(y_test,nb_model_predictions))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
          No       0.00      0.00      0.00         4
         TBD       0.00      0.00      0.00         0
         Yes       0.74      0.94      0.83        18

    accuracy                           0.71        24
   macro avg       0.18      0.24      0.21        24
weighted avg       0.55      0.71      0.62        24



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
print(metrics.accuracy_score(y_test,nb_model_predictions))

0.7083333333333334


In [25]:
#loading the Shooting dataset in a dataframe
A = pd.read_csv('Shooting-csv.csv' , encoding = "utf8")

In [26]:
#Exploring the columns
A.columns

Index(['ï»¿Title', 'Location', 'Date', 'Incident Area', 'Open/Close Location',
       'Target', 'Cause', 'Summary', 'Fatalities', 'Injured', 'Total victims',
       'Policeman Killed', 'Age', 'Employeed (Y/N)', 'Employed at',
       'Mental Health Issues', 'Race', 'Gender', 'Latitude', 'Longitude'],
      dtype='object')

In [27]:
#Dropping the columns that are not useful to train our machine learning models
A.drop([ 'ï»¿Title', 'Location', 'Date', 'Incident Area',
       'Open/Close Location','Summary','Policeman Killed', 'Age',
       'Employeed (Y/N)', 'Employed at', 'Race','Latitude', 'Longitude'], axis = 1, inplace = True)

A

Unnamed: 0,Target,Cause,Fatalities,Injured,Total victims,Mental Health Issues,Gender
0,0,0,58,527,585,Unclear,M
1,coworkers,0,3,2,5,Yes,M
2,coworkers,terrorism,3,0,3,Unclear,M
3,coworkers,unemployement,5,0,5,Unclear,M
4,coworkers,0,3,0,3,Yes,M
...,...,...,...,...,...,...,...
318,random,terrorism,5,1,6,Yes,Male
319,random,terrorism,17,32,48,Yes,Male
320,random,unknown,26,20,46,No,M
321,random,unknown,3,0,3,No,M


In [28]:
# Importing label encoder 
from sklearn import preprocessing 

# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 

# Encode labels in column. 
A['MentalHealth_encoded']= label_encoder.fit_transform(A['Mental Health Issues']) 
A['Gen_encoded']= label_encoder.fit_transform(A['Gender']) 
A['target_encoded'] = labelEncoder.fit_transform(A['Target'])
A['label_causes'] = labelEncoder.fit_transform(A['Cause'])

A

Unnamed: 0,Target,Cause,Fatalities,Injured,Total victims,Mental Health Issues,Gender,MentalHealth_encoded,Gen_encoded,target_encoded,label_causes
0,0,0,58,527,585,Unclear,M,1,1,0,0
1,coworkers,0,3,2,5,Yes,M,3,1,36,0
2,coworkers,terrorism,3,0,3,Unclear,M,1,1,36,22
3,coworkers,unemployement,5,0,5,Unclear,M,1,1,36,23
4,coworkers,0,3,0,3,Yes,M,3,1,36,0
...,...,...,...,...,...,...,...,...,...,...,...
318,random,terrorism,5,1,6,Yes,Male,3,3,49,22
319,random,terrorism,17,32,48,Yes,Male,3,3,49,22
320,random,unknown,26,20,46,No,M,0,1,49,24
321,random,unknown,3,0,3,No,M,0,1,49,24


In [29]:
A.head()

Unnamed: 0,Target,Cause,Fatalities,Injured,Total victims,Mental Health Issues,Gender,MentalHealth_encoded,Gen_encoded,target_encoded,label_causes
0,0,0,58,527,585,Unclear,M,1,1,0,0
1,coworkers,0,3,2,5,Yes,M,3,1,36,0
2,coworkers,terrorism,3,0,3,Unclear,M,1,1,36,22
3,coworkers,unemployement,5,0,5,Unclear,M,1,1,36,23
4,coworkers,0,3,0,3,Yes,M,3,1,36,0


In [30]:
#Checking for null values
A.isnull().sum()

Target                  0
Cause                   0
Fatalities              0
Injured                 0
Total victims           0
Mental Health Issues    0
Gender                  0
MentalHealth_encoded    0
Gen_encoded             0
target_encoded          0
label_causes            0
dtype: int64

In [31]:
#Before we instantiate our ML model, we divide the dataset into two smaller training and testing datasets.
#We also set up our Features (X) and Labels (y) where X is 'MentalHealth_encoded','Gen_encoded', and 'target_encoded'.
#y is 'Cause'

from sklearn.model_selection import train_test_split

# test size represents the proportion of training and testing data split. 
X = A[['MentalHealth_encoded','Gen_encoded', 'target_encoded']]
y = A['Cause']

# Random_state sets "randomness" of data randomisation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [32]:
# Import  MultinomialNB from sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


# Creating an instance of the model
nb_model = MultinomialNB()

# Fitting model to training data
nb_model.fit(X_train, y_train)

MultinomialNB()

In [33]:
# Predict answers to data from the X_text dataset
nb_model_predictions = nb_model.predict(X_test)

# Show results in a confusion matrix
print(metrics.confusion_matrix(y_test,nb_model_predictions))

[[18  0  0  1  0  0  0  0  1  0  0  0  0  0]
 [ 5  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 4  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 5  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 9  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 9  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 3  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [34]:
#Printing the classification report and the accuracy of the model
print(metrics.classification_report(y_test,nb_model_predictions))


                       precision    recall  f1-score   support

                    0       0.30      0.90      0.44        20
                anger       0.00      0.00      0.00         6
                argue       0.00      0.00      0.00         2
     domestic dispute       0.00      0.00      0.00         4
        failing exams       0.00      0.00      0.00         1
                fight       0.00      0.00      0.00         1
          frustration       0.00      0.00      0.00         6
                  fun       0.00      0.00      0.00         1
post-traumatic stress       0.00      0.00      0.00         0
               psycho       0.00      0.00      0.00         9
              revenge       0.00      0.00      0.00         1
            terrorism       0.00      0.00      0.00         9
        unemployement       0.00      0.00      0.00         2
              unknown       0.00      0.00      0.00         3

             accuracy                           0.28 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
print(metrics.accuracy_score(y_test,nb_model_predictions))

0.27692307692307694


In [36]:
#Import RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Create an instance of the model
RF_model = RandomForestClassifier()

#Fit model to training data
RF_model.fit(X_train, y_train)

RandomForestClassifier()

In [37]:
# Predict answers to data from the X_text dataset
RF_model_predictions = RF_model.predict(X_test)

# Show results in a confusion matrix
print(metrics.confusion_matrix(y_test,RF_model_predictions))

[[10  2  0  3  0  0  1  0  1  1  0  2  0  0]
 [ 4  1  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  3  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  3  1  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 4  0  0  0  0  0  0  0  0  0  1  4  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 3  1  0  0  0  0  0  0  0  1  0  4  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  0  1  0]]


In [38]:
#the classification report and the accuracy is printed
print(metrics.classification_report(y_test,RF_model_predictions))


                    precision    recall  f1-score   support

                 0       0.33      0.50      0.40        20
             anger       0.17      0.17      0.17         6
             argue       0.00      0.00      0.00         2
  domestic dispute       0.50      0.75      0.60         4
     failing exams       0.00      0.00      0.00         1
             fight       0.00      0.00      0.00         1
       frustration       0.00      0.00      0.00         6
               fun       0.00      0.00      0.00         1
neighbors conflict       0.00      0.00      0.00         0
            psycho       0.00      0.00      0.00         9
           revenge       0.50      1.00      0.67         1
         terrorism       0.31      0.44      0.36         9
     unemployement       0.33      0.50      0.40         2
           unknown       0.00      0.00      0.00         3

          accuracy                           0.31        65
         macro avg       0.15      0.2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
print(metrics.accuracy_score(y_test,RF_model_predictions))

0.3076923076923077


In [40]:
# Importing a Support Vector Classification model (SVC)
from sklearn.svm import SVC

# The gamma value is set to "auto" to avoid returning error. We fit the SVC model on our data. The confusion matrix is printed.
SVC_model = SVC(gamma="auto")
SVC_model.fit(X_train, y_train)

SVC_model_predictions = SVC_model.predict(X_test)

print(metrics.confusion_matrix(y_test, SVC_model_predictions))

[[11  2  0  3  0  0  0  0  1  2  0  1  0  0]
 [ 5  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  3  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 3  0  0  0  0  0  0  0  0  1  0  1  1  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 5  0  0  0  0  0  0  0  0  0  0  4  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 6  0  0  0  0  0  0  0  0  1  0  2  0  0]
 [ 0  2  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  1  0  0]]


In [41]:
#Printing the classification report
print(metrics.classification_report(y_test,SVC_model_predictions))

                    precision    recall  f1-score   support

                 0       0.28      0.55      0.37        20
             anger       0.20      0.17      0.18         6
             argue       0.00      0.00      0.00         2
  domestic dispute       0.50      0.75      0.60         4
     failing exams       0.00      0.00      0.00         1
             fight       0.00      0.00      0.00         1
       frustration       0.00      0.00      0.00         6
               fun       0.00      0.00      0.00         1
neighbors conflict       0.00      0.00      0.00         0
            psycho       0.00      0.00      0.00         9
           revenge       0.00      0.00      0.00         1
         terrorism       0.22      0.22      0.22         9
     unemployement       0.00      0.00      0.00         2
           unknown       0.00      0.00      0.00         3

          accuracy                           0.26        65
         macro avg       0.09      0.1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
#Printing the accuracy
print(metrics.accuracy_score(y_test,SVC_model_predictions))

0.26153846153846155


In [43]:
#We import LogisticRegression from sklearn
from sklearn.linear_model import LogisticRegression

#We build the model with the L-BFGS option.
#Once the model is built, the training data is then provided to the model.

log_reg_model = LogisticRegression(solver = "lbfgs")

log_reg_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [44]:
#We test the accuracy of the model using the test data.
#We import the sklearn.metrics module which includes score functions, performance metrics and pairwise metrics and distance computations. 
#We create a predictions set with some test data. This is unseen data.

from  sklearn import metrics

# Creating a prediction set here
# Unseen contents of X_test
# The answers in y_test are expected which is a list of expected topic descriptions
log_reg_model_predictions = log_reg_model.predict(X_test)

In [45]:
#Predicted output from the model
log_reg_model_predictions

array(['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', 'terrorism', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', 'terrorism', '0',
       '0', '0', '0', '0', '0', '0', '0', 'terrorism', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', 'terrorism', '0', '0', '0', '0', '0',
       'terrorism', '0', '0', '0', '0'], dtype=object)

In [46]:
# Now we compare what the model predicted 
# with what is expected as output
# Printing a confusion matrix
print(metrics.confusion_matrix(y_test,log_reg_model_predictions))

[[18  0  0  0  0  0  0  0  0  0  2  0  0]
 [ 6  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 4  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 6  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 9  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 8  0  0  0  0  0  0  0  0  0  1  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  2  0  0]]


In [47]:
# Printing a classification report
print(metrics.classification_report(y_test,log_reg_model_predictions))

                  precision    recall  f1-score   support

               0       0.30      0.90      0.45        20
           anger       0.00      0.00      0.00         6
           argue       0.00      0.00      0.00         2
domestic dispute       0.00      0.00      0.00         4
   failing exams       0.00      0.00      0.00         1
           fight       0.00      0.00      0.00         1
     frustration       0.00      0.00      0.00         6
             fun       0.00      0.00      0.00         1
          psycho       0.00      0.00      0.00         9
         revenge       0.00      0.00      0.00         1
       terrorism       0.20      0.11      0.14         9
   unemployement       0.00      0.00      0.00         2
         unknown       0.00      0.00      0.00         3

        accuracy                           0.29        65
       macro avg       0.04      0.08      0.05        65
    weighted avg       0.12      0.29      0.16        65



  _warn_prf(average, modifier, msg_start, len(result))


In [48]:
# Printing the overall accuracy of the model
print(metrics.accuracy_score(y_test,log_reg_model_predictions))

0.2923076923076923


Reference:
    Dr. James Connolly's Supervised learning practical notes from AI2.