### Importing

In [6]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initial Preparation

In [7]:
data_frame = pd.read_csv('shooting.csv',encoding = 'unicode_escape')
data_frame.info(verbose=True, null_counts=True)
data_frame1 = data_frame.copy()
data_frame1.drop(labels=["summary","sources","mental_health_sources","sources_additional_age","latitude","longitude"], axis='columns', inplace=True)
data_frame2 = data_frame1.copy()
data_frame1.dropna(axis='index', how='any', inplace=True)
data_frame2.head()
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data_frame2, test_size=0.30, random_state=666)
print(len(train_set), len(test_set))
working_set = train_set.copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 24 columns):
case                                110 non-null object
location                            110 non-null object
date                                110 non-null object
summary                             110 non-null object
fatalities                          110 non-null int64
injured                             110 non-null int64
total_victims                       110 non-null int64
location.1                          110 non-null object
age_of_shooter                      110 non-null int64
prior_signs_mental_health_issues    110 non-null int64
mental_health_details               109 non-null object
weapons_obtained_legally            110 non-null int64
where_obtained                      109 non-null object
weapon_type                         110 non-null object
weapon_details                      110 non-null object
race                                109 non-null object
gende

I picked total_victims as my X and prior_signs_mental_health_issues as y. I picked these two beacause I wanted to see if these two attributes are related.

# Using Decision Tree

In [8]:
X_working_classifier = working_set[["total_victims"]]
y_working_classifier = working_set["prior_signs_mental_health_issues"]

tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X_working_classifier,y_working_classifier)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

### Training sets

In [9]:
y_predicted_working_classifier = tree_classifier.predict(X_working_classifier)
classifier_working_matrix = confusion_matrix(y_working_classifier,y_predicted_working_classifier)
print("The confusing matrix of training class is: ")
print(classifier_working_matrix)
print ("Accuracy of training class using classifier is ", accuracy_score(y_working_classifier, y_predicted_working_classifier ))

print ("Precision of training class using classifier is ", precision_score(y_working_classifier, y_predicted_working_classifier , average="weighted"))
print ("Sensitivity of training class using classifier is ", recall_score(y_working_classifier,y_predicted_working_classifier , average="weighted"))
print ("F1 of training class using classifier is ", f1_score(y_working_classifier, y_predicted_working_classifier , average="weighted"))

The confusing matrix of training class is: 
[[16  2  3]
 [ 7 31  3]
 [ 3  6  6]]
Accuracy of training class using classifier is  0.6883116883116883
Precision of training class using classifier is  0.6884781884781884
Sensitivity of training class using classifier is  0.6883116883116883
F1 of training class using classifier is  0.6849290780141843


### Test set

In [10]:
X_test_classifier = test_set[["total_victims"]]
y_test_classifier = test_set["prior_signs_mental_health_issues"]

y_predicted_test_classifier = tree_classifier.predict(X_test_classifier)
classifier_test_matrix = confusion_matrix(y_test_classifier,y_predicted_test_classifier)
print("The confusing matrix of testclass is: ")
print(classifier_test_matrix)
print ("Accuracy of test class using classifier is ", accuracy_score(y_test_classifier, y_predicted_test_classifier ))

print ("Precision of test class using classifier is ", precision_score(y_test_classifier, y_predicted_test_classifier , average="weighted"))
print ("Sensitivity of testclass using classifier is ", recall_score(y_test_classifier,y_predicted_test_classifier , average="weighted"))
print ("F1 of test class using classifier is ", f1_score(y_test_classifier, y_predicted_test_classifier , average="weighted"))

The confusing matrix of testclass is: 
[[5 7 2]
 [7 8 2]
 [0 1 1]]
Accuracy of test class using classifier is  0.42424242424242425
Precision of test class using classifier is  0.4464646464646465
Sensitivity of testclass using classifier is  0.42424242424242425
F1 of test class using classifier is  0.4302566120747938


# Using SVM 

### Training set

In [11]:
X = working_set[["total_victims"]]
y = working_set["prior_signs_mental_health_issues"]
svm_classifier = SVC(kernel="rbf")
svm_classifier.fit(X,y)

y_predicted = svm_classifier.predict(X)
matrix = confusion_matrix(y, y_predicted)
print("The confusion matrix for training set is: ")
print(matrix)
print ("Accuracy of training set is ", accuracy_score(y, y_predicted))
print ("Precision of training set is ", precision_score(y, y_predicted, average="weighted"))
print ("Sensitivity of training set is ", recall_score(y, y_predicted, average="weighted"))
print ("F1 of training set is ", f1_score(y, y_predicted, average="weighted"))


The confusion matrix for training set is: 
[[ 9 12  0]
 [ 1 40  0]
 [ 0 14  1]]
Accuracy of training set is  0.6493506493506493
Precision of training set is  0.7629673356946084
Sensitivity of training set is  0.6493506493506493
F1 of training set is  0.5808149869425118




### Test Set

In [12]:
X_test = test_set[["total_victims"]]
y_test = test_set["prior_signs_mental_health_issues"]
svm_classifier = SVC(kernel="rbf")
svm_classifier.fit(X_test,y_test)

y_predicted_test= svm_classifier.predict(X_test)
matrix_test = confusion_matrix(y_test, y_predicted_test)
print("The confusion matrix is for test set : ")
print(matrix_test)
print ("Accuracy for test set is ", accuracy_score(y_test, y_predicted_test))
print ("Precision for test set is ", precision_score(y_test, y_predicted_test, average="weighted"))
print ("Sensitivity for test set is ", recall_score(y_test, y_predicted_test, average="weighted"))
print ("F1 for test set is ", f1_score(y_test, y_predicted_test, average="weighted"))

The confusion matrix is for test set : 
[[12  2  0]
 [ 3 14  0]
 [ 1  1  0]]
Accuracy for test set is  0.7878787878787878
Precision for test set is  0.7424242424242424
Sensitivity for test set is  0.7878787878787878
F1 for test set is  0.7636363636363637


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Analysis

The accuracy and F1 score using SVM is better than using tree classifier. The accuracy of the test set is 42% using decision tree whereas the accuracy of test set using SVM is 78%. The same way, f1 score using decision tree for the test set is 43% where as f1 score using 76%. 

I picked total_victims and prior_signs_mental_health_issues to see various metrics of classification. SVM worked better in classification. 78% of the data was accuratly clasified with 74% precision. 

# K-Means Clustering

In [32]:
from sklearn.cluster import KMeans
X = working_set[["weapons_obtained_legally", 'prior_signs_mental_health_issues']]
kmeans_classifier = KMeans(n_clusters=3)
kmeans_classifier.fit(X)
cluster_x = [x for [x,y] in kmeans_classifier.cluster_centers_]
cluster_y = [y for [x,y] in kmeans_classifier.cluster_centers_]
import matplotlib.pyplot as plt
plt.scatter(working_set['weapons_obtained_legally'], working_set['prior_signs_mental_health_issues'], c=kmeans_classifier.labels_)
plt.scatter(cluster_x, cluster_y, marker="+", color="red")
plt.xlabel("X")
plt.ylabel("Y")
plt.show()

plt.scatter(working_set['weapons_obtained_legally'], working_set['prior_signs_mental_health_issues'], c=kmeans_classifier.labels_)
plt.scatter(cluster_x, cluster_y, marker="+", color="red")
plt.xlabel("X")
plt.ylabel("Y")
plt.axis("scaled")
plt.show()

In [33]:
from sklearn.cluster import KMeans
X = test_set[["weapons_obtained_legally", 'prior_signs_mental_health_issues']]
kmeans_classifier = KMeans(n_clusters=3)
kmeans_classifier.fit(X)
cluster_x = [x for [x,y] in kmeans_classifier.cluster_centers_]
cluster_y = [y for [x,y] in kmeans_classifier.cluster_centers_]
import matplotlib.pyplot as plt
plt.scatter(test_set['weapons_obtained_legally'], test_set['prior_signs_mental_health_issues'], c=kmeans_classifier.labels_)
plt.scatter(cluster_x, cluster_y, marker="+", color="red")
plt.xlabel("X")
plt.ylabel("Y")
plt.show()

plt.scatter(test_set['weapons_obtained_legally'], test_set['prior_signs_mental_health_issues'], c=kmeans_classifier.labels_)
plt.scatter(cluster_x, cluster_y, marker="+", color="red")
plt.xlabel("Weapons_obtained_legally")
plt.ylabel("prior_signs_mental_health_issues")
plt.title("Test set")
plt.axis("scaled")
plt.show()