In [1]:
#importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets


crime_data = pd.read_excel("final_crime_data_dataset.xlsx")

In [14]:
crime_data


Unnamed: 0,STATE,DISTRICT,YEAR,MURDER,ATTEMPT TO MURDER,CULPABLE HOMICIDE NOT AMOUNTING TO MURDER,CUSTODIAL RAPE,OTHER RAPE,KIDNAPPING AND ABDUCTION OF WOMEN AND GIRLS,KIDNAPPING AND ABDUCTION OF OTHERS,...,INSULT TO MODESTY OF WOMEN,CRUELTY BY HUSBAND OR HIS RELATIVES,IMPORTATION OF GIRLS FROM FOREIGN COUNTRIES,CAUSING DEATH BY NEGLIGENCE,OTHER IPC CRIMES,TOTAL IPC CRIMES,POPULATION,CRIME RATE,STATE DIST NORM,VIOLENCE RATING
663,0,21,2001,13,0,0,0,3,2,0,...,1,9,0,0,310,637,314084,202.811987,0,0
1349,0,21,2002,14,3,0,0,1,1,1,...,3,4,0,0,316,580,293235,197.793579,0,0
2046,0,21,2003,16,4,1,0,2,2,0,...,2,7,0,0,299,611,273771,223.179226,0,0
2743,0,21,2004,13,1,2,0,10,3,0,...,3,5,0,6,373,721,255598,282.083584,0,0
3444,0,21,2005,14,3,3,0,4,1,1,...,1,5,0,3,354,666,238631,279.091987,0,0
4151,0,21,2006,3,4,1,0,6,5,0,...,4,7,0,2,311,655,222791,293.997513,0,0
5586,0,21,2008,8,10,2,0,12,13,3,...,3,25,0,4,374,859,194195,442.338886,1,1
6321,0,21,2009,15,4,3,0,18,14,2,...,7,21,0,2,440,921,181305,507.983784,1,1
7067,0,21,2010,8,7,4,0,23,8,2,...,10,9,0,9,479,956,169270,564.778165,1,1
7825,0,21,2011,14,5,2,0,13,12,3,...,3,5,0,6,382,771,105597,730.134379,1,1


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve


def generate_train_test_data(x_dataset,y_data):
    tempBikeDs = np.copy(x_dataset)    
    X_train,X_test,Y_train,Y_test = train_test_split(tempBikeDs,y_data,test_size=0.2,random_state = 40)
    return X_train,X_test,Y_train,Y_test

def confusion_matrix(Y, T,title=None):
    if len(Y) != len(T):
        raise ValueError("Length mismatch for predicted and test data")
    
    classes = np.unique(T)
#     print(classes)
    classes_len = len(classes)
    
    cfm = pd.DataFrame(np.zeros((classes_len, classes_len)), index=classes, columns=classes, dtype=int)
    
    Tidx = [T == c for c in classes]
    for c in classes:
        pred_idx = Y == c
        cfm.loc[c, :] = [np.sum(np.logical_and(pred_idx, tidx)) for tidx in Tidx]
    index = ["-","+"]
    columns = ["-","+"]
    print(f"Confusion Matrix {'' if title == None else title}")
    display(pd.DataFrame([['TN','FN'],['FP','TP']],index=index,columns=columns))
    display(cfm)
    return cfm


#get accuracy based on predicted Y and test T data
def get_accuracy(Y,T):
    return np.sum(Y == T)/len(Y)

#get precision from confusion matrix
def get_precision(confusion_mat):
    confusion_mat = confusion_mat.values
    confusion_mat = np.float64(confusion_mat)
    return confusion_mat[1, 1] / (confusion_mat[1, 1] + confusion_mat[1, 0])

#get precision from confusion matrix
def get_recall(confusion_mat):
    confusion_mat = confusion_mat.values
    confusion_mat = np.float64(confusion_mat)
    return confusion_mat[1,1] / confusion_mat.sum(axis=0)[1]

#get specificity from confusion matrix
def get_specificity(confusion_mat):
    confusion_mat = confusion_mat.values
    confusion_mat = np.float64(confusion_mat)
    return confusion_mat[0,0]/ confusion_mat.sum(axis=0)[-1]

#get f1 score from confusion matrix
def get_f1_score(confusion_mat):
    confusion_mat = confusion_mat.values
    confusion_mat = np.float64(confusion_mat)
    return confusion_mat[1,1] / confusion_mat[1,1] + ((confusion_mat[0,1]+confusion_mat[1,0])/2)

#get roc from confusion matrix
def get_roc(Y,T):
    return roc_auc_score(Y,T)

#get overall statistical result
def get_result(Y,T):
    conf_mat = confusion_matrix(Y,T)
    result_set  = [get_accuracy(Y,T),get_precision(conf_mat),get_recall(conf_mat),get_specificity(conf_mat),get_f1_score(conf_mat),get_roc(Y,T)]
    indexes = ["Accuracy","Precision","Recall","Specificity","F1 Score","ROC"]
#     print(f"{get_accuracy(Y,T)}, {get_precision(conf_mat)}, {get_recall(conf_mat)}, {get_specificity(conf_mat)}  ,{get_f1_score(conf_mat)}, {get_roc(Y,T)}")
    display(pd.DataFrame(result_set,index = indexes,columns=["Statistics"])) 
    plot_roc(Y,T)
    
def plot_predicted_test_data(Y,T):
    plt.figure(figsize=(10,10))
    plt.plot(np.sort(Y,0),label="Predicted Result")
    plt.plot(np.sort(T,0),label="Test Data")
    plt.title("Plot of Predivcted Data and Test Data")
    plt.legend()
    plt.show()

def plot_roc(Y,T):
    plt.figure(figsize=(10,10))
    fpr,tpr,threshold = roc_curve(Y,T)
    plt.plot(fpr, tpr, linewidth=2, label=None)
    plt.plot([0, 1], [0, 1], 'k--') 
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.show()
    
    
def compare_evaluation_result(Y,T,title):
    cfm = confusion_matrix(Y,T,title)
    stats = [get_accuracy(Y,T),get_precision(cfm),get_recall(cfm),get_specificity(cfm),get_f1_score(cfm),get_roc(Y,T)]
    return stats

In [10]:
from sklearn.model_selection import train_test_split

def generate_train_test_data(x_dataset,y_data):
    tempBikeDs = np.copy(x_dataset)    
    X_train,X_test,Y_train,Y_test = train_test_split(tempBikeDs,y_data,test_size=0.2,random_state = 40)
    return X_train,X_test,Y_train,Y_test


X = crime_data.iloc[:,:-2]
Y = crime_data.iloc[:,-1]
X_train,X_test,Y_train,Y_test = generate_train_test_data(X,Y)

# len(X_train) 5706

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

clf=RandomForestClassifier(n_estimators=100)
clf.fit(X_train,Y_train)
y_pred=clf.predict(X_test)

In [12]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))

Accuracy: 0.8079887876664331


In [13]:
confusion_matrix(y_pred,Y_test)

Confusion Matrix 


Unnamed: 0,-,+
-,TN,FN
+,FP,TP


Unnamed: 0,0,1
0,558,142
1,132,595


Unnamed: 0,0,1
0,558,142
1,132,595
