In [1]:
#importing packages

import numpy as np
import timeit
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
# reading dataset with pandas
df=pd.read_csv("./dataset1.csv")
df.head()

Unnamed: 0,Report Number,Date Reported,Date of Occurrence,Time of Occurrence,City,Crime Code,Crime Description,Victim Age,Victim Gender,Weapon Used,Crime Domain,Police Deployed,Case Closed,Date Case Closed
0,1,02-01-2020 00:00,01-01-2020 00:00,01-01-2020 01:11,Delhi,576,IDENTITY THEFT,16,M,Blunt Object,Violent Crime,13,No,
1,2,01-01-2020 19:00,01-01-2020 01:00,01-01-2020 06:26,Kolkata,128,CYBERCRIME,37,M,Poison,Other Crime,9,No,
2,3,02-01-2020 05:00,01-01-2020 02:00,01-01-2020 14:30,Bangalore,271,KIDNAPPING,48,F,Blunt Object,Other Crime,15,No,
3,4,01-01-2020 05:00,01-01-2020 03:00,01-01-2020 14:46,Pune,170,KIDNAPPING,49,F,Firearm,Other Crime,1,Yes,29-04-2020 05:00
4,5,01-01-2020 21:00,01-01-2020 04:00,01-01-2020 16:51,Pune,421,IDENTITY THEFT,30,F,Other,Other Crime,18,Yes,08-01-2020 21:00


In [3]:
# extracting date, month, time
df['Date of Occurrence'] = pd.to_datetime(df['Date of Occurrence'], format='mixed', errors='coerce')
df['date_of_occ'] = df['Date of Occurrence'].dt.day
df['month_of_occ'] = df['Date of Occurrence'].dt.month
df['year_of_occ'] = df['Date of Occurrence'].dt.year
df['hour_of_occ'] = df['Date of Occurrence'].dt.hour
df['mins_of_occ'] = df['Date of Occurrence'].dt.minute


In [4]:
# columns used to train model
col_list = ['date_of_occ','month_of_occ','year_of_occ','hour_of_occ','mins_of_occ','City','Crime Description','Police Deployed']

In [5]:
# making dataframe with columns used to train model
df2 = df[col_list]
df2.head()


Unnamed: 0,date_of_occ,month_of_occ,year_of_occ,hour_of_occ,mins_of_occ,City,Crime Description,Police Deployed
0,1,1,2020,0,0,Delhi,IDENTITY THEFT,13
1,1,1,2020,1,0,Kolkata,CYBERCRIME,9
2,1,1,2020,2,0,Bangalore,KIDNAPPING,15
3,1,1,2020,3,0,Pune,KIDNAPPING,1
4,1,1,2020,4,0,Pune,IDENTITY THEFT,18


In [6]:
# droping empty rows and checking size of dataset
df2 = df2.dropna()
df2.shape


(40160, 8)

In [7]:
#extracting datatypes of columns
df2.dtypes


date_of_occ           int32
month_of_occ          int32
year_of_occ           int32
hour_of_occ           int32
mins_of_occ           int32
City                 object
Crime Description    object
Police Deployed       int64
dtype: object

In [8]:
# Factorize Crime Description column:
crime_var = pd.factorize(df2['Crime Description'])
df2.loc[:, 'Crime Description'] = crime_var[0].astype(np.int32)
definition_list_Crime_Description = np.array(crime_var[1])
print(crime_var)
 
# Factorize Police Deployed:
police_var = pd.factorize(df2['Police Deployed'])
df2.loc[:, 'Police Deployed'] = police_var[0].astype(np.int32)
definition_list_police = police_var[1]

# Factorize City:
city_var = pd.factorize(df2['City'])
df2.loc[:, 'City'] = city_var[0].astype(np.int32)
definition_list_city = city_var[1]

# Factorize occurrence year:
year_var = pd.factorize(df2['year_of_occ'])
df2.loc[:, 'year_of_occ'] = year_var[0].astype(np.int32)
definition_list_year = year_var[1]


# Factorize occurrence month:
month_var = pd.factorize(df2['month_of_occ'])
df2.loc[:, 'month_of_occ'] = month_var[0].astype(np.int32)
definition_list_month = month_var[1]

# Factorize occurrence day:
day_var = pd.factorize(df2['date_of_occ'])
df2.loc[:, 'date_of_occ'] = day_var[0].astype(np.int32)
definition_list_day = day_var[1]

# Factorize hour of occurrence:
hour_var = pd.factorize(df2['hour_of_occ'])
df2.loc[:, 'hour_of_occ'] = hour_var[0].astype(np.int32)
definition_list_hour = hour_var[1]

# Factorize minutes of occurrence:
mins_var = pd.factorize(df2['mins_of_occ'])
df2.loc[:, 'mins_of_occ'] = mins_var[0].astype(np.int32)
definition_list_mins = mins_var[1]


(array([0, 1, 2, ..., 0, 3, 2], shape=(40160,)), Index(['IDENTITY THEFT', 'CYBERCRIME', 'KIDNAPPING', 'ASSAULT', 'EXTORTION'], dtype='object'))


In [9]:
df2.head()


Unnamed: 0,date_of_occ,month_of_occ,year_of_occ,hour_of_occ,mins_of_occ,City,Crime Description,Police Deployed
0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,1,1,1
2,0,0,0,2,0,2,2,2
3,0,0,0,3,0,3,2,3
4,0,0,0,4,0,3,0,4


In [10]:
# splitting x and y to train model
x = df2.drop(['Crime Description', 'Police Deployed'],axis=1).values
y = df2['Crime Description'].values.astype(int)

In [11]:
# splitting data into test and train data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 21)


In [13]:
#Training model for crime prediction
from sklearn.tree import DecisionTreeClassifier

classifier1 = DecisionTreeClassifier(criterion='entropy', random_state=42)
classifier1.fit(X_train, y_train)


In [14]:
# predicting output for test data and checking model performance
y_pred = classifier1.predict(X_test)

print("Accuracy of Decision Tree : ",accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred)) 
print(classification_report(y_test, y_pred, target_names=[str(label) for label in definition_list_Crime_Description]))

Accuracy of Decision Tree :  0.20647410358565738
[[604 439 446 450 403]
 [446 379 373 372 376]
 [447 382 381 374 365]
 [454 380 370 370 364]
 [425 369 386 346 339]]
                precision    recall  f1-score   support

IDENTITY THEFT       0.25      0.26      0.26      2342
    CYBERCRIME       0.19      0.19      0.19      1946
    KIDNAPPING       0.19      0.20      0.20      1949
       ASSAULT       0.19      0.19      0.19      1938
     EXTORTION       0.18      0.18      0.18      1865

      accuracy                           0.21     10040
     macro avg       0.20      0.20      0.20     10040
  weighted avg       0.21      0.21      0.21     10040



In [15]:
definition_lists = {
    'date_of_occ': definition_list_day,
    'month_of_occ': definition_list_month,
    'year_of_occ': definition_list_year,
    'hour_of_occ': definition_list_day,
    'mins_of_occ': definition_list_day,
    'City': definition_list_city
}

In [22]:
# #Training model for polic deployment
x1 = df2.drop(['Police Deployed'],axis=1).values
y1 = df2['Police Deployed'].values.astype(int)

X1_train, X1_test, y1_train, y1_test = train_test_split(x1, y1, test_size = 0.25, random_state = 21)

classifier2 = DecisionTreeClassifier(criterion='entropy', random_state=42)
classifier2.fit(X1_train, y1_train)
y1_pred = classifier2.predict(X1_test)

print("Accuracy of Decision Tree : ",accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred)) 
print(classification_report(y_test, y_pred, target_names=[str(label) for label in definition_list_Crime_Description]))

Accuracy of Decision Tree :  0.20647410358565738
[[604 439 446 450 403]
 [446 379 373 372 376]
 [447 382 381 374 365]
 [454 380 370 370 364]
 [425 369 386 346 339]]
                precision    recall  f1-score   support

IDENTITY THEFT       0.25      0.26      0.26      2342
    CYBERCRIME       0.19      0.19      0.19      1946
    KIDNAPPING       0.19      0.20      0.20      1949
       ASSAULT       0.19      0.19      0.19      1938
     EXTORTION       0.18      0.18      0.18      1865

      accuracy                           0.21     10040
     macro avg       0.20      0.20      0.20     10040
  weighted avg       0.21      0.21      0.21     10040



In [23]:
# function to transform input data into desired input format for pridiction
def transform_input(sample, definition_lists):
    transformed_sample = []
    
    # Mapping each feature
    transformed_sample.append(np.where(definition_lists['date_of_occ'] == sample[0])[0][0] if sample[0] in definition_lists['date_of_occ'] else -1)
    transformed_sample.append(np.where(definition_lists['month_of_occ'] == sample[1])[0][0] if sample[1] in definition_lists['month_of_occ'] else -1)
    transformed_sample.append(np.where(definition_lists['year_of_occ'] == sample[2])[0][0] if sample[2] in definition_lists['year_of_occ'] else -1)
    transformed_sample.append(np.where(definition_lists['hour_of_occ'] == sample[3])[0][0] if sample[3] in definition_lists['hour_of_occ'] else -1)
    transformed_sample.append(np.where(definition_lists['mins_of_occ'] == sample[4])[0][0] if sample[4] in definition_lists['mins_of_occ'] else -1)
    transformed_sample.append(np.where(definition_lists['City'] == sample[5])[0][0] if sample[5] in definition_lists['City'] else -1)
    
    return np.array(transformed_sample).reshape(1, -1)


In [24]:
definition_lists1 = {
    'date_of_occ': definition_list_day,
    'month_of_occ': definition_list_month,
    'year_of_occ': definition_list_year,
    'hour_of_occ': definition_list_day,
    'mins_of_occ': definition_list_day,
    'City': definition_list_city,
    'Crime Description': definition_list_Crime_Description
}

In [25]:
#function to transform input data into desired input format for polic deployment prediction
def transform_inputMain(sample, definition_lists):
    transformed_sample = []
    
    # Mapping each feature
    transformed_sample.append(np.where(definition_lists['date_of_occ'] == sample[0])[0][0] if sample[0] in definition_lists['date_of_occ'] else -1)
    transformed_sample.append(np.where(definition_lists['month_of_occ'] == sample[1])[0][0] if sample[1] in definition_lists['month_of_occ'] else -1)
    transformed_sample.append(np.where(definition_lists['year_of_occ'] == sample[2])[0][0] if sample[2] in definition_lists['year_of_occ'] else -1)
    transformed_sample.append(np.where(definition_lists['hour_of_occ'] == sample[3])[0][0] if sample[3] in definition_lists['hour_of_occ'] else -1)
    transformed_sample.append(np.where(definition_lists['mins_of_occ'] == sample[4])[0][0] if sample[4] in definition_lists['mins_of_occ'] else -1)
    transformed_sample.append(np.where(definition_lists['City'] == sample[5])[0][0] if sample[5] in definition_lists['City'] else -1)
    transformed_sample.append(np.where(definition_lists['Crime Description'] == sample[6])[0][0] if sample[6] in definition_lists['Crime Description'] else -1)
    
    return np.array(transformed_sample).reshape(1, -1)

In [28]:
def predict(input_sample):
    cities = ["Delhi", "Mumbai", "Bangalore", "Kolkata", "Hyderabad", "Pune", "Patna"]
    
    for city in cities:
        # Make a copy of the original input to avoid modifying the original
        input_copy = input_sample.copy()
        
        # Add city to the input (but do not add crime description as we are predicting it)
        input_copy.append(city)
        
        # Transform input and predict crime description
        X_new = transform_input(input_copy, definition_lists)
        predicted_class = classifier1.predict(X_new)
        probs = classifier1.predict_proba(X_new)
        
        # Print the probability of each crime description
        for i in range(len(probs[0])):
            crime_desc = definition_list_Crime_Description[i]
            crime_prob = probs[0][i] * 100  # Crime probability as percentage
            
            input_police = input_copy.copy()
            input_police.append(crime_desc)
            # print(f"Input for Police Prediction (including crime description): {input_police}")

            X_police = transform_inputMain(input_police, definition_lists1)
            predicted_police = classifier2.predict(X_police)
            
            # Print the results
            print(f" {city} ----> {crime_desc} Probability is {crime_prob:.2f}%")
            if crime_prob != 0:
                print(f"Recommended Police for {crime_desc} in {city} is {definition_list_police[predicted_police[0]]}")
        print("\n")



In [29]:
#predicting potential crimes in city with probability & how much polic should be deployed
#input = ['date_of_occ','month_of_occ','year_of_occ','hour_of_occ','mins_of_occ']

input_sample = [22, 3, 23, 6, 30]
predict(input_sample)

 Delhi ----> IDENTITY THEFT Probability is 100.00%
Recommended Police for IDENTITY THEFT in Delhi is 5
 Delhi ----> CYBERCRIME Probability is 0.00%
 Delhi ----> KIDNAPPING Probability is 0.00%
 Delhi ----> ASSAULT Probability is 0.00%
 Delhi ----> EXTORTION Probability is 0.00%


 Mumbai ----> IDENTITY THEFT Probability is 100.00%
Recommended Police for IDENTITY THEFT in Mumbai is 12
 Mumbai ----> CYBERCRIME Probability is 0.00%
 Mumbai ----> KIDNAPPING Probability is 0.00%
 Mumbai ----> ASSAULT Probability is 0.00%
 Mumbai ----> EXTORTION Probability is 0.00%


 Bangalore ----> IDENTITY THEFT Probability is 0.00%
 Bangalore ----> CYBERCRIME Probability is 100.00%
Recommended Police for CYBERCRIME in Bangalore is 11
 Bangalore ----> KIDNAPPING Probability is 0.00%
 Bangalore ----> ASSAULT Probability is 0.00%
 Bangalore ----> EXTORTION Probability is 0.00%


 Kolkata ----> IDENTITY THEFT Probability is 0.00%
 Kolkata ----> CYBERCRIME Probability is 0.00%
 Kolkata ----> KIDNAPPING Proba