In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KernelDensity
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_absolute_error
from sklearn.linear_model import PoissonRegressor
from sklearn.cluster import KMeans
import joblib
import os


In [43]:
print(np.__version__)

1.24.3


In [44]:
print(joblib.__version__)

1.4.2


In [45]:
import sklearn
print(sklearn.__version__)

1.6.1


In [46]:
print(pd.__version__)

2.0.3


In [47]:

# Create models directory
os.makedirs('models', exist_ok=True)

# Load dataset (adjust path as needed)
df = pd.read_csv("original_dataset.csv")  # Replace with your CSV file path
df = df.copy()


In [48]:
# Step 1: Preprocessing
print("Preprocessing dataset...")

# Ensure required columns are present
required_cols = ['Latitude', 'Longitude', 'Date_Time', 'Crime_Type', 'District', 'Arrest']
if not all(col in df.columns for col in required_cols):
    raise ValueError(f"CSV must contain: {required_cols}")



Preprocessing dataset...


In [49]:
# Parse Date_Time
df['Date_Time'] = pd.to_datetime(df['Date_Time'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')
df = df.dropna(subset=['Date_Time', 'Latitude', 'Longitude', 'Crime_Type', 'District', 'Arrest'])
df.loc[:, 'Date'] = df['Date_Time'].dt.date
df.loc[:, 'Hour'] = df['Date_Time'].dt.hour
df.loc[:, 'Month'] = df['Date_Time'].dt.month



In [50]:
# Encode categorical columns
label_encoders = {}
for col in ['Crime_Type', 'District']:
    le = LabelEncoder()
    df.loc[:, f'{col}_Encoded'] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le
    joblib.dump(le, f'models/{col.lower()}_encoder.pkl')

In [51]:
# Convert Arrest to binary (TRUE/FALSE to 1/0)
df.loc[:, 'Arrest Status'] = df['Arrest'].map({True: 1, False: 0, 'TRUE': 1, 'FALSE': 0}).fillna(0)
# ... (previous imports and initial preprocessing steps remain the same)

In [52]:
# After merging Risk
crime_counts = df.groupby(['Latitude', 'Longitude']).size().reset_index(name='CrimeCount')
threshold = crime_counts['CrimeCount'].quantile(0.75)
crime_counts['Risk'] = (crime_counts['CrimeCount'] > threshold).astype(int)
df = df.merge(crime_counts[['Latitude', 'Longitude', 'Risk']], on=['Latitude', 'Longitude'], how='left')
df['Risk'] = df['Risk'].fillna(0).astype(int)  # Fill NaN with 0

In [53]:
# After assigning District_Cluster
crime_dist_by_district = pd.crosstab(index=df['District'], columns=df['Crime_Type'], normalize='index')
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
district_clusters = kmeans.fit_predict(crime_dist_by_district)
district_to_cluster = dict(zip(crime_dist_by_district.index, district_clusters))
df.loc[:, 'District_Cluster'] = df['District'].map(district_to_cluster)
df['District_Cluster'] = df['District_Cluster'].fillna(0).astype(int)  # Fill NaN with 0



In [54]:
# Save preprocessed dataset
df.to_csv("preprocessed_dataset.csv", index=False)
print("Preprocessed dataset saved as 'preprocessed_dataset.csv'")


Preprocessed dataset saved as 'preprocessed_dataset.csv'


In [55]:

# Step 2: Model Development
# 1. KDE for Hotspots
print("\nTraining KDE Model...")
coordinates = df[['Latitude', 'Longitude']].values
kde = KernelDensity(bandwidth=0.01, kernel='gaussian')
kde.fit(coordinates)



Training KDE Model...


In [56]:
joblib.dump(kde, 'models/kde_model.pkl')


['models/kde_model.pkl']

In [57]:
# Test KDE with the sample record
example_coords = np.array([[15.1763, 77.69]])
kde_score = kde.score_samples(example_coords)[0]
kde_intensity = float(np.exp(kde_score))
print(f"KDE Example Prediction: Latitude=15.1763, Longitude=77.69, Intensity={kde_intensity:.4f}")


KDE Example Prediction: Latitude=15.1763, Longitude=77.69, Intensity=0.2745


In [58]:
# 2. SVM for Risk Classification
print("\nTraining SVM Model...")
# Remove Risk column if it already exists to avoid merge conflicts
if 'Risk' in df.columns:
    df = df.drop(columns=['Risk'])
    print("Dropped existing 'Risk' column to avoid merge conflict.")

# Define risk based on crime frequency per location
crime_counts = df.groupby(['Latitude', 'Longitude']).size().reset_index(name='CrimeCount')
threshold = crime_counts['CrimeCount'].quantile(0.75)  # Top 25% as high risk
crime_counts['Risk'] = (crime_counts['CrimeCount'] > threshold).astype(int)
df = df.merge(crime_counts[['Latitude', 'Longitude', 'Risk']], on=['Latitude', 'Longitude'], how='left')
df['Risk'] = df['Risk'].fillna(0).astype(int)  # Fill NaN with 0

features = ['Latitude', 'Longitude', 'Hour', 'Crime_Type_Encoded', 'District_Encoded', 'Arrest Status']
X = df[features]
y = df['Risk']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
svm_model = SVC(kernel='rbf', probability=True)
svm_model.fit(X_train, y_train)



Training SVM Model...
Dropped existing 'Risk' column to avoid merge conflict.


In [59]:
y_pred = svm_model.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, y_pred))


SVM Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      5586
           1       0.00      0.00      0.00       272

    accuracy                           0.95      5858
   macro avg       0.48      0.50      0.49      5858
weighted avg       0.91      0.95      0.93      5858



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [60]:
joblib.dump(svm_model, 'models/svm_model.pkl')


['models/svm_model.pkl']

In [61]:
# Test SVM with the sample record
crime_type_encoded = label_encoders['Crime_Type'].transform(['THEFT'])[0]
district_encoded = label_encoders['District'].transform(['Anantapur'])[0]
example_svm = np.array([[15.1763, 77.69, 15, crime_type_encoded, district_encoded, 0]])
svm_probs = svm_model.predict_proba(example_svm)[0]
print(f"SVM Example Prediction: High Risk={svm_probs[1]:.4f}, Low Risk={svm_probs[0]:.4f}")


SVM Example Prediction: High Risk=0.0391, Low Risk=0.9609




In [62]:

# Assuming df is your preprocessed DataFrame with encoded columns
print("\nTraining Poisson Model...")
crime_counts_by_time = df.groupby(['Date', 'Latitude', 'Longitude', 'Crime_Type_Encoded']).size().reset_index(name='CrimeCount')
crime_counts_by_time = crime_counts_by_time.merge(
    df[['Date', 'Latitude', 'Longitude', 'Hour', 'Month', 'District_Encoded']].drop_duplicates(),
    on=['Date', 'Latitude', 'Longitude'],
    how='left'
)
features_poisson = ['Latitude', 'Longitude', 'Hour', 'Month', 'District_Encoded', 'Crime_Type_Encoded']
X_poisson = crime_counts_by_time[features_poisson]
y_poisson = crime_counts_by_time['CrimeCount']
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_poisson, y_poisson, test_size=0.2, random_state=42)
poisson_model = PoissonRegressor()
poisson_model.fit(X_train_p, y_train_p)



Training Poisson Model...


In [63]:
y_pred_p = poisson_model.predict(X_test_p)
mae = mean_absolute_error(y_test_p, y_pred_p)
print(f"Poisson MAE: {mae:.4f}")
joblib.dump(poisson_model, 'models/poisson_model.pkl')


Poisson MAE: 0.0013


['models/poisson_model.pkl']

In [64]:

# Training K-Means Model
print("\nTraining K-Means Model...")
crime_dist_by_district = pd.crosstab(index=df['District'], columns=df['Crime_Type'], normalize='index')
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
district_clusters = kmeans.fit_predict(crime_dist_by_district)
district_to_cluster = dict(zip(crime_dist_by_district.index, district_clusters))
# Ensure all districts from the form are included
all_districts = ["Anantapur", "Chittoor", "East Godavari", "Guntur", "Krishna", "Kurnool", "Nellore",
                 "Prakasam", "Srikakulam", "Vishakhapatnam", "Vizianagaram", "West Godavari"]
for district in all_districts:
    if district not in district_to_cluster:
        district_to_cluster[district] = -1
df.loc[:, 'District_Cluster'] = df['District'].map(district_to_cluster)
cluster_crime_dist = pd.crosstab(index=df['District_Cluster'], columns=df['Crime_Type'], normalize='index')
# Convert to dictionary with labels
crime_types = cluster_crime_dist.columns.tolist()  # e.g., ['THEFT', 'ASSAULT', ...]
cluster_crime_dist_dict = {
    cluster: dict(zip(crime_types, cluster_crime_dist.loc[cluster].values))
    for cluster in cluster_crime_dist.index
}
joblib.dump(district_to_cluster, 'models/district_cluster_mapping.pkl')



Training K-Means Model...




['models/district_cluster_mapping.pkl']

In [65]:
joblib.dump(cluster_crime_dist_dict, 'models/cluster_crime_distribution.pkl')


['models/cluster_crime_distribution.pkl']

In [66]:
joblib.dump(crime_types, 'models/crime_types.pkl')  # Save crime type labels

['models/crime_types.pkl']