In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import PolynomialFeatures


# Load the dataset
# df = pd.DataFrame(data, columns=attributes)
df = pd.read_csv('01_District_wise_crimes_committed_IPC_2001_2012.csv')

# Calculate total crimes and categorize them
df['TOTAL_CRIMES'] = df.drop(columns=['YEAR', 'STATE/UT']).sum(axis=1)

# Define bins for categorizing total crimes
bins = [0, df['TOTAL_CRIMES'].quantile(0.33), df['TOTAL_CRIMES'].quantile(0.66), df['TOTAL_CRIMES'].max()]
labels = ['low', 'medium', 'high']
df['CRIME_CATEGORY'] = pd.cut(df['TOTAL_CRIMES'], bins=bins, labels=labels, include_lowest=True)

# Separate features and target variable
X = df.drop(columns=['YEAR', 'STATE/UT', 'TOTAL_CRIMES', 'CRIME_CATEGORY'])
y = df['CRIME_CATEGORY']

# Feature Engineering: Add polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Normalize features
scaler = StandardScaler()
X_poly = scaler.fit_transform(X_poly)

# Shuffle the data while maintaining the stratification
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Stratified manual split for 12 years of data
train_size = int(len(df_shuffled) * 9 / 12)  # 9 years for training
df_train = df_shuffled.iloc[:train_size]
df_test = df_shuffled.iloc[train_size:]

X_train = df_train.drop(columns=['YEAR', 'STATE/UT', 'TOTAL_CRIMES', 'CRIME_CATEGORY'])
y_train = df_train['CRIME_CATEGORY']
X_test = df_test.drop(columns=['YEAR', 'STATE/UT', 'TOTAL_CRIMES', 'CRIME_CATEGORY'])
y_test = df_test['CRIME_CATEGORY']

# Normalize features after the split
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Encode target labels with value between 0 and n_classes-1
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Define base models with hyperparameter tuning
param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
param_grid_dt = {'max_depth': [None, 10, 20], 'min_samples_split': [2, 10, 20]}
param_grid_rf = {'n_estimators': [50, 100], 'max_depth': [None, 10, 20]}

svm = GridSearchCV(SVC(probability=True), param_grid_svm, cv=5)
decision_tree = GridSearchCV(DecisionTreeClassifier(), param_grid_dt, cv=5)
random_forest = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5)

base_models = [('svm', svm), ('decision_tree', decision_tree), ('random_forest', random_forest)]

# Stratified K-Fold cross-validation for base models
kf = StratifiedKFold(n_splits=5)
val_predictions = np.zeros((len(X_train), len(base_models)))

for i, (name, model) in enumerate(base_models):
    fold_predictions = np.zeros(len(X_train))
    for train_index, val_index in kf.split(X_train, y_train):
        X_fold_train, X_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_val = y_train[train_index], y_train[val_index]
        
        model.fit(X_fold_train, y_fold_train)
        fold_predictions[val_index] = model.predict(X_val)
    
    val_predictions[:, i] = fold_predictions

#Re-train base models on full 9-year training data
for name, model in base_models:
    model.fit(X_train, y_train)

# Collect predictions for the entire 3-year testing period
test_predictions = np.zeros((len(X_test), len(base_models)))

for i, (name, model) in enumerate(base_models):
    test_predictions[:, i] = model.predict(X_test)

# Use predictions from base models to train the meta-model
meta_model = SVC(probability=True)
meta_model.fit(val_predictions, y_train)  # Train on validation predictions

# Recalibrate meta-model and make final predictions
final_predictions = meta_model.predict(test_predictions)

# Evaluate final predictions
accuracy = accuracy_score(y_test, final_predictions)
print(f'Final model accuracy: {accuracy}')
print(classification_report(y_test, final_predictions))


Final model accuracy: 0.9924611973392461
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       723
           1       1.00      0.99      0.99       763
           2       0.99      0.99      0.99       769

    accuracy                           0.99      2255
   macro avg       0.99      0.99      0.99      2255
weighted avg       0.99      0.99      0.99      2255

