In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Load datasets
data = pd.read_csv('/kaggle/input/crime-cast-forecasting-crime-categories/train.csv')
test_data = pd.read_csv('/kaggle/input/crime-cast-forecasting-crime-categories/test.csv')

# Exploratory Data Analysis (EDA)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print(data.head())
print(data.isnull().sum())
print(data.describe())

# Visualizations
# Histogram (with KDE overlay) of the non-missing victim ages.
sns.histplot(data['Victim_Age'].dropna(), kde=True)
plt.title('Distribution of Victim Age')
plt.show()

# Horizontal bar chart of the target classes, most frequent first.
sns.countplot(y='Crime_Category', data=data, order=data['Crime_Category'].value_counts().index)
plt.title('Crime Category Counts')
plt.show()

# Data preprocessing function
def data_features_preprocessor(data_features, multilabel_binarizer=None, onehot_encoder=None, scaler=None, fit_scaler=True):
    """Clean a raw feature frame and optionally encode / scale it.

    The input frame is never modified; all work happens on a copy.

    Steps, in order:
      1. Negative ``Victim_Age`` values and ``Victim_Sex`` values other than
         "M", "F", "X" are turned into missing values.
      2. ``Date_Reported`` / ``Date_Occurred`` are parsed and expanded into
         year / month / day / day-of-week columns; ``Time_Occurred`` (HHMM)
         is expanded into ``Occurred_Hour`` and ``Occurred_Minute``.
      3. Free-text / redundant columns are dropped.
      4. ``Weapon_Used_Code`` is converted to strings; missing values in
         ``Modus_Operandi``, ``Weapon_Used_Code``, ``Victim_Sex`` and
         ``Victim_Descent`` become ``'Unknown'``; missing ``Victim_Age``
         becomes the most frequent age of the frame being processed.
      5. Optional encoding / scaling with the objects passed in.

    Parameters
    ----------
    data_features : pandas.DataFrame
        Raw features (without the target column).
    multilabel_binarizer : optional
        Already fitted object exposing ``transform`` and ``classes_``.  When
        given, ``Modus_Operandi`` is split on whitespace and replaced by one
        ``'Modus_Operandi:<code>'`` column per class.
    onehot_encoder : optional
        Already fitted object exposing ``transform`` and
        ``get_feature_names_out``.  When given, the categorical columns are
        replaced by its output and the result gets a fresh 0..n-1 index.
    scaler : optional
        Object exposing ``fit_transform`` / ``transform``; applied to every
        numeric column of the result.
    fit_scaler : bool, default True
        ``True`` refits ``scaler`` on this frame (the historical behaviour).
        Pass ``False`` for validation / test data so the statistics learned
        on the training data are reused.

    Returns
    -------
    pandas.DataFrame
        The processed copy.
    """
    # Work on a copy: the caller's frame stays intact, which also makes the
    # function safe to call twice on the same input.
    features = data_features.copy()

    # Invalid values become NaN so that the imputation below handles them.
    features['Victim_Age'] = features['Victim_Age'].mask(features['Victim_Age'] < 0)
    features['Victim_Sex'] = features['Victim_Sex'].where(features['Victim_Sex'].isin(['M', 'F', 'X']))

    # Calendar features from both timestamps (reported first, then occurred).
    date_format = '%m/%d/%Y %I:%M:%S %p'
    for prefix, column in (('Reported', 'Date_Reported'), ('Occurred', 'Date_Occurred')):
        stamps = pd.to_datetime(features[column], format=date_format)
        features[prefix + '_Year'] = stamps.dt.year
        features[prefix + '_Month'] = stamps.dt.month
        features[prefix + '_Day'] = stamps.dt.day
        features[prefix + '_DayOfWeek'] = stamps.dt.dayofweek

    # Time_Occurred is an HHMM number; left-pad to four digits before slicing.
    clock = features['Time_Occurred'].astype(int).astype(str).str.zfill(4)
    features['Occurred_Hour'] = clock.str.slice(0, 2).astype(int)
    features['Occurred_Minute'] = clock.str.slice(2, 4).astype(int)

    features = features.drop(columns=['Cross_Street', 'Date_Reported', 'Date_Occurred', 'Time_Occurred', 'Area_Name', 'Status_Description', 'Premise_Description', 'Weapon_Description'])

    # Stringify the weapon code but keep missing entries missing: astype(str)
    # may render NaN as the literal 'nan', which is mapped back to NaN here.
    weapon_codes = features['Weapon_Used_Code'].astype(str)
    features['Weapon_Used_Code'] = weapon_codes.where(weapon_codes != 'nan')

    # Constant imputation for categoricals, most-frequent for the age.
    constant_fill_columns = ['Modus_Operandi', 'Weapon_Used_Code', 'Victim_Sex', 'Victim_Descent']
    mode_fill_columns = ['Victim_Age']
    for current_column in constant_fill_columns:
        features[current_column] = features[current_column].fillna('Unknown')
    for current_column in mode_fill_columns:
        modes = features[current_column].mode()
        if not modes.empty:  # an all-missing column has no mode to fill with
            features[current_column] = features[current_column].fillna(modes.iloc[0])

    if multilabel_binarizer is not None:
        code_lists = features['Modus_Operandi'].apply(str.split)
        binarized_modus_operandi_df = pd.DataFrame(
            multilabel_binarizer.transform(code_lists),
            columns=['Modus_Operandi:' + name for name in multilabel_binarizer.classes_],
            # Reuse the frame's index so join() pairs rows correctly even when
            # the input index is not 0..n-1.
            index=features.index,
        )
        features = features.drop(columns='Modus_Operandi').join(binarized_modus_operandi_df)

    if onehot_encoder is not None:
        onehot_columns = ['Location', 'Area_ID', 'Reporting_District_no', 'Victim_Sex', 'Victim_Descent', 'Premise_Code', 'Weapon_Used_Code', 'Status']
        onehotencoded_df = pd.DataFrame(
            onehot_encoder.transform(features[onehot_columns]),
            columns=onehot_encoder.get_feature_names_out(),
        )
        # Both sides carry a 0..n-1 index so concat lines them up by position.
        features = features.drop(columns=onehot_columns).reset_index(drop=True)
        features = pd.concat([features, onehotencoded_df], axis=1)

    if scaler is not None:
        numeric_columns = features.select_dtypes(include=[np.number]).columns
        apply_scaler = scaler.fit_transform if fit_scaler else scaler.transform
        features[numeric_columns] = apply_scaler(features[numeric_columns])

    return features

# Preprocessing
data_features = data.drop("Crime_Category", axis=1)
data_labels = data['Crime_Category'].copy()

# Columns handed to the one-hot encoder inside data_features_preprocessor.
onehot_columns = ['Location', 'Area_ID', 'Reporting_District_no', 'Victim_Sex', 'Victim_Descent', 'Premise_Code', 'Weapon_Used_Code', 'Status']

# The encoders must learn their categories from the *cleaned* values, because
# that is what data_features_preprocessor later passes to transform():
# Weapon_Used_Code is stringified there and missing / invalid entries become
# 'Unknown'.  Fitting on the raw frame would leave those categories unseen and
# handle_unknown='ignore' would encode them as all-zero rows.
# A copy is passed so this cleaning-only run cannot alter data_features.
cleaned_features = data_features_preprocessor(data_features.copy())

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(cleaned_features['Modus_Operandi'].apply(str.split))

try:
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # scikit-learn < 1.2 only knows the older `sparse` keyword.
    onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
onehot_encoder.fit(cleaned_features[onehot_columns])

data_features_transformed = data_features_preprocessor(data_features, multilabel_binarizer, onehot_encoder)

# Train-test split
# 80 % training / 20 % validation with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(data_features_transformed, data_labels, test_size=0.20, random_state=69)

# Model definitions
# Every estimator below is seeded with the same value.
SEED = 69

# Random forest: 500 trees grown without bootstrapping and without a depth cap.
rf = RandomForestClassifier(
    n_estimators=500,
    bootstrap=False,
    max_depth=None,
    max_features=0.25,
    min_samples_leaf=1,
    min_samples_split=10,
    random_state=SEED,
)

# Gradient boosting: 700 stages of depth-10 trees, each on a 90 % row subsample.
gb = GradientBoostingClassifier(
    n_estimators=700,
    learning_rate=0.1,
    subsample=0.9,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=3,
    min_samples_split=3,
    random_state=SEED,
)

# Linear-kernel SVC with probability estimates enabled.
# NOTE: this model is defined but is not part of `base_models` below.
svc = SVC(
    kernel='linear',
    C=10,
    gamma=0.15,
    degree=1,
    coef0=0.15,
    tol=0.001,
    shrinking=False,
    probability=True,
    class_weight=None,
    random_state=SEED,
)

# XGBoost: 200 boosting rounds of depth-5 trees.
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.15,
    max_depth=5,
    min_child_weight=1,
    gamma=0.15,
    subsample=1.0,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1.75,
    scale_pos_weight=3,
    random_state=SEED,
)

# Final estimator of the stack: L1-penalised logistic regression (liblinear),
# fitted without an intercept.
meta_model = LogisticRegression(
    solver='liblinear',
    penalty='l1',
    C=0.8286427728546842,
    fit_intercept=False,
    class_weight=None,
    tol=0.01,
    max_iter=500,
    random_state=SEED,
)

# (name, estimator) pairs that feed the stacking classifier.
base_models = list(zip(('rf', 'gb', 'xgb'), (rf, gb, xgb)))

stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    verbose=3,
)

# Model training
# Fit the stacked ensemble on the training split.
stacking_clf.fit(X_train, y_train)

# Model evaluation
# Accuracy on the held-out validation split.
y_pred = stacking_clf.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Stacking Classifier Accuracy: {accuracy:.3f}')

# Test data preprocessing and prediction
# The test set goes through the same preprocessing with the already fitted
# binarizer and one-hot encoder.
test_data_preprocessed = data_features_preprocessor(
    test_data,
    multilabel_binarizer,
    onehot_encoder,
)
kaggle_y_predict = stacking_clf.predict(test_data_preprocessed)

# Submission
# IDs are simply 1..N in prediction order.
submission_ids = range(1, len(kaggle_y_predict) + 1)
submission = pd.DataFrame({'ID': submission_ids, 'Crime_Category': kaggle_y_predict})
submission.to_csv('submission.csv', index=False)

Random Forest Classifier:  

evaluation score = 91.9%  
test score = 91.6%  
best_params = {
    'n_estimators': [500],
    'max_features': [0.25],
    'max_depth': [None],
    'min_samples_split': [10],
    'min_samples_leaf': [1],
    'bootstrap': [False]
}  
evaluation score after hyperparameter tuning: 94.3%  

       

            
Gradient Boosting Classifier:  
evaluation score = 93.8%  
test score = 93.3%  
best_params = {
    'n_estimators': [700],
    'learning_rate': [0.1],
    'max_depth': [10],
    'min_samples_split': [3],
    'min_samples_leaf': [3],
    'subsample': [0.9],
    'max_features': ['sqrt']
}  
evaluation score after hyperparameter tuning: 94.2%  

      
     
SVC Classifier:  
evaluation score = 58.9%  
test score = 58.6%  
best_params = {
    'tol': 0.001,
    'shrinking': False,
    'probability': True,
    'kernel': 'linear',
    'gamma': 0.15,
    'degree': 1,
    'coef0': 0.15,
    'class_weight': None,
    'C': 10
}  
evaluation score after hyperparameter tuning: 91.9%  

       
      

XGBoost Classifier:  
evaluation score = 95.4%  
test score = 95.14%  
best_params = {
    'n_estimators': [200],
    'max_depth': [5],
    'learning_rate': [0.15],
    'subsample': [1.0],
    'colsample_bytree': [0.8],
    'gamma': [0.15],
    'min_child_weight': [1],
    'reg_alpha': [0],
    'reg_lambda': [1.75],
    'scale_pos_weight': [3]
}  
evaluation score after hyperparameter tuning: 95.18%  


      
     
Logistic Regression Classifier:  
evaluation score = 58.6%  
test score = 58.8%  
best_params = {
    'tol': 0.01,
    'solver': 'liblinear',
    'penalty': 'l1',
    'max_iter': 500,
    'fit_intercept': False,
    'class_weight': None,
    'C': 0.8286427728546842
}  
evaluation score after hyperparameter tuning: 93.7%  

       
      