# Ensemble Classifier Project

In [None]:
!pip install kagglehub shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
import shap
import kagglehub
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load data
# Downloads the Kaggle dataset, tags every row with a label taken from the
# CSV it came from, and derives two numeric features for the classifiers.
path = kagglehub.dataset_download("vesuvius13/serial-killers-dataset")

# Explicit file -> label mapping (the file names encode a victim-count range).
category_by_file = {
    'Lessthan_5_victim_count.csv': 'Low',
    '5_to_14_victim_count.csv': 'Medium',
    '15_to_30_victim_count.csv': 'High',
    'Highest_victim_count.csv': 'Very High',
}

dfs = []
for file, category in category_by_file.items():
    df = pd.read_csv(f'{path}/{file}')
    df['category'] = category
    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)

# First run of digits in the victim count. Casting to str first keeps rows
# whose cell was parsed as a number: `.str` methods yield NaN for non-string
# values (and raise on a fully numeric column).
data['victims'] = (
    data['Proven victims'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
)

# Duration of activity: inclusive span between the earliest and latest
# 4-digit year mentioned in 'Years active'. The previous `.str.len()` measured
# the character count of the text rather than a number of years.
# NOTE(review): assumes the column holds text such as "1990 to 1999" or
# "1977"; rows without any 4-digit year become NaN and are dropped below.
year_matches = data['Years active'].astype(str).str.extractall(r'(\d{4})')[0].astype(int)
years_by_row = year_matches.groupby(level=0)
data['years'] = years_by_row.max() - years_by_row.min() + 1

data = data.dropna(subset=['victims', 'years'])

# NOTE(review): the label comes from victim-count ranges in the file names, so
# `victims` presumably determines `category` almost exactly — confirm that
# this leakage is intended before trusting the accuracy figures.
X = data[['victims', 'years']]
y = data['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Data shape: {X.shape}, Classes: {y.unique()}')

In [None]:
# Ensemble Models
# Both estimators are seeded so repeated runs give the same scores.
models = {
    'RF': RandomForestClassifier(random_state=42),
    'GB': GradientBoostingClassifier(random_state=42)
}

# Baseline performance
# Fit each model with default hyperparameters and record its accuracy on the
# held-out test split.
baseline = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    baseline[name] = accuracy_score(y_test, model.predict(X_test))
    print(f'{name} Baseline: {baseline[name]:.3f}')

# Voting Classifier
# The member estimators are seeded like the standalone models above; without
# a random_state the voting accuracy changed from run to run.
voting = VotingClassifier([
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
])
voting.fit(X_train, y_train)
baseline['Voting'] = accuracy_score(y_test, voting.predict(X_test))
print(f'Voting: {baseline["Voting"]:.3f}')

In [None]:
# Hyperparameter Tuning
# Search grid per model key; keys match those of `models`.
params = {
    'RF': {
        'n_estimators': [50, 100],
        'max_depth': [5, 10],
    },
    'GB': {
        'n_estimators': [50, 100],
        'learning_rate': [0.1, 0.2],
    },
}

# For every model: run a 3-fold grid search on the training split, then
# record the refit best estimator together with its test-set accuracy.
tuned = {}
for name in models:
    model = models[name]
    grid = GridSearchCV(model, params[name], cv=3)
    grid.fit(X_train, y_train)

    test_predictions = grid.predict(X_test)
    tuned[name] = {
        'model': grid.best_estimator_,
        'score': accuracy_score(y_test, test_predictions),
    }
    print(f'{name} Tuned: {tuned[name]["score"]:.3f}, Best params: {grid.best_params_}')

In [None]:
# Comparison Graphs
# Left panel: baseline vs. tuned accuracy per model.
# Right panel: feature importances of the best tuned model.
plt.figure(figsize=(12, 5))

# Performance comparison
plt.subplot(1, 2, 1)
models_list = list(baseline.keys())
baseline_scores = [baseline[m] for m in models_list]
# Models that were not tuned (no entry in `tuned`) reuse their baseline score.
tuned_scores = [
    tuned[m].get('score', baseline[m]) if m in tuned else baseline[m]
    for m in models_list
]

x = range(len(models_list))
bar_width = 0.4
# Draw the two series side by side, shifted left/right of each tick.
for offset, scores, series_label in (
    (-0.2, baseline_scores, 'Baseline'),
    (0.2, tuned_scores, 'Tuned'),
):
    plt.bar([i + offset for i in x], scores, bar_width, label=series_label)
plt.xticks(x, models_list)
plt.ylabel('Accuracy')
plt.title('Model Performance Comparison')
plt.legend()

# Feature importance
plt.subplot(1, 2, 2)
# Pick the tuned entry with the highest test accuracy.
best_entry = max(tuned.values(), key=lambda entry: entry['score'])
best_model = best_entry['model']
plt.bar(['victims', 'years'], best_model.feature_importances_)
plt.title('Feature Importance')
plt.ylabel('Importance')

plt.tight_layout()
plt.show()

In [None]:
# SHAP Analysis
# Explains the best tuned model on the test split and prints a short summary.

# shap's TreeExplainer rejects a scikit-learn GradientBoostingClassifier with
# more than two classes, so explain the tuned random forest in that case
# instead of crashing.
explained_model = best_model
if isinstance(best_model, GradientBoostingClassifier) and len(best_model.classes_) > 2:
    explained_model = tuned['RF']['model']
    print('SHAP cannot explain a multiclass GradientBoostingClassifier; explaining the tuned RF model instead')

explainer = shap.TreeExplainer(explained_model)
shap_values = explainer.shap_values(X_test)

plt.figure(figsize=(10, 6))
# plot_size=None keeps the figure size chosen above; the default "auto"
# resizes the current figure and silently discards figsize.
shap.summary_plot(
    shap_values,
    X_test,
    feature_names=['victims', 'years'],
    plot_size=None,
    show=False,
)
plt.title('SHAP Feature Importance')
plt.show()

# Highest test accuracy among the tuned models, computed once.
best_score = max(entry['score'] for entry in tuned.values())

print('\n=== RESULTS ===')
print(f'Best model accuracy: {best_score:.3f}')
print('Hyperparameter tuning output completed')
print('SHAP explanations generated')
print('Ensemble models compared successfully')