In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split

# Load the raw training data
df = pd.read_csv("XY_train.csv", encoding='iso-8859-1')

# ======== MISSION 0 ========
# clean the input dataset and split it into training & testing datasets

# remove irrelevant columns
df_imputed = df.drop(columns=['ID', 'POSTAL_CODE'])

# convert the categorical columns to ordinal codes; a value that is not a key
# of its mapping becomes NaN and the row is removed by dropna() below
category_codes = {
    'GENDER': {'male': 0, 'female': 1},
    'EDUCATION': {'none': 0, 'high school': 1, 'university': 2},
    'VEHICLE_TYPE': {'sedan': 0, 'sports car': 1},
    'VEHICLE_YEAR': {'before 2015': 0, 'after 2015': 1},
    'INCOME': {'poverty': 0, 'working class': 1, 'middle class': 2, 'upper class': 3},
}
for column, codes in category_codes.items():
    df_imputed[column] = df_imputed[column].map(codes)

# bucket the numeric columns into five ordinal groups.
# pd.cut intervals are open on the left by default, so without
# include_lowest=True a value of exactly 0 (e.g. zero years of driving
# experience) would turn into NaN and the whole row would be silently
# discarded by dropna() below.
bucket_labels = [0, 1, 2, 3, 4]
df_imputed['AGE'] = pd.cut(df_imputed['AGE'], bins=[0, 20, 30, 50, 70, 100],
                           labels=bucket_labels, include_lowest=True)
df_imputed['DRIVING_EXPERIENCE'] = pd.cut(df_imputed['DRIVING_EXPERIENCE'], bins=[0, 1, 3, 10, 20, 100],
                                          labels=bucket_labels, include_lowest=True)

# remove rows with missing values
df_imputed = df_imputed.dropna()

# save the cleaned dataset
df_imputed.to_csv('XY_train_imputed.csv', index=False)

# define X (features) and Y (target variable for prediction)
# target variable is 'OUTCOME'
X = df_imputed.drop(columns=['OUTCOME'])
Y = df_imputed['OUTCOME']

# hold out 20% of the rows for testing; fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# ======== MISSION 1 ========

# baseline decision tree with default hyperparameters.
# random_state is fixed (as for the tuned trees further down) so the baseline
# scores are reproducible from run to run.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, Y_train)


def print_dtc_score(dtc, X_train, Y_train, X_test, Y_test):
    """Print the accuracy of a fitted classifier on the training and test data.

    ``dtc`` only needs a ``predict`` method. For each dataset the predictions
    are compared with the true labels via ``accuracy_score`` and the result is
    printed; nothing is returned.
    """
    print("=================== MODEL ACCURACY SCORES ===================")

    # (message prefix, features, true labels) - training first, then test
    reports = (
        ("Model accuracy score for TRAINING dataset: ", X_train, Y_train),
        ("Model accuracy score for TEST dataset: ", X_test, Y_test),
    )
    for prefix, features, targets in reports:
        score = accuracy_score(targets, dtc.predict(features))
        print(f"{prefix}{score}")


# report train/test accuracy of the untuned baseline tree
print_dtc_score(dtc, X_train, Y_train, X_test, Y_test)

# ======== MISSION 3 ========
# Hyperparameter tuning with GridSearchCV.
# Step one is a coarse, one-dimensional sweep over min_samples_leaf, scored by
# cross-validated accuracy, to see where a sensible search range starts.
Min_Sample_Leaves_Amount = np.arange(1, 1350, 50)
ParamatersDict = {'min_samples_leaf': Min_Sample_Leaves_Amount}
TreeModel = DecisionTreeClassifier(random_state=42)
SKFold_CV = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

grid = GridSearchCV(
    estimator=TreeModel,
    param_grid=ParamatersDict,
    scoring='accuracy',
    cv=SKFold_CV,
    n_jobs=-1,
    refit=True,
    return_train_score=True,
)
grid.fit(X_train, Y_train)

# keep only the columns needed for the plot, ordered by leaf size
cv_results = pd.DataFrame(grid.cv_results_)
Selected_Leaves = ['mean_test_score', 'mean_train_score', 'param_min_samples_leaf']
DF_Selected_Values = cv_results[Selected_Leaves].sort_values('param_min_samples_leaf', ascending=True)

# train curve first, validation curve second (the legend relies on this order)
leaf_axis = DF_Selected_Values['param_min_samples_leaf']
plt.figure(figsize=(13, 4))
for score_column, marker_style in (('mean_train_score', 'x'), ('mean_test_score', 'o')):
    plt.plot(leaf_axis, DF_Selected_Values[score_column], marker=marker_style, markersize=4)
plt.title('min samples leaves accuracy score')
plt.legend(['Train accuracy', 'Validation accuracy'])
plt.xlabel('min_samples_leaf')
plt.xticks([int(leaf) for leaf in leaf_axis])
plt.ylabel('accuracy score')
#plt.show()

# define the hyperparameters to tune
max_depth_list = np.arange(1, 20)
leaf_samples = np.arange(50, 300, 5)
params_dt = {
    'max_depth': max_depth_list,
    'criterion': ['entropy', 'gini'],
    'class_weight': ['balanced', None],
    'min_samples_leaf': leaf_samples,
}
# define the GridSearchCV
TreeModel = DecisionTreeClassifier(random_state=42)
SKFold_CV = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
# exhaustive search over params_dt scored by cross-validated accuracy;
# refit=True retrains the winning combination so best_estimator_ is usable
Grids_Tree = GridSearchCV(estimator=TreeModel, param_grid=params_dt, scoring='accuracy',
                          cv=SKFold_CV, n_jobs=-1, refit=True, return_train_score=True)
Grids_Tree.fit(X_train, Y_train)

best_hyperparameters = Grids_Tree.best_params_
best_TreeModel = Grids_Tree.best_estimator_

# tabulate the ten best parameter combinations
cv_results = pd.DataFrame(Grids_Tree.cv_results_)
Selected_Leaves = ['std_test_score', 'mean_test_score', 'mean_train_score',
                   'param_max_depth', 'param_criterion', 'param_class_weight', 'param_min_samples_leaf']
score_columns = ['mean_test_score', 'mean_train_score', 'std_test_score']
# DataFrame.round returns a new frame, so nothing is assigned into the head()
# slice; assigning rounded columns one by one into that slice can raise
# pandas' SettingWithCopyWarning.
DF_Selected_Values = (
    cv_results[Selected_Leaves]
    .sort_values('mean_test_score', ascending=False)
    .head(10)
    .round({column: 4 for column in score_columns})
)
column_names = {
    'mean_test_score': 'Mean test score',
    'mean_train_score': 'Mean train score',
    'param_max_depth': 'Max depth',
    'param_criterion': 'Criterion',
    'param_class_weight': 'Class weight',
    'param_min_samples_leaf': 'Min samples leaf',
    'std_test_score': 'std test score'
}

# render the results as a matplotlib table (axes hidden, table only)
DF_Selected_Values = DF_Selected_Values.rename(columns=column_names)
fig, ax = plt.subplots(figsize=(10, 3))
ax.axis('off')
ax.set_title('Grid Search results', y=1.1)
table = ax.table(cellText=DF_Selected_Values.values,
                 colLabels=DF_Selected_Values.columns, loc='center')
table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1.25, 1.25)
#plt.show()

# features importance:
# one row per feature of the tuned tree, most important feature first
importance = pd.DataFrame({
    'Feature_name': X_train.columns,
    'Importance': best_TreeModel.feature_importances_.round(4),
}).sort_values(by='Importance', ascending=False)

# draw the frame as a table-only figure (axes switched off)
fig, ax = plt.subplots(figsize=(9, 10))
ax.axis('off')
table = ax.table(
    cellText=importance.values,
    colLabels=importance.columns,
    loc='center',
)
table.auto_set_font_size(False)
table.set_fontsize(11)
table.scale(0.6, 1.5)
#plt.show()

# ======== MISSION 4 ========
best_estimator = Grids_Tree.best_estimator_
best_params = Grids_Tree.best_params_

# print the accuracy scores for the best estimator model
print_dtc_score(best_estimator, X_train, Y_train, X_test, Y_test)

# plot the top levels (max_depth=2) of the best estimator and save the figure.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index.
plt.figure(figsize=(20, 16))
plot_tree(best_estimator, filled=True, max_depth=2, feature_names=list(X_train.columns),
          class_names=['0', '1'], fontsize=6)
plt.savefig('decision_tree.png')

# print the feature importance for the best estimator model (ordered by importance)
feature_importances = sorted(
    zip(X_train.columns, best_estimator.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

print("=================== FEATURE IMPORTANCE SCORES ===================")
# the loop variable is deliberately not called `importance`, so the
# `importance` DataFrame built for the importance table is not overwritten
for feature_name, feature_score in feature_importances:
    print(f"{feature_name}: {feature_score}")


In [None]:
# show the hyperparameter combination selected by the full grid search
best_params = Grids_Tree.best_params_
print("Best hyperparameters:", best_params)


In [None]:
import pickle

# Save the best model to a file
# (best_estimator is Grids_Tree.best_estimator_; pickle needs binary mode)
with open('best_decision_tree_model.pkl', 'wb') as file:
    pickle.dump(best_estimator, file)

print("Best model saved as 'best_decision_tree_model.pkl'")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, make_scorer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
%matplotlib inline
import pandas as pd
from sympy.physics.control.control_plots import matplotlib

# Load data
# NOTE(review): google.colab is only importable inside Google Colab;
# presumably the upload dialog is used to place XY_train.csv in the working
# directory before it is read - confirm when running elsewhere.
from google.colab import files
uploaded = files.upload()
XY_train = pd.read_csv("XY_train.csv")

# count the missing values in every column
missing_data_per_column = XY_train.isnull().sum()
print(missing_data_per_column)
# Fill the missing numeric values with a k-nearest-neighbours imputer
from sklearn.impute import KNNImputer

# correlation of each column that will be imputed with the target
correlation1 = XY_train['ANNUAL_MILEAGE'].corr(XY_train['OUTCOME'])
correlation2 = XY_train['CREDIT_SCORE'].corr(XY_train['OUTCOME'])

numeric_columns = ["ANNUAL_MILEAGE", "CREDIT_SCORE"]
knn_imputer = KNNImputer(n_neighbors=6)

# the imputer sees only the two numeric columns and returns a plain array,
# which is wrapped back into a frame carrying the same column names
imputed_values = knn_imputer.fit_transform(XY_train[numeric_columns])
df_numeric = pd.DataFrame(imputed_values, columns=numeric_columns)

# overwrite the original columns with the imputed ones
XY_train[numeric_columns] = df_numeric

# Show the current columns.
# NOTE(review): the printed heading says the ID column was dropped, but nothing
# above this point drops it - it is removed later when X is built.
print("Column Names after Dropping ID:")
print(XY_train.columns)

# Ordinal codes for each categorical column
education_map = {"none": 0, "high school": 1, "university": 2}
income_map = {"poverty": 0, "working class": 1, "middle class": 2, "upper class": 3}
vehicle_year_map = {"before 2015": 0, "after 2015": 1}
vehicle_type_map = {"sedan": 0, "sports car": 1}

column_encodings = (
    ('GENDER', {"male": 0, "female": 1}),
    ('EDUCATION', education_map),
    ('INCOME', income_map),
    ('VEHICLE_YEAR', vehicle_year_map),
    ('VEHICLE_TYPE', vehicle_type_map),
)
# surrounding whitespace is stripped before the lookup; values without a
# matching key become NaN
for column_name, mapping in column_encodings:
    XY_train[column_name] = XY_train[column_name].str.strip().map(mapping)

# Verify the updated dataset
print("\nUpdated DataFrame Head:")
print(XY_train.head())

# Build the feature matrix, split it, scale it and fit a default MLP
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split

# Separate features and labels
X = XY_train.drop(columns=['ID','OUTCOME'])
Y = XY_train['OUTCOME']

# Set a random seed for reproducibility
random_seed = 42
print(X.shape)
print(Y.shape)
# class balance of the full label vector (Series.value_counts replaces the
# deprecated top-level pd.value_counts)
print(Y.value_counts())

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=123)

# Scale the features to [0, 1] with MinMaxScaler.
# Previously the scaled matrix was computed from all rows and then never used,
# so the network was trained on unscaled inputs. The scaler is now fitted on
# the training rows only (no information from the test rows leaks into it) and
# applied to both parts; column names and row index are preserved.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)
# scaled version of the complete feature matrix, kept under its original name
normalized_features = scaler.transform(X)

print(f"Train size: {X_train.shape[0]}")
print(f"Test size: {X_test.shape[0]}")
print("Train\n-----------\n", Y_train.value_counts()/Y_train.shape[0])
print("\nTest\n-----------\n", Y_test.value_counts()/Y_test.shape[0])
#Default model
model = MLPClassifier(random_state=42)
model.fit(X_train, Y_train)


from sklearn.metrics import accuracy_score

# Accuracy of the default model on the training set:
# predict from the training features, compare with the training labels
train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(Y_train, train_predictions)

# Same for the held-out test set
test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(Y_test, test_predictions)

# Print the results as percentages
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
# Section 2 - grid search
param_grid = {
    'hidden_layer_sizes': [(50,), (50, 25), (75, 50, 25)],
    'activation': ['tanh', 'relu'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'learning_rate': ['adaptive', 'invscaling'],
    'learning_rate_init': [0.0001, 0.001, 0.01],
    'max_iter': [200, 400],
    'solver': ['adam', 'sgd'],
    'early_stopping': [True, False]
}

# Base estimator for the search. `mlp` was referenced below without being
# defined in this cell; it uses the same seed as the default model so the two
# are comparable.
mlp = MLPClassifier(random_state=42)

# Set up GridSearchCV: 5-fold CV, scored by F1 of the positive class (label 1)
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, scoring=make_scorer(f1_score, pos_label=1), cv=5, verbose=3, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train,Y_train)

# improved model: GridSearchCV refits the best parameter combination on the
# whole training set by default (refit=True), so best_estimator_ is already
# trained and does not need another fit() call
improvedModel = grid_search.best_estimator_

from sklearn.metrics import accuracy_score

# Accuracy of the tuned model on the training set:
# predict from the training features, compare with the training labels
train_predictions = improvedModel.predict(X_train)
train_accuracy = accuracy_score(Y_train, train_predictions)

# Same for the held-out test set
test_predictions = improvedModel.predict(X_test)
test_accuracy = accuracy_score(Y_test, test_predictions)

# Print the results as percentages
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

#### Heat maps of the grid-search results
# Convert the grid search results to a DataFrame
results = pd.DataFrame(grid_search.cv_results_)

# hidden_layer_sizes values are tuples of different lengths; they are turned
# into strings so that each architecture is one plain row label no matter how
# pandas treats tuple keys when it builds the pivot index
results['param_hidden_layer_sizes'] = results['param_hidden_layer_sizes'].astype(str)

# (row parameter, column parameter, title, x label, y label) per heat map.
# The third title used to repeat the first one; it now names the parameters
# that are actually plotted.
heatmap_specs = [
    ('param_hidden_layer_sizes', 'param_activation',
     'Heat Map of Grid Search Results for hidden layer sizes and activation',
     'Activation', 'Hidden Layer Sizes'),
    ('param_alpha', 'param_learning_rate',
     'Heat Map of Grid Search Results for alpha and learning rate',
     'Learning Rate', 'Alpha'),
    ('param_learning_rate_init', 'param_max_iter',
     'Heat Map of Grid Search Results for learning rate init and max iter',
     'max_iter', 'learning_rate_init'),
]

for index_param, column_param, title, x_label, y_label in heatmap_specs:
    # mean CV score over every combination of the parameters not shown
    heatmap_data = results.pivot_table(index=index_param, columns=column_param,
                                       values='mean_test_score', aggfunc='mean')
    plt.figure(figsize=(10, 8))
    sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu', fmt='.3f', cbar=True)
    plt.title(title, fontsize=20)
    plt.xlabel(x_label, fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.show()
from sklearn.metrics import confusion_matrix

# Confusion matrix of the tuned model on the test set
Y_pred = improvedModel.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)

# heading and matrix on separate lines
print("Confusion Matrix:", cm, sep="\n")


In [None]:
import pandas as pd
from sympy.physics.control.control_plots import matplotlib

# Read the training data from disk
XY_train = pd.read_csv("XY_train.csv")

# Number of missing values in each column
missing_data_per_column = XY_train.isna().sum()
print(missing_data_per_column)

In [None]:
# Fill the missing numeric values with a k-nearest-neighbours imputer
from sklearn.impute import KNNImputer

# correlation of each column that will be imputed with the target
correlation1 = XY_train['ANNUAL_MILEAGE'].corr(XY_train['OUTCOME'])
correlation2 = XY_train['CREDIT_SCORE'].corr(XY_train['OUTCOME'])

numeric_columns = ["ANNUAL_MILEAGE", "CREDIT_SCORE"]
knn_imputer = KNNImputer(n_neighbors=6)

# the imputer sees only the two numeric columns and returns a plain array,
# which is wrapped back into a frame carrying the same column names
imputed_values = knn_imputer.fit_transform(XY_train[numeric_columns])
df_numeric = pd.DataFrame(imputed_values, columns=numeric_columns)

# overwrite the original columns with the imputed ones
XY_train[numeric_columns] = df_numeric


In [None]:
# Show the dtype of every column (heading and listing on separate lines)
print("Data Types:", XY_train.dtypes, sep="\n")

In [None]:
# Drop the first column (ID).
# The drop itself was missing, so the row identifier stayed in the frame and
# was later fed to the scaler, K-means and the SVM as if it were a feature.
# errors='ignore' means a frame without an ID column is left unchanged.
XY_train = XY_train.drop(columns=['ID'], errors='ignore')

# Inspect the column names and order
print("Column Names after Dropping ID:")
print(XY_train.columns)

# Encode categorical columns by name; surrounding whitespace is stripped
# before the lookup and values without a matching key become NaN
education_map = {"none": 0, "high school": 1, "university": 2}
income_map = {"poverty": 0, "working class": 1, "middle class": 2, "upper class": 3}
vehicle_year_map = {"before 2015": 0, "after 2015": 1}
vehicle_type_map = {"sedan": 0, "sports car": 1}

column_encodings = (
    ('GENDER', {"male": 0, "female": 1}),
    ('EDUCATION', education_map),
    ('INCOME', income_map),
    ('VEHICLE_YEAR', vehicle_year_map),
    ('VEHICLE_TYPE', vehicle_type_map),
)
for column_name, mapping in column_encodings:
    XY_train[column_name] = XY_train[column_name].str.strip().map(mapping)

# Verify the updated dataset
print("\nUpdated DataFrame Head:")
print(XY_train.head())


In [None]:
import  seaborn as sns
import matplotlib.pyplot as plt 
# pairwise scatter plots of all columns, points coloured by the OUTCOME label
sns.pairplot(XY_train, hue='OUTCOME')
plt.show()

In [None]:
# K-means clustering on standardized features
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

# The target column is kept aside; every other column is a clustering feature
labels = XY_train['OUTCOME']
features = XY_train.drop(columns=['OUTCOME'])

# Standardize the features: learn the statistics, then apply them
scaler = StandardScaler()
scaler.fit(features)
normalized_features = scaler.transform(features)

# Seed shared by the K-means runs so results are repeatable
random_seed = 42

# Two clusters; fit_predict returns the cluster index of every row
kmeans = KMeans(n_clusters=2, random_state=random_seed)
clusters = kmeans.fit_predict(normalized_features)



In [None]:
from sklearn.metrics import (
    adjusted_rand_score,
    davies_bouldin_score,
    normalized_mutual_info_score,
    silhouette_score,
)

# Clustering quality on the standardized features (before PCA).
# Silhouette and Davies-Bouldin use only the data and the cluster assignment;
# NMI compares the assignment with the true OUTCOME labels.
silhouette_before = silhouette_score(normalized_features, clusters)
db_index = davies_bouldin_score(normalized_features, clusters)
nmi_before = normalized_mutual_info_score(labels, clusters)

print(f"Silhouette Score (Before PCA): {silhouette_before:.2f}")
print(f"Davies-Bouldin Index: {db_index:.2f}")
print(f"Normalized Mutual Information (Before PCA): {nmi_before:.2f}")


In [None]:
# now with PCA: project the standardized features onto two components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)

# fit_transform fits `pca` and returns the projected data in one step
principal_components = pca.fit_transform(normalized_features)

# Print explained variance ratio for each component
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance_ratio)
#sum
print("Explained Variance Ratio sum:",pca.explained_variance_ratio_.sum())

# This cell used to end with `pca = PCA.fit(normalized_features)`. That calls
# fit on the PCA class rather than on an instance, so the array is taken as
# `self` and the call fails with a TypeError. `pca` is already fitted by
# fit_transform above, so no further fit is required.

In [None]:
import matplotlib.pyplot as plt

# Principal components as a frame, with the true label attached for colouring
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2']).assign(OUTCOME=labels)

# One scatter series per class, in order of first appearance
plt.figure(figsize=(8, 6))
for label in pca_df['OUTCOME'].unique():
    class_points = pca_df[pca_df['OUTCOME'] == label]
    plt.scatter(class_points['PC1'], class_points['PC2'], label=f'Class {label}', alpha=0.7)

plt.title("PCA Visualization")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# repeat the 2-cluster K-means, this time on the two principal components
kmeans_pca = KMeans(n_clusters=2, random_state=random_seed)
clusters_pca = kmeans_pca.fit_predict(principal_components)

In [None]:
# Clustering quality in PCA space (after PCA); same three metrics as before.
# NMI compares the cluster assignment with the true OUTCOME labels.
silhouette_after = silhouette_score(principal_components, clusters_pca)
db_index = davies_bouldin_score(principal_components, clusters_pca)
nmi_after = normalized_mutual_info_score(labels, clusters_pca)

print(f"Silhouette Score (After PCA): {silhouette_after:.2f}")
print(f"Davies-Bouldin Index: {db_index:.2f}")
print(f"Normalized Mutual Information (After PCA): {nmi_after:.2f}")

In [None]:
# Scatter plot of the K-means clusters in PCA space, one series per cluster
plt.figure(figsize=(8, 6))
for cluster in np.unique(clusters_pca):
    # rows assigned to the current cluster
    subset = pca_df[clusters_pca == cluster]
    first_component = subset['PC1']
    second_component = subset['PC2']
    plt.scatter(first_component, second_component, label=f'Cluster {cluster}', alpha=0.7)

plt.title("K-means Clusters in PCA Space")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# ------------------------------ Part 2: SVM ------------------------------


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# Step 1: Preprocess the dataset
# `XY_train` holds the features plus the 'OUTCOME' target column

# Target first, then everything else as features
y = XY_train['OUTCOME']
X = XY_train.drop(columns=['OUTCOME'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize: statistics are learned from the training part only and then
# applied to both parts
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Step 2: Train an SVM model with default kernel settings.
# class_weight='balanced' is used to address class imbalance.
svm_model = SVC(class_weight='balanced', random_state=42)  # Class_weight addresses class imbalance
svm_model.fit(X_train, y_train)

In [None]:
# Step 3: Evaluate the model on the held-out test set
y_pred = svm_model.predict(X_test)

print("\n=== Model Performance ===")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# scalar metrics, each printed with two decimals
scalar_metrics = (
    ("Accuracy", accuracy_score),
    ("F1-Score", f1_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
)
for metric_name, metric in scalar_metrics:
    print(f"{metric_name}: {metric(y_test, y_pred):.2f}")

In [None]:
# Step 4: Hyperparameter tuning with GridSearchCV
# Candidate values for every tuned SVC parameter
param_grid = dict(
    C=[0.1, 1, 2, 3, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
    class_weight=['balanced'],
)

# 5-fold cross-validated search, scored by accuracy
svm_estimator = SVC(random_state=42)
grid_search = GridSearchCV(svm_estimator, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Display the best parameters found
print("\n=== Best Parameters Found ===")
print(grid_search.best_params_)


In [None]:
# Save the best model (the estimator refitted by the SVM grid search) to disk
import joblib
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'best_model.pkl')
print("Model saved as 'best_model.pkl'.")

In [None]:
# Step 5: Evaluate the best model from the grid search on the test set
best_model = grid_search.best_estimator_
y_best_pred = best_model.predict(X_test)

print("\n=== Best Model Performance ===")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_best_pred))
print("\nClassification Report:\n", classification_report(y_test, y_best_pred))

# scalar metrics, each printed with two decimals
scalar_metrics = (
    ("Accuracy", accuracy_score),
    ("F1-Score", f1_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
)
for metric_name, metric in scalar_metrics:
    print(f"{metric_name}: {metric(y_test, y_best_pred):.2f}")

In [None]:
# Create a DataFrame from the grid search results
# (one row per parameter combination evaluated by the SVM grid search)
results_df = pd.DataFrame(grid_search.cv_results_)

In [None]:
from sklearn.metrics import classification_report
import pandas as pd

# Parameter combinations tried by the grid search, plus the raw result dict
all_params = grid_search.cv_results_['params']
all_models = grid_search.cv_results_

# Refit an SVM for every combination and score it on the test set.
# NOTE(review): ranking combinations by test-set accuracy uses the test set
# for model selection - confirm this is meant for inspection only.
results = []
for params in all_params:
    model = SVC(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    confusion = confusion_matrix(y_test, y_pred)

    # support-weighted averages over the classes
    weighted = report["weighted avg"]
    row = {
        "params": params,
        "accuracy": accuracy,
        "f1_score": weighted["f1-score"],
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "confusion_matrix": confusion,
    }
    results.append(row)

# One row per combination, best test accuracy first
results_df = pd.DataFrame(results)
sorted_results = results_df.sort_values(by="accuracy", ascending=False)

# Display the top models
print(sorted_results.head())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Visualize the SVM grid-search results held in `grid_search.cv_results_`
param_sets = grid_search.cv_results_['params']
mean_test_scores = grid_search.cv_results_['mean_test_score']
param_C = [param_set['C'] for param_set in param_sets]
param_kernel = [param_set['kernel'] for param_set in param_sets]

# One row per evaluated combination
results_df = pd.DataFrame({
    'C': param_C,
    'Kernel': param_kernel,
    'Mean Test Score': mean_test_scores
})

# Several combinations share the same (C, kernel) pair (they differ in the
# other parameters); average them into one value per pair
aggregated_results = results_df.groupby(['C', 'Kernel'], as_index=False)['Mean Test Score'].mean()

# Rows = C, columns = kernel
pivot_table = aggregated_results.pivot(index='C', columns='Kernel', values='Mean Test Score')

# Heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(pivot_table, annot=True, cmap='coolwarm', fmt='.3f', linewidths=0.5)
plt.title('Grid Search Results: Mean Test Scores for C and Kernel', fontsize=14)
plt.xlabel('Kernel', fontsize=12)
plt.ylabel('C', fontsize=12)
plt.show()

# Line plot: score as a function of C, one line per kernel
plt.figure(figsize=(10, 6))
for kernel in aggregated_results['Kernel'].unique():
    is_kernel = aggregated_results['Kernel'] == kernel
    kernel_data = aggregated_results[is_kernel]
    plt.plot(kernel_data['C'], kernel_data['Mean Test Score'], label=f'Kernel: {kernel}', marker='o')

plt.title('Mean Test Scores vs. C for Different Kernels', fontsize=14)
plt.xlabel('C', fontsize=12)
plt.ylabel('Mean Test Score', fontsize=12)
plt.legend(title='Kernel')
plt.grid(True)
plt.show()

# Bar chart: the five best (C, kernel) pairs
top_results = aggregated_results.sort_values(by='Mean Test Score', ascending=False).head(5)
plt.figure(figsize=(10, 6))
sns.barplot(data=top_results, x='C', y='Mean Test Score', hue='Kernel', dodge=True, palette='viridis')
plt.title('Top 5 Hyperparameter Combinations by Mean Test Score', fontsize=14)
plt.xlabel('C', fontsize=12)
plt.ylabel('Mean Test Score', fontsize=12)
plt.legend(title='Kernel')
plt.show()


In [None]:
# Accuracy of the best grid-search model on both parts of the split
best_model = grid_search.best_estimator_
train_accuracy = best_model.score(X_train, y_train)
test_accuracy = best_model.score(X_test, y_test)

print(f"Accuracy on the training set: {train_accuracy:.2f}")
print(f"Accuracy on the test set: {test_accuracy:.2f}")


In [None]:
# Load the model from the file
# (relies on `pickle` having been imported by the cell that saved the model)
# NOTE: pickle.load can execute arbitrary code - only load files you created
with open('best_decision_tree_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

print("Model loaded successfully!")


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# The loaded model is the decision tree. By the time this cell runs, the
# global X_test / Y_test have been reassigned by the MLP and SVM cells
# (different random split, different columns, scaled values), so they no
# longer describe the data the tree was trained and tested on. The tree's own
# hold-out set is therefore rebuilt from the cleaned file written during
# preprocessing, using the same test_size and random_state as the original
# split.
tree_data = pd.read_csv('XY_train_imputed.csv')
X_tree_train, X_tree_test, Y_tree_train, Y_tree_test = train_test_split(
    tree_data.drop(columns=['OUTCOME']), tree_data['OUTCOME'],
    test_size=0.2, random_state=0)

# Generate predictions for the tree's test set
y_test_pred = loaded_model.predict(X_tree_test)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(Y_tree_test, y_test_pred)

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['Class 0', 'Class 1'])
disp.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix for Loaded Model')
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split

# Load the unlabeled test file and apply the decision-tree preprocessing to it
df = pd.read_csv("X_test.csv", encoding='iso-8859-1')

# ======== MISSION 0 ========
# clean the input dataset

# remove irrelevant columns
df_imputed = df.drop(columns=['ID', 'POSTAL_CODE'])

# convert the categorical columns to ordinal codes; a value that is not a key
# of its mapping becomes NaN and the row is removed by dropna() below
category_codes = {
    'GENDER': {'male': 0, 'female': 1},
    'EDUCATION': {'none': 0, 'high school': 1, 'university': 2},
    'VEHICLE_TYPE': {'sedan': 0, 'sports car': 1},
    'VEHICLE_YEAR': {'before 2015': 0, 'after 2015': 1},
    'INCOME': {'poverty': 0, 'working class': 1, 'middle class': 2, 'upper class': 3},
}
for column, codes in category_codes.items():
    df_imputed[column] = df_imputed[column].map(codes)

# bucket the numeric columns into five ordinal groups.
# pd.cut intervals are open on the left by default, so without
# include_lowest=True a value of exactly 0 would turn into NaN and the row
# would be silently discarded by dropna() below.
bucket_labels = [0, 1, 2, 3, 4]
df_imputed['AGE'] = pd.cut(df_imputed['AGE'], bins=[0, 20, 30, 50, 70, 100],
                           labels=bucket_labels, include_lowest=True)
df_imputed['DRIVING_EXPERIENCE'] = pd.cut(df_imputed['DRIVING_EXPERIENCE'], bins=[0, 1, 3, 10, 20, 100],
                                          labels=bucket_labels, include_lowest=True)

# remove rows with missing values
df_imputed = df_imputed.dropna()

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
import pickle

# Ordinal codes for the categorical columns; a value that is not a key of its
# mapping becomes NaN
_CATEGORY_CODES = {
    'GENDER': {'male': 0, 'female': 1},
    'EDUCATION': {'none': 0, 'high school': 1, 'university': 2},
    'VEHICLE_TYPE': {'sedan': 0, 'sports car': 1},
    'VEHICLE_YEAR': {'before 2015': 0, 'after 2015': 1},
    'INCOME': {'poverty': 0, 'working class': 1, 'middle class': 2, 'upper class': 3},
}
_BUCKET_LABELS = [0, 1, 2, 3, 4]


def _encode_features(frame):
    """Return a copy of ``frame`` with the shared feature encoding applied.

    Drops the ID and POSTAL_CODE columns, maps the categorical columns to
    ordinal codes and buckets AGE and DRIVING_EXPERIENCE into five groups.
    One function is used for the training and the test data so both are
    guaranteed to go through identical steps.
    """
    encoded = frame.drop(columns=['ID', 'POSTAL_CODE'])
    for column, codes in _CATEGORY_CODES.items():
        encoded[column] = encoded[column].map(codes)
    # include_lowest=True closes the first interval on the left: without it a
    # value of exactly 0 falls outside every bucket and becomes NaN, which for
    # the test data would be passed straight into predict()
    encoded['AGE'] = pd.cut(encoded['AGE'], bins=[0, 20, 30, 50, 70, 100],
                            labels=_BUCKET_LABELS, include_lowest=True)
    encoded['DRIVING_EXPERIENCE'] = pd.cut(encoded['DRIVING_EXPERIENCE'],
                                           bins=[0, 1, 3, 10, 20, 100],
                                           labels=_BUCKET_LABELS, include_lowest=True)
    return encoded


# Load and encode the training data
XY_train = _encode_features(pd.read_csv("XY_train.csv", encoding='iso-8859-1'))

# Handle missing values using KNNImputer on min-max scaled columns
numeric_columns = ["ANNUAL_MILEAGE", "CREDIT_SCORE"]
scaler = MinMaxScaler()

# Scale the numeric columns
scaled_data = scaler.fit_transform(XY_train[numeric_columns])

# Apply KNNImputer to the scaled data
knn_imputer = KNNImputer(n_neighbors=6)
imputed_data = knn_imputer.fit_transform(scaled_data)

# Replace the original columns with the imputed ones (back on the original scale)
XY_train[numeric_columns] = scaler.inverse_transform(imputed_data)

# Separate features and target variable
X_train = XY_train.drop(columns=['OUTCOME'])
Y_train = XY_train['OUTCOME']

# Load and encode the test data (same steps as training)
X_test = _encode_features(pd.read_csv("X_test.csv", encoding='iso-8859-1'))

# Impute the numeric test columns with the scaler and imputer fitted on the
# training data: scale, impute, then return to the original scale
scaled_test = scaler.transform(X_test[numeric_columns])
X_test[numeric_columns] = scaler.inverse_transform(knn_imputer.transform(scaled_test))

# Restore the decision tree that was pickled earlier
with open("best_decision_tree_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Predict a label for every row of the preprocessed test data
y_test_pred = loaded_model.predict(X_test)

# Single-column frame named 'target', as required for the output file
y_test_output = pd.Series(y_test_pred, name='target').to_frame()

# Write the predictions to an Excel file (no index column)
y_test_output.to_excel("y_test_predictions.xlsx", index=False)

print("Predictions saved to 'y_test_predictions.xlsx'")
