In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e11e20c3",
   "metadata": {
    "papermill": {
     "duration": 0.017192,
     "end_time": "2023-05-20T08:04:53.476874",
     "exception": false,
     "start_time": "2023-05-20T08:04:53.459682",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Import Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68c8d6db",
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "papermill": {
     "duration": 2.218609,
     "end_time": "2023-05-20T08:05:07.690099",
     "exception": false,
     "start_time": "2023-05-20T08:05:05.471490",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.linear_model import LogisticRegression, Perceptron\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay\n",
    "import joblib"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a597793",
   "metadata": {
    "papermill": {
     "duration": 0.013165,
     "end_time": "2023-05-20T08:05:07.715822",
     "exception": false,
     "start_time": "2023-05-20T08:05:07.702657",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3d8e60c",
   "metadata": {
    "papermill": {
     "duration": 0.106488,
     "end_time": "2023-05-20T08:05:07.835088",
     "exception": false,
     "start_time": "2023-05-20T08:05:07.728600",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Load the semicolon-delimited dataset.\n",
    "df = pd.read_csv('data.csv', sep=';')\n",
    "\n",
    "# Clean the header: remove stray double quotes and tab characters first, then\n",
    "# trim whitespace, so padding that sat inside the quotes is removed as well.\n",
    "df.columns = (\n",
    "    df.columns\n",
    "    .str.replace('\"', '', regex=False)\n",
    "    .str.replace('\\t', '', regex=False)\n",
    "    .str.strip()\n",
    ")\n",
    "\n",
    "print(df.head())\n",
    "# DataFrame.info() prints its report itself and returns None, so wrapping it\n",
    "# in print() only appended a stray 'None' line to the output.\n",
    "df.info()\n",
    "print(df['Target'].value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Features and target\n",
    "X = df.drop('Target', axis=1)\n",
    "y = df['Target']\n",
    "\n",
    "# Encode the target labels as integers and persist the fitted encoder.\n",
    "le = LabelEncoder()\n",
    "y = le.fit_transform(y)\n",
    "joblib.dump(le, 'label_encoder.pkl')\n",
    "\n",
    "# Split BEFORE scaling so no statistics from the held-out rows leak into\n",
    "# the preprocessing step.\n",
    "x_train_raw, x_test_raw, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42\n",
    ")\n",
    "\n",
    "# Fit the scaler on the training rows only, persist it, then apply the same\n",
    "# transform to both splits.\n",
    "scaler = StandardScaler().fit(x_train_raw)\n",
    "joblib.dump(scaler, 'scaler.pkl')\n",
    "x_train = scaler.transform(x_train_raw)\n",
    "x_test = scaler.transform(x_test_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Define Evaluation Function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "def perform(y_pred, y_true=None, average='macro'):\n",
    "    \"\"\"Print headline metrics and plot the confusion matrix for one model.\n",
    "\n",
    "    Args:\n",
    "        y_pred: Predicted integer-encoded labels.\n",
    "        y_true: True integer-encoded labels. Defaults to the notebook-level\n",
    "            ``y_test`` so existing ``perform(y_pred)`` calls keep working.\n",
    "        average: Averaging mode forwarded to precision, recall and F1.\n",
    "            Defaults to 'macro': with 'micro' averaging on single-label\n",
    "            data all three scores equal the accuracy, so the report\n",
    "            printed the same number four times.\n",
    "    \"\"\"\n",
    "    if y_true is None:\n",
    "        y_true = y_test\n",
    "    print(\"Accuracy : \", accuracy_score(y_true, y_pred))\n",
    "    print(\"Precision : \", precision_score(y_true, y_pred, average=average))\n",
    "    print(\"Recall : \", recall_score(y_true, y_pred, average=average))\n",
    "    print(\"F1 Score : \", f1_score(y_true, y_pred, average=average))\n",
    "    # Pin the matrix to every encoded class so its shape always matches\n",
    "    # le.classes_, even when a class is absent from y_true and y_pred.\n",
    "    class_ids = list(range(len(le.classes_)))\n",
    "    cm = confusion_matrix(y_true, y_pred, labels=class_ids)\n",
    "    cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)\n",
    "    cmd.plot()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Naive Bayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Gaussian Naive Bayes: fit on the training split, then report on the test split.\n",
    "model_nb = GaussianNB().fit(x_train, y_train)\n",
    "y_pred_nb = model_nb.predict(x_test)\n",
    "perform(y_pred_nb)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Logistic Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Logistic regression with the iteration cap raised to 1000.\n",
    "model_lr = LogisticRegression(max_iter=1000).fit(x_train, y_train)\n",
    "y_pred_lr = model_lr.predict(x_test)\n",
    "perform(y_pred_lr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Random forest with a fixed seed so repeated runs build the same trees.\n",
    "model_rf = RandomForestClassifier(random_state=42).fit(x_train, y_train)\n",
    "y_pred_rf = model_rf.predict(x_test)\n",
    "perform(y_pred_rf)\n",
    "# Persist this forest as the chosen model.\n",
    "# NOTE(review): it is saved before the comparison cell runs, so 'best' is\n",
    "# presumably based on an earlier run - confirm.\n",
    "joblib.dump(model_rf, 'model.pkl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SVC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Support vector classifier with scikit-learn's default settings.\n",
    "model_svc = SVC().fit(x_train, y_train)\n",
    "y_pred_svc = model_svc.predict(x_test)\n",
    "perform(y_pred_svc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Perceptron"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Perceptron with default settings. The variables carry an 'mlp' suffix,\n",
    "# but the estimator is sklearn.linear_model.Perceptron; the names are kept\n",
    "# because the comparison cell refers to y_pred_mlp.\n",
    "model_mlp = Perceptron().fit(x_train, y_train)\n",
    "y_pred_mlp = model_mlp.predict(x_test)\n",
    "perform(y_pred_mlp)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# KNN Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Sweep k and record the test-set accuracy of a KNN classifier for each value.\n",
    "# NOTE(review): every k is scored on the test split, so a k picked from this\n",
    "# curve is tuned on test data; consider cross-validating on the training split.\n",
    "k_values = range(1, 40)\n",
    "knn_accuracies = []  # accuracy per k (the old name 'error' was misleading)\n",
    "for k in k_values:\n",
    "    knn = KNeighborsClassifier(n_neighbors=k)\n",
    "    knn.fit(x_train, y_train)\n",
    "    knn_accuracies.append(accuracy_score(y_test, knn.predict(x_test)))\n",
    "\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.plot(k_values, knn_accuracies, color='red', linestyle='dashed', marker='o', markerfacecolor='blue', markersize=10)\n",
    "plt.title('K Value accuracy')\n",
    "plt.xlabel('K Value')\n",
    "plt.ylabel('Accuracy')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Final KNN model with k fixed at 3.\n",
    "model_knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)\n",
    "y_pred_knn = model_knn.predict(x_test)\n",
    "perform(y_pred_knn)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Display names and test-set predictions, listed in the same model order.\n",
    "classifiers = [\"NaiveBayes\", \"Logistic Regression\", \"RandomForest\", \"Support Vector Classifier\", \"Perceptron\", \"KNN\"]\n",
    "pred = [y_pred_nb, y_pred_lr, y_pred_rf, y_pred_svc, y_pred_mlp, y_pred_knn]\n",
    "\n",
    "# One accuracy per model, aligned with `classifiers`.\n",
    "acc = [accuracy_score(y_test, model_pred) for model_pred in pred]\n",
    "\n",
    "# Horizontal bar chart: one bar per classifier.\n",
    "plt.barh(classifiers, acc)\n",
    "plt.title('Model Comparison')\n",
    "plt.xlabel('Accuracy')\n",
    "plt.ylabel('Classifiers')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}