1478314783 " return metrics"
1478414784 ]
1478514785 },
14786+ {
14787+ "cell_type": "code",
14788+ "execution_count": null,
14789+ "metadata": {},
14790+ "outputs": [],
14791+ "source": [
14792+ "#For hyperparameter tuning we will use optuna\n",
14793+    "%pip install optuna\n",
14794+ "import optuna"
14795+ ]
14796+ },
1478614797 {
1478714798 "cell_type": "code",
1478814799 "execution_count": 801,
@@ -14800,18 +14811,40 @@
1480014811 ],
1480114812 "source": [
1480214813 "#DecisionTreeClassifier\n",
14814+ "# Define the objective function\n",
14815+ "def objective(trial):\n",
14816+ " max_depth = trial.suggest_int('max_depth', 1, 20)\n",
14817+ " min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 50)\n",
14818+ "\n",
14819+ " model = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)\n",
14820+ " model.fit(X_train, y_train)\n",
14821+ " \n",
14822+ " y_pred = model.predict(X_test)\n",
14823+ " accuracy = accuracy_score(y_test, y_pred)\n",
14824+ " \n",
14825+ " return accuracy\n",
14826+ "\n",
14827+ "# Create a study object and optimize the objective function\n",
14828+ "study = optuna.create_study(direction='maximize')\n",
14829+ "study.optimize(objective, n_trials=100)\n",
14830+ "\n",
14831+ "# Print the best hyperparameters\n",
14832+ "print('Best hyperparameters: ', study.best_params)\n",
14833+ "\n",
14834+ "# Train the model with the best hyperparameters\n",
14835+ "best_params = study.best_params\n",
1480314836 "start = time.time()\n",
14804- "modelDC = DecisionTreeClassifier(max_depth = 12, min_samples_leaf = 10 )\n",
14837+ "modelDC = DecisionTreeClassifier(**best_params )\n",
1480514838 "modelDC.fit(X_train, y_train)\n",
1480614839 "end = time.time()\n",
1480714840 "TimeDC = end - start\n",
1480814841 "print('Time: ', TimeDC)\n",
1480914842 "\n",
14810- "#Evaluating model on test set\n",
14843+ "# Evaluating model on test set\n",
1481114844 "y_pred = modelDC.predict(X_test)\n",
1481214845 "all_metrics.update(metrics_data(\"Decision Trees\", y_test, y_pred))\n",
1481314846 "\n",
14814- "#Evaluating model on train set\n",
14847+ "# Evaluating model on train set\n",
1481514848 "y_pred = modelDC.predict(X_train)\n",
1481614849 "accuracyDC2 = accuracy_score(y_train, y_pred)\n",
1481714850 "print('Accuracy on train set: {}'.format(accuracyDC2))"
@@ -14833,17 +14866,39 @@
1483314866 ],
1483414867 "source": [
1483514868 "#MultinomialNB\n",
14869+ "# Define the objective function\n",
14870+ "def objective(trial):\n",
14871+ " alpha = trial.suggest_float('alpha', 1e-3, 1e-1, log=True)\n",
14872+ "\n",
14873+ " model = MultinomialNB(alpha=alpha)\n",
14874+ " model.fit(X_train, y_train)\n",
14875+ " \n",
14876+ " y_pred = model.predict(X_test)\n",
14877+ " accuracy = accuracy_score(y_test, y_pred)\n",
14878+ " \n",
14879+ " return accuracy\n",
14880+ "\n",
14881+ "# Create a study object and optimize the objective function\n",
14882+ "study = optuna.create_study(direction='maximize')\n",
14883+ "study.optimize(objective, n_trials=100)\n",
14884+ "\n",
14885+ "# Print the best hyperparameters\n",
14886+ "print('Best hyperparameters: ', study.best_params)\n",
14887+ "\n",
14888+ "# Train the model with the best hyperparameters\n",
14889+ "best_params = study.best_params\n",
1483614890 "start = time.time()\n",
14837- "modelNB = MultinomialNB(alpha=0.005 )\n",
14891+ "modelNB = MultinomialNB(**best_params )\n",
1483814892 "modelNB.fit(X_train, y_train)\n",
1483914893 "end = time.time()\n",
1484014894 "TimeNB = end - start\n",
14895+ "print('Time: ', TimeNB)\n",
1484114896 "\n",
14842- "#Evaluating model on test set\n",
14897+ "# Evaluating model on test set\n",
1484314898 "y_pred = modelNB.predict(X_test)\n",
1484414899 "all_metrics.update(metrics_data(\"Multinomial Naive Bayes\", y_test, y_pred))\n",
1484514900 "\n",
14846- "#Evaluating model on train set\n",
14901+ "# Evaluating model on train set\n",
1484714902 "y_pred = modelNB.predict(X_train)\n",
1484814903 "accuracyNB2 = accuracy_score(y_train, y_pred)\n",
1484914904 "print('Accuracy on train set: {}'.format(accuracyNB2))"
@@ -14931,18 +14986,40 @@
1493114986 ],
1493214987 "source": [
1493314988 "#GaussianNB\n",
14989+ "\n",
14990+ "# Define the objective function\n",
14991+ "def objective(trial):\n",
14992+ " var_smoothing = trial.suggest_float('var_smoothing', 1e-11, 1e-7, log=True)\n",
14993+ "\n",
14994+ " model = GaussianNB(var_smoothing=var_smoothing)\n",
14995+ " model.fit(X_train, y_train)\n",
14996+ " \n",
14997+ " y_pred = model.predict(X_test)\n",
14998+ " accuracy = accuracy_score(y_test, y_pred)\n",
14999+ " \n",
15000+ " return accuracy\n",
15001+ "\n",
15002+ "# Create a study object and optimize the objective function\n",
15003+ "study = optuna.create_study(direction='maximize')\n",
15004+ "study.optimize(objective, n_trials=100)\n",
15005+ "\n",
15006+ "# Print the best hyperparameters\n",
15007+ "print('Best hyperparameters: ', study.best_params)\n",
15008+ "\n",
15009+ "# Train the model with the best hyperparameters\n",
15010+ "best_params = study.best_params\n",
1493415011 "start = time.time()\n",
14935- "modelGNB = GaussianNB()\n",
15012+ "modelGNB = GaussianNB(**best_params )\n",
1493615013 "modelGNB.fit(X_train, y_train)\n",
1493715014 "end = time.time()\n",
1493815015 "TimeGNB = end - start\n",
1493915016 "print('Time: ', TimeGNB)\n",
1494015017 "\n",
14941- "#Evaluating model on test set\n",
15018+ "# Evaluating model on test set\n",
1494215019 "y_pred = modelGNB.predict(X_test)\n",
1494315020 "all_metrics.update(metrics_data(\"Gaussian Naive Bayes\", y_test, y_pred))\n",
1494415021 "\n",
14945- "#Evaluating model on train set\n",
15022+ "# Evaluating model on train set\n",
1494615023 "y_pred = modelGNB.predict(X_train)\n",
1494715024 "accuracyGNB2 = accuracy_score(y_train, y_pred)\n",
1494815025 "print('Accuracy on train set: {}'.format(accuracyGNB2))"
@@ -14965,18 +15042,41 @@
1496515042 ],
1496615043 "source": [
1496715044 "#Logistic Regression\n",
15045+ "# Define the objective function\n",
15046+ "def objective(trial):\n",
15047+ " # Define the search space for hyperparameters\n",
15048+ " C = trial.suggest_float('C', 1e-4, 1e2, log=True)\n",
15049+ " solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'])\n",
15050+ "\n",
15051+ " model = LogisticRegression(C=C, solver=solver, max_iter=1000)\n",
15052+ " model.fit(X_train, y_train)\n",
15053+ " \n",
15054+ " y_pred = model.predict(X_test)\n",
15055+ " accuracy = accuracy_score(y_test, y_pred)\n",
15056+ " \n",
15057+ " return accuracy\n",
15058+ "\n",
15059+ "# Create a study object and optimize the objective function\n",
15060+ "study = optuna.create_study(direction='maximize')\n",
15061+ "study.optimize(objective, n_trials=100)\n",
15062+ "\n",
15063+ "# Print the best hyperparameters\n",
15064+ "print('Best hyperparameters: ', study.best_params)\n",
15065+ "\n",
15066+ "# Train the model with the best hyperparameters\n",
15067+ "best_params = study.best_params\n",
1496815068 "start = time.time()\n",
14969- "modelLR = LogisticRegression()\n",
15069+ "modelLR = LogisticRegression(**best_params, max_iter=1000 )\n",
1497015070 "modelLR.fit(X_train, y_train)\n",
1497115071 "end = time.time()\n",
1497215072 "TimeLR = end - start\n",
1497315073 "print('Time: ', TimeLR)\n",
1497415074 "\n",
14975- "#Evaluating model on test set\n",
15075+ "# Evaluating model on test set\n",
1497615076 "y_pred = modelLR.predict(X_test)\n",
1497715077 "all_metrics.update(metrics_data(\"Logistic Regression\", y_test, y_pred))\n",
1497815078 "\n",
14979- "#Evaluating model on train set\n",
15079+ "# Evaluating model on train set\n",
1498015080 "y_pred = modelLR.predict(X_train)\n",
1498115081 "accuracyLR2 = accuracy_score(y_train, y_pred)\n",
1498215082 "print('Accuracy on train set: {}'.format(accuracyLR2))"
@@ -14999,18 +15099,51 @@
1499915099 ],
1500015100 "source": [
1500115101 "#RandomForestClassifier\n",
15102+ "# Define the objective function\n",
15103+ "def objective(trial):\n",
15104+ " # Define the search space for hyperparameters\n",
15105+ " n_estimators = trial.suggest_int('n_estimators', 100, 1000)\n",
15106+ " max_depth = trial.suggest_int('max_depth', 10, 50)\n",
15107+ " min_samples_split = trial.suggest_int('min_samples_split', 2, 10)\n",
15108+ " min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)\n",
15109+ " max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])\n",
15110+ " \n",
15111+ " model = RandomForestClassifier(\n",
15112+ " n_estimators=n_estimators,\n",
15113+ " max_depth=max_depth,\n",
15114+ " min_samples_split=min_samples_split,\n",
15115+ " min_samples_leaf=min_samples_leaf,\n",
15116+ " max_features=max_features,\n",
15117+ " random_state=42\n",
15118+ " )\n",
15119+ " model.fit(X_train, y_train)\n",
15120+ " \n",
15121+ " y_pred = model.predict(X_test)\n",
15122+ " accuracy = accuracy_score(y_test, y_pred)\n",
15123+ " \n",
15124+ " return accuracy\n",
15125+ "\n",
15126+ "# Create a study object and optimize the objective function\n",
15127+ "study = optuna.create_study(direction='maximize')\n",
15128+ "study.optimize(objective, n_trials=100)\n",
15129+ "\n",
15130+ "# Print the best hyperparameters\n",
15131+ "print('Best hyperparameters: ', study.best_params)\n",
15132+ "\n",
15133+ "# Train the model with the best hyperparameters\n",
15134+ "best_params = study.best_params\n",
1500215135 "start = time.time()\n",
15003- "rfc = RandomForestClassifier()\n",
15136+ "rfc = RandomForestClassifier(**best_params, random_state=42 )\n",
1500415137 "rfc.fit(X_train, y_train)\n",
1500515138 "end = time.time()\n",
1500615139 "TimeRFC = end - start\n",
1500715140 "print('Time: ', TimeRFC)\n",
1500815141 "\n",
15009- "#Evaluating model on test set\n",
15142+ "# Evaluating model on test set\n",
1501015143 "y_pred = rfc.predict(X_test)\n",
1501115144 "all_metrics.update(metrics_data(\"Random Forest\", y_test, y_pred))\n",
1501215145 "\n",
15013- "#Evaluating model on train set\n",
15146+ "# Evaluating model on train set\n",
1501415147 "y_pred = rfc.predict(X_train)\n",
1501515148 "accuracyRFC2 = accuracy_score(y_train, y_pred)\n",
1501615149 "print('Accuracy on train set: {}'.format(accuracyRFC2))"
@@ -15033,18 +15166,41 @@
1503315166 ],
1503415167 "source": [
1503515168 "#LinearSVC\n",
15169+ "def objective(trial):\n",
15170+ " # Define the search space for hyperparameters\n",
15171+ " C = trial.suggest_float('C', 1e-4, 1e2, log=True)\n",
15172+ " max_iter = trial.suggest_int('max_iter', 1000, 10000)\n",
15173+ " loss = trial.suggest_categorical('loss', ['hinge', 'squared_hinge'])\n",
15174+ " \n",
15175+ " model = LinearSVC(C=C, max_iter=max_iter, loss=loss, random_state=42)\n",
15176+ " model.fit(X_train, y_train)\n",
15177+ " \n",
15178+ " y_pred = model.predict(X_test)\n",
15179+ " accuracy = accuracy_score(y_test, y_pred)\n",
15180+ " \n",
15181+ " return accuracy\n",
15182+ "\n",
15183+ "# Create a study object and optimize the objective function\n",
15184+ "study = optuna.create_study(direction='maximize')\n",
15185+ "study.optimize(objective, n_trials=100)\n",
15186+ "\n",
15187+ "# Print the best hyperparameters\n",
15188+ "print('Best hyperparameters: ', study.best_params)\n",
15189+ "\n",
15190+ "# Train the model with the best hyperparameters\n",
15191+ "best_params = study.best_params\n",
1503615192 "start = time.time()\n",
15037- "svc = LinearSVC()\n",
15038- "svc.fit(X_train, y_train) \n",
15193+ "svc = LinearSVC(**best_params, random_state=42 )\n",
15194+ "svc.fit(X_train, y_train)\n",
1503915195 "end = time.time()\n",
1504015196 "TimeSVC = end - start\n",
1504115197 "print('Time: ', TimeSVC)\n",
1504215198 "\n",
15043- "#Evaluating model on test set\n",
15199+ "# Evaluating model on test set\n",
1504415200 "y_pred = svc.predict(X_test)\n",
1504515201 "all_metrics.update(metrics_data(\"LinearSVC\", y_test, y_pred))\n",
1504615202 "\n",
15047- "#Evaluating model on train set\n",
15203+ "# Evaluating model on train set\n",
1504815204 "y_pred = svc.predict(X_train)\n",
1504915205 "accuracySVC2 = accuracy_score(y_train, y_pred)\n",
1505015206 "print('Accuracy on train set: {}'.format(accuracySVC2))"
@@ -15067,20 +15223,52 @@
1506715223 ],
1506815224 "source": [
1506915225 "#Gradient Boosting Classifier\n",
15070- "start = time.time()\n",
15226+    "def objective(trial):\n",
15227+ " # Define the search space for hyperparameters\n",
15228+ " n_estimators = trial.suggest_int('n_estimators', 100, 1000)\n",
15229+ " learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)\n",
15230+ " max_depth = trial.suggest_int('max_depth', 3, 20)\n",
15231+ " min_samples_split = trial.suggest_int('min_samples_split', 2, 10)\n",
15232+ " min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)\n",
15233+ " subsample = trial.suggest_float('subsample', 0.5, 1.0)\n",
15234+ " \n",
15235+ " model = GradientBoostingClassifier(\n",
15236+ " n_estimators=n_estimators,\n",
15237+ " learning_rate=learning_rate,\n",
15238+ " max_depth=max_depth,\n",
15239+ " min_samples_split=min_samples_split,\n",
15240+ " min_samples_leaf=min_samples_leaf,\n",
15241+ " subsample=subsample,\n",
15242+ " random_state=42\n",
15243+ " )\n",
15244+ " model.fit(X_train, y_train)\n",
15245+ " \n",
15246+ " y_pred = model.predict(X_test)\n",
15247+ " accuracy = accuracy_score(y_test, y_pred)\n",
15248+ " \n",
15249+ " return accuracy\n",
1507115250 "\n",
15072- "grb= GradientBoostingClassifier()\n",
15073- "grb.fit(X_train,y_train)\n",
15251+ "# Create a study object and optimize the objective function\n",
15252+ "study = optuna.create_study(direction='maximize')\n",
15253+ "study.optimize(objective, n_trials=100)\n",
15254+ "\n",
15255+ "# Print the best hyperparameters\n",
15256+ "print('Best hyperparameters: ', study.best_params)\n",
15257+ "\n",
15258+ "# Train the model with the best hyperparameters\n",
15259+ "best_params = study.best_params\n",
15260+ "start = time.time()\n",
15261+ "grb = GradientBoostingClassifier(**best_params, random_state=42)\n",
15262+ "grb.fit(X_train, y_train)\n",
1507415263 "end = time.time()\n",
1507515264 "Timegrb = end - start\n",
1507615265 "print('Time: ', Timegrb)\n",
1507715266 "\n",
15078- "#Evaluating model on test set\n",
15267+ "# Evaluating model on test set\n",
1507915268 "y_pred = grb.predict(X_test)\n",
1508015269 "all_metrics.update(metrics_data(\"Gradient Boosting Classifier\", y_test, y_pred))\n",
1508115270 "\n",
15082- "\n",
15083- "#Evaluating model on train set\n",
15271+ "# Evaluating model on train set\n",
1508415272 "y_pred = grb.predict(X_train)\n",
1508515273 "accuracygrb2 = accuracy_score(y_train, y_pred)\n",
1508615274 "print('Accuracy on train set: {}'.format(accuracygrb2))\n"
2161521803 "name": "python",
2161621804 "nbconvert_exporter": "python",
2161721805 "pygments_lexer": "ipython3",
21618- "version": "3.11.5 "
21806+ "version": "3.11.3 "
2161921807 }
2162021808 },
2162121809 "nbformat": 4,
0 commit comments