## Load Dataset

In [2]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the Iris bunch and assemble one DataFrame: the feature columns
# followed by the integer class label in a 'target' column.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

In [4]:
# Preview the first five rows of the assembled frame.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [5]:
# Missing-value check: count null entries per column.
# Every count is 0, i.e. there are no null values anywhere in the data frame.
df.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

In [6]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal length (cm),150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal width (cm),150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
petal length (cm),150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
petal width (cm),150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5
target,150.0,1.0,0.819232,0.0,0.0,1.0,2.0,2.0


In [7]:
# Class balance: number of samples per target label.
df.loc[:, 'target'].value_counts()

target
0    50
1    50
2    50
Name: count, dtype: int64

In [8]:
# Features are every column except the label; the label is the 'target' column
# (df holds the four measurement columns followed by 'target').
X = df.drop(columns='target')
y = df['target']

In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Two-stage split: hold out 30% of the data first, then cut that hold-out in
# half to obtain the validation and test sets (70% / 15% / 15% overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
)


In [10]:
from sklearn.preprocessing import StandardScaler

# Standardize data for Logistic Regression only.
# The scaler is fitted on the training split alone; the same fitted
# transformation is then applied to the train, validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

### Decision Tree

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.tree import plot_tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report 

### 2 ب — Decision tree hyperparameter tuning (grid search)

In [None]:
# Hyperparameter tuning of the decision tree classifier (exhaustive grid search)

dt = DecisionTreeClassifier(random_state=1)

# Candidate settings: maximum tree depth and the minimum number of samples
# required to split an internal node.
params = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
)

# 3-fold cross-validation on the training split; the last line displays the
# best parameter combination found.
gsearch = GridSearchCV(estimator=dt, param_grid=params, cv=3)
gsearch.fit(X_train, y_train)
gsearch.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [54]:
# Class predictions from the tuned tree on the train and test splits.
y_pred_train = gsearch.predict(X_train)
dt_pred = gsearch.predict(X_test)

# The target has three classes, so keep the whole probability matrix (one
# column per class). Slicing with [:, 1] would keep only the class-1 column
# and silently drop the probabilities of the other two classes.
y_prob_train = gsearch.predict_proba(X_train)
dt_prob = gsearch.predict_proba(X_test)

### 2 ت — Evaluation of the tuned decision tree

In [55]:
# Confusion matrices of the tuned tree: rows are true labels, columns are
# predicted labels. The printed text matches the original layout exactly.
print(f"Confusion Matrix - Train: \n {confusion_matrix(y_train, y_pred_train)}")
print(f"\n Confusion Matrix - Test: \n {confusion_matrix(y_test, dt_pred)}")

Confusion Matrix - Train: 
 [[31  0  0]
 [ 0 36  1]
 [ 0  4 33]]

 Confusion Matrix - Test: 
 [[ 6  0  0]
 [ 0 10  0]
 [ 0  0  7]]


In [56]:
# Per-class precision / recall / F1 on the test split after hyperparameter tuning.
print(classification_report(y_true=y_test, y_pred=dt_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00         7

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23



In [57]:
# accuracy_score takes (y_true, y_pred): pass the ground truth first so these
# calls follow the scikit-learn signature, as the Logistic Regression and
# comparison cells already do. The reported values are unchanged.
print('Accuracy of Decision Tree-Train: ', accuracy_score(y_train, y_pred_train))
print('Accuracy of Decision Tree-Test: ', accuracy_score(y_test, dt_pred))

Accuracy of Decision Tree-Train:  0.9523809523809523
Accuracy of Decision Tree-Test:  1.0


### 2 پ — Effect of `max_depth` on the decision tree

In [48]:
# Manually configured tree (no grid search), used to see how the accuracy
# reacts to max_depth.
# NOTE: this cell rebinds `dt` and `y_pred_train`, which the grid-search cells
# above also assign.
dt = DecisionTreeClassifier(max_depth=3, min_samples_leaf=10, random_state=1)
dt.fit(X_train, y_train)

y_pred_train = dt.predict(X_train)
y_pred = dt.predict(X_test)
y_prob = dt.predict_proba(X_test)  # full probability matrix, one column per class

# accuracy_score takes (y_true, y_pred); ground truth goes first.
print('Accuracy of Decision Tree-Train: ', accuracy_score(y_train, y_pred_train))
print('Accuracy of Decision Tree-Test: ', accuracy_score(y_test, y_pred))

Accuracy of Decision Tree-Train:  0.9428571428571428
Accuracy of Decision Tree-Test:  0.9565217391304348


#### With max_depth = 1
Accuracy of Decision Tree-Train:  0.6476190476190476

Accuracy of Decision Tree-Test:  0.6956521739130435

#### With max_depth = 2
Accuracy of Decision Tree-Train:  0.9428571428571428

Accuracy of Decision Tree-Test:  0.9565217391304348

#### With max_depth = 3
Accuracy of Decision Tree-Train:  0.9428571428571428

Accuracy of Decision Tree-Test:  0.9565217391304348

#### With max_depth = 5
Accuracy of Decision Tree-Train:  0.9428571428571428

Accuracy of Decision Tree-Test:  0.9565217391304348

Train and test accuracy are identical for max_depth = 2, 3 and 5; only max_depth = 1 scores lower.



## Logistic Regression

In [53]:
from sklearn.linear_model import LogisticRegression

# Tune the inverse regularization strength C with 3-fold cross-validation on
# the standardized training split.
# The `multi_class` argument is deprecated since scikit-learn 1.5; with the
# lbfgs solver a multiclass target is already fitted with the multinomial
# loss, so the argument is omitted here.
lr_params = {'C': [0.1, 1, 10, 100], 'penalty': ['l2'], 'solver': ['lbfgs']}
lr_model = GridSearchCV(
    LogisticRegression(random_state=42, max_iter=500),
    lr_params,
    cv=3,
)
lr_model.fit(X_train_scaled, y_train)

# Evaluate the best estimator found by the search on the standardized test split.
lr_best = lr_model.best_estimator_
lr_pred = lr_best.predict(X_test_scaled)
print("\nLogistic Regression Results:")
print("Best Parameters:", lr_model.best_params_)
print("Accuracy:", accuracy_score(y_test, lr_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, lr_pred))
print(classification_report(y_test, lr_pred))


Logistic Regression Results:
Best Parameters: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy: 1.0
Confusion Matrix:
 [[ 6  0  0]
 [ 0 10  0]
 [ 0  0  7]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00         7

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23





### Comparison of Results

In [58]:
# Comparison of Decision Tree and Logistic Regression results on the test split
print("\n--- Comparison of Models ---")
# Decision Tree metrics (predictions of the grid-searched tree)
dt_accuracy = accuracy_score(y_test, dt_pred)
print("Decision Tree Accuracy:", dt_accuracy)
print("Decision Tree Confusion Matrix:\n", confusion_matrix(y_test, dt_pred))
print(classification_report(y_test, dt_pred))

# Logistic Regression metrics (predictions of the grid-searched model)
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy:", lr_accuracy)
print("Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

# Inductive Bias Discussion
# A tie must be reported as a tie: a plain if/else would announce the
# Decision Tree as the winner whenever the two accuracies are equal.
print("\n--- Inductive Bias Discussion ---")
if lr_accuracy > dt_accuracy:
    print("Logistic Regression outperforms Decision Tree in this case, likely because the Iris dataset is relatively linearly separable.")
elif dt_accuracy > lr_accuracy:
    print("Decision Tree outperforms Logistic Regression, likely due to its ability to model non-linear boundaries.")
else:
    print("Decision Tree and Logistic Regression reach the same test accuracy, so this split does not favor either model's inductive bias.")
print("Consider using both models depending on the nature of the problem and dataset.")



--- Comparison of Models ---
Decision Tree Accuracy: 1.0
Decision Tree Confusion Matrix:
 [[ 6  0  0]
 [ 0 10  0]
 [ 0  0  7]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00         7

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23

Logistic Regression Accuracy: 1.0
Logistic Regression Confusion Matrix:
 [[ 6  0  0]
 [ 0 10  0]
 [ 0  0  7]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00         7

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23

