In [1]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
# Load the iris dataset that ships with scikit-learn.
iris_bunch = load_iris()

# Assemble one DataFrame: a column per feature (named after
# iris_bunch.feature_names) followed by the target labels in a "Class" column.
iris_df = pd.DataFrame(
    data=iris_bunch.data,
    columns=iris_bunch.feature_names,
).assign(Class=iris_bunch.target)

# Preview the first rows as the cell output.
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
# Feature matrix: every column except the label.
X = iris_df.drop(columns=["Class"])
# Target vector: the label column on its own.
y = iris_df.loc[:, "Class"]

In [4]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=21,
)

In [5]:
# Report the shape of each split, in the order train/test features then
# train/test targets.
shape_report = (
    ("Shape of X_train is : ", X_train),
    ("Shape of X_test is : ", X_test),
    ("Shape of y_train is : ", y_train),
    ("Shape of y_test is : ", y_test),
)
for label, split in shape_report:
    print(label, split.shape)

Shape of X_train is :  (112, 4)
Shape of X_test is :  (38, 4)
Shape of y_train is :  (112,)
Shape of y_test is :  (38,)


**Build the SVM Classification Model**

In [6]:
from sklearn.svm import SVC

In [7]:
# Baseline support-vector classifier with scikit-learn's default settings,
# fitted on the training split.
svc_model = SVC()
svc_model.fit(X=X_train, y=y_train)

In [8]:
# Predict on the held-out split first, then on the training split.
y_test_pred, y_train_pred = map(svc_model.predict, (X_test, X_train))

In [9]:
# Label on one line, score on the next.
print(
    "The accuracy score of the SVM model on test data is : ",
    accuracy_score(y_test, y_test_pred),
    sep="\n",
)

The accuracy score of the SVM model on test data is : 
0.9210526315789473


In [10]:
# Label on one line, score on the next.
print(
    "The accuracy score of the SVM model on train data is : ",
    accuracy_score(y_train, y_train_pred),
    sep="\n",
)

The accuracy score of the SVM model on train data is : 
0.9910714285714286


In [11]:
# Second SVM variant: same training data, but with a linear kernel.
svc_model2 = SVC(kernel="linear").fit(X_train, y_train)

# Predictions for the held-out split and the training split, in that order.
y_test_pred2, y_train_pred2 = (
    svc_model2.predict(features) for features in (X_test, X_train)
)

In [12]:
# Label on one line, score on the next.
print(
    "The accuracy score of the SVM model with linear kernel on test data is : ",
    accuracy_score(y_test, y_test_pred2),
    sep="\n",
)

The accuracy score of the SVM model with linear kernel on test data is : 
0.9473684210526315


In [13]:
# Label on one line, score on the next.
print(
    "The accuracy score of the SVM model with linear kernel on train data is : ",
    accuracy_score(y_train, y_train_pred2),
    sep="\n",
)

The accuracy score of the SVM model with linear kernel on train data is : 
1.0


In [14]:
# Per-class precision / recall / F1 of the linear-kernel SVM on the test split.
print(classification_report(y_true=y_test, y_pred=y_test_pred2))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      0.87      0.93        15
           2       0.83      1.00      0.91        10

    accuracy                           0.95        38
   macro avg       0.94      0.96      0.95        38
weighted avg       0.96      0.95      0.95        38



In [15]:
# Per-class precision / recall / F1 of the linear-kernel SVM on the training split.
print(classification_report(y_true=y_train, y_pred=y_train_pred2))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       1.00      1.00      1.00        35
           2       1.00      1.00      1.00        40

    accuracy                           1.00       112
   macro avg       1.00      1.00      1.00       112
weighted avg       1.00      1.00      1.00       112



**Building Decision Tree model**

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Unrestricted decision tree. random_state is pinned (to the same seed as the
# train/test split) because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can yield different results on each run.
dt_model = DecisionTreeClassifier(random_state=21)
dt_model.fit(X_train, y_train)

# NOTE: this rebinds y_test_pred, which held the SVM test predictions until now.
y_test_pred = dt_model.predict(X_test)

In [17]:
# Fraction of test rows the decision tree labels correctly.
dt_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(
    "The accuracy of the Decision Tree model on testing data is: ",
    dt_test_accuracy,
)

The accuracy of the Decision Tree model on testing data is:  0.9210526315789473


In [18]:
# Score the decision tree on the rows it was trained on.
y_train_pred = dt_model.predict(X_train)
dt_train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)

print(
    "The accuracy of the Decision Tree model on training data is: ",
    dt_train_accuracy,
)


The accuracy of the Decision Tree model on training data is:  1.0


**Pruning the decision tree**

In [19]:
# Depth-limited tree (at most two levels of splits). random_state is pinned to
# the same seed as the train/test split so that repeated runs build the same
# tree; without it, tie-breaking between equally good splits is random.
dt_pruned_model = DecisionTreeClassifier(max_depth=2, random_state=21)
dt_pruned_model.fit(X_train, y_train)

In [20]:
# Score the depth-limited tree on the held-out split.
y_test_pred_pruned = dt_pruned_model.predict(X_test)
dt_pruned_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred_pruned)

print(
    "The accuracy of the Decision Tree model on testing data is: ",
    dt_pruned_test_accuracy,
)

The accuracy of the Decision Tree model on testing data is:  0.8421052631578947


In [21]:
# Score the depth-limited tree on the rows it was trained on.
y_train_pred_pruned = dt_pruned_model.predict(X_train)

dt_pruned_train_accuracy = accuracy_score(y_train, y_train_pred_pruned)
# The label previously said "testing data" although the value printed here is
# the training accuracy.
print("The accuracy of the Decision Tree model on training data is: ", dt_pruned_train_accuracy)

The accuracy of the Decision Tree model on testing data is:  0.9821428571428571


**From the above, we can see that pruning the decision tree, although it sometimes helps tackle the overfitting problem, does not by itself enable the decision tree to perform well. In other words, pruning alone is not really enough to increase the performance of the decision tree.**

# Ensemble Models

**Building Random Forest Classifier**

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
# Random forest with default hyper-parameters. random_state is pinned (same
# seed as the train/test split) because bootstrap sampling and per-split
# feature sampling are random, so an unseeded forest is not reproducible.
rfc_model = RandomForestClassifier(random_state=21)

In [24]:
# Fit the forest on the training split; the fitted estimator is the cell output.
rfc_model.fit(X=X_train, y=y_train)

In [25]:
# Predict on the held-out split first, then on the training split.
y_test_pred_rfc, y_train_pred_rfc = map(rfc_model.predict, (X_test, X_train))

In [26]:
# Label on one line, score on the next.
print(
    "The accuracy of the RFC model on the test dataset is:",
    accuracy_score(y_test, y_test_pred_rfc),
    sep="\n",
)

The accuracy of the RFC model on the test dataset is:
0.9210526315789473


In [27]:
# Label on one line, score on the next.
print(
    "The accuracy of the RFC model on the train dataset is:",
    accuracy_score(y_train, y_train_pred_rfc),
    sep="\n",
)

The accuracy of the RFC model on the train dataset is:
1.0


In [28]:
# Much smaller forest (4 trees). With so few trees the outcome is especially
# sensitive to the random bootstrap / feature sampling, so random_state is
# pinned (same seed as the train/test split) to make the run reproducible.
rfc_model2 = RandomForestClassifier(n_estimators=4, random_state=21)
rfc_model2.fit(X_train, y_train)

# Predictions for the held-out split and the training split.
y_test_pred_rfc2 = rfc_model2.predict(X_test)
y_train_pred_rfc2 = rfc_model2.predict(X_train)

In [29]:
# Test-split accuracy of the 4-tree forest (rfc_model2).
print(
    "The accuracy of the RFC model on the test dataset is:",
    accuracy_score(y_test, y_test_pred_rfc2),
    sep="\n",
)

The accuracy of the RFC model on the test dataset is:
0.9210526315789473


In [30]:
# Training-split accuracy of the 4-tree forest (rfc_model2).
print(
    "The accuracy of the RFC model on the train dataset is:",
    accuracy_score(y_train, y_train_pred_rfc2),
    sep="\n",
)

The accuracy of the RFC model on the train dataset is:
1.0


**AdaBoost Classifier**

In [31]:
from sklearn.ensemble import AdaBoostClassifier

In [32]:
# AdaBoost ensemble of 80 boosting rounds. random_state is pinned (same seed as
# the train/test split) so the base estimators are seeded deterministically and
# the run is reproducible.
ada_model = AdaBoostClassifier(n_estimators=80, random_state=21)
ada_model.fit(X_train, y_train)



In [33]:
# Predict on the held-out split first, then on the training split.
y_test_pred_ada, y_train_pred_ada = (
    ada_model.predict(features) for features in (X_test, X_train)
)

In [34]:
# Label on one line, score on the next.
print(
    "The accuracy of the Adaboost model on the test dataset is:",
    accuracy_score(y_test, y_test_pred_ada),
    sep="\n",
)

The accuracy of the Adaboost model on the test dataset is:
0.9210526315789473


In [35]:
# Training-split accuracy of the AdaBoost model. The label previously said
# "test dataset" although the score is computed from y_train / y_train_pred_ada.
print("The accuracy of the Adaboost model on the train dataset is:")
print(accuracy_score(y_train, y_train_pred_ada))

The accuracy of the Adaboost model on the test dataset is:
1.0
