# Project Work - IRM24
## Jürgen Aumayr & Natalia Trudova

## Decision Trees

This code demonstrates how to create a decision tree classifier using scikit-learn's DecisionTreeClassifier class.

In [1]:
# Import libraries
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset (feature matrix in X1, class labels in y1)
iris = load_iris()
X1, y1 = iris.data, iris.target

# Split the data into train and test sets (half of the samples each, seeded so the split is repeatable)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.5, random_state=42)

# Create a decision tree classifier.
# The seed is fixed because scikit-learn randomly permutes the features at each split;
# without it the fitted tree (and the scores computed from it) can change between runs.
# This also matches the seeded random forest / gradient boosting models in this notebook.
clf = tree.DecisionTreeClassifier(random_state=42)

# Train the classifier
clf.fit(X1_train, y1_train)

DecisionTreeClassifier()

## Random Forests

This code demonstrates how to create a random forest classifier using scikit-learn's RandomForestClassifier class.

In [2]:
# Import libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Fetch the iris data and pull out the feature matrix and the label vector
iris = load_iris()
X2 = iris.data
y2 = iris.target

# Hold out half of the samples for testing; the seed keeps the split repeatable
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.5,
    random_state=42,
)

# Build a seeded random forest made of 100 trees
rf = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit on the training half; the fitted estimator is the cell's displayed value
rf.fit(X2_train, y2_train)

RandomForestClassifier(random_state=42)

In [3]:
# NOTE: disabled alternative experiment - the same random forest setup on a synthetic
# make_classification dataset instead of iris. It is kept commented out because
# running it would rebind X2, y2, the X2_*/y2_* splits and rf, which the evaluation
# cell further down reads.

# Import libraries
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.datasets import make_classification
#from sklearn.model_selection import train_test_split

# Generate a random dataset
#X2, y2 = make_classification(n_samples=1000, n_features=4, n_informative=2, n_redundant=0, random_state=42)

# Split the data into train and test sets
#X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Create a random forest classifier
#rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
#rf.fit(X2_train, y2_train)

## Gradient Boosting

This code demonstrates how to create a gradient boosting classifier using scikit-learn's GradientBoostingClassifier class.

In [4]:
# Import libraries
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Read in iris: the measurements become X3, the species labels y3
iris = load_iris()
y3 = iris.target
X3 = iris.data

# Seeded 50/50 partition into a training part and a test part
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3,
    random_state=42,
    test_size=0.5,
)

# Gradient boosting model with library defaults apart from the fixed seed
gb = GradientBoostingClassifier(random_state=42)

# Learn from the training part; the fitted model is what the cell displays
gb.fit(X3_train, y3_train)

GradientBoostingClassifier(random_state=42)

In [5]:
# NOTE: disabled alternative experiment - the same gradient boosting setup on the
# synthetic make_hastie_10_2 dataset (80/20 split) instead of iris. It is kept
# commented out because running it would rebind X3, y3, the X3_*/y3_* splits and gb,
# which the evaluation cell further down reads.

# Import libraries
#from sklearn.ensemble import GradientBoostingClassifier
#from sklearn.datasets import make_hastie_10_2
#from sklearn.model_selection import train_test_split

# Generate a synthetic dataset
#X3, y3 = make_hastie_10_2(random_state=42)

# Split the data into train and test sets
#X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=42)

# Create a gradient boosting classifier
#gb = GradientBoostingClassifier(random_state=42)

# Train the classifier
#gb.fit(X3_train, y3_train)

## Evaluating Models

Once we have trained our models, it's essential to evaluate their performance on a separate test set. This allows us to assess how well the models generalize to unseen data. Common evaluation metrics include accuracy, precision, recall, and F1-score.

In [6]:
# Import libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _weighted_scores(y_true, y_pred):
    """Score one set of predictions against the true labels.

    Parameters:
        y_true: true class labels of the test samples.
        y_pred: labels predicted by a model for the same samples.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        are computed with ``average='weighted'`` (per-class scores averaged by
        class support), which is needed because the target has more than two
        classes.
    """
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='weighted'),
        recall_score(y_true, y_pred, average='weighted'),
        f1_score(y_true, y_pred, average='weighted'),
    )


def _print_report(title, accuracy, precision, recall, f1):
    """Print the four metrics of one model under ``title`` in the fixed report layout."""
    print(title + ":")
    print("---------------------------------------")
    print("Accuracy:  " + str(accuracy))
    print("Precision: " + str(precision))
    print("Recall:    " + str(recall))
    # Two trailing newlines separate this report from the next one
    print("F1-Score:  " + str(f1) + "\n\n")


# Evaluate decision tree
y_pred_dt = clf.predict(X1_test)
accuracy_dt, precision_dt, recall_dt, f1_dt = _weighted_scores(y1_test, y_pred_dt)

# Evaluate random forest
y_pred_rf = rf.predict(X2_test)
accuracy_rf, precision_rf, recall_rf, f1_rf = _weighted_scores(y2_test, y_pred_rf)

# Evaluate gradient boosting
y_pred_gb = gb.predict(X3_test)
accuracy_gb, precision_gb, recall_gb, f1_gb = _weighted_scores(y3_test, y_pred_gb)

# Report all three models in the same layout
_print_report("Decision Tree", accuracy_dt, precision_dt, recall_dt, f1_dt)
_print_report("Random Forest", accuracy_rf, precision_rf, recall_rf, f1_rf)
_print_report("Gradient Boosting", accuracy_gb, precision_gb, recall_gb, f1_gb)

Decision Tree:
---------------------------------------
Accuracy:  0.9066666666666666
Precision: 0.907070707070707
Recall:    0.9066666666666666
F1-Score:  0.906622537431048


Random Forest:
---------------------------------------
Accuracy:  0.9866666666666667
Precision: 0.9872222222222223
Recall:    0.9866666666666667
F1-Score:  0.9866603624901497


Gradient Boosting:
---------------------------------------
Accuracy:  0.92
Precision: 0.92
Recall:    0.92
F1-Score:  0.92




In [7]:
# Import libraries
from sklearn import tree
import graphviz, os
# Make the Graphviz executables discoverable for the rendering step below.
# The hard-coded Windows install directory is only appended when it exists and is
# not already on PATH, so re-running this cell does not keep growing PATH and
# machines without that directory are left untouched.
graphviz_bin = 'C:/Program Files/Graphviz/bin'
search_path = os.environ.get("PATH", "")
if os.path.isdir(graphviz_bin) and graphviz_bin not in search_path.split(os.pathsep):
    os.environ["PATH"] = search_path + os.pathsep + graphviz_bin

# Visualize the decision tree: export the fitted tree `clf` as DOT source,
# labelled with the iris feature and class names
dot_data = tree.export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string instead of writing it to a file
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
# Render to disk under the base name "iris_tree"; the returned file name is the cell's output
graph.render("iris_tree")

'iris_tree.pdf'

In [8]:
# Import libraries
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Synthetic two-class "circles" data: 100 noisy points, seeded for repeatability
X, y = make_circles(
    n_samples=100,
    noise=0.2,
    factor=0.2,
    random_state=42,
)

# Keep 20% of the points aside as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and fit the three seeded models on the same training data.
# Note: `rf` and `gb` are rebound here, replacing the models fitted on iris earlier.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out points with every model ...
y_pred_dt, y_pred_rf, y_pred_gb = (
    model.predict(X_test) for model in (dt, rf, gb)
)

# ... and score each prediction against the true test labels
accuracy_dt, accuracy_rf, accuracy_gb = (
    accuracy_score(y_test, predicted)
    for predicted in (y_pred_dt, y_pred_rf, y_pred_gb)
)

# Report the accuracies rounded to two decimals
print(f'Decision Tree Accuracy: {accuracy_dt:.2f}')
print(f'Random Forest Accuracy: {accuracy_rf:.2f}')
print(f'Gradient Boosting Accuracy: {accuracy_gb:.2f}')

Decision Tree Accuracy: 0.95
Random Forest Accuracy: 0.95
Gradient Boosting Accuracy: 0.95
