In [1]:
import time
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the covtype dataset
covtype = fetch_covtype()

In [4]:
# Extract features and target
X = covtype.data
y = covtype.target

In [5]:
# split data to folds
def kfold_indices(data, k):
    fold_size = len(data) // k
    indices = np.arange(len(data))
    folds = []
    for i in range(k):
        test_indices = indices[i * fold_size: (i + 1) * fold_size]
        train_indices = np.concatenate([indices[:i * fold_size], indices[(i + 1) * fold_size:]])
        folds.append((train_indices, test_indices))
    return folds

In [6]:
# feature extraction PCA
st = time.time()
pca = PCA(n_components=25)
X_ext = pca.fit_transform(X)
en = time.time()
print("PCA time taken: ", en - st)

PCA time taken:  4.160186052322388


In [7]:
# feature selection SelectKBset
st = time.time()

#Feature selection part:
sel = SelectKBest(score_func=f_classif, k=10)  # use K-best function
X_sel = sel.fit_transform(X_ext, y)

en = time.time()
print("SelectKBset time taken: ", en - st)

SelectKBset time taken:  0.25893139839172363


In [8]:
# Get the fold indices
fold_indices = kfold_indices(X_sel, 10)

In [9]:
# Specify the maximum depth of the tree
max_depth = 10  # I adjust this value for the tests

# Create the model with the specified maximum depth
model = DecisionTreeClassifier(max_depth=max_depth, max_features= "sqrt")

In [10]:
scores = []
st = time.time()
# Iterate through each fold
for train_indices, test_indices in fold_indices:
    X_train, y_train = X_sel[train_indices], y[train_indices]
    X_test, y_test = X_sel[test_indices], y[test_indices]

    # Train the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Calculate the accuracy score for this fold
    fold_score = accuracy_score(y_test, y_pred)

    # Append the fold score to the list of scores
    scores.append(fold_score)

en = time.time()

In [11]:
print("Classifier time taken: ", en - st)
# Calculate the mean accuracy across all folds

Classifier time taken:  28.571475744247437


In [12]:
mean_accuracy = np.mean(scores)
print("Mean Accuracy:", mean_accuracy)
print("K-Fold Cross-Validation Scores:", scores)

Mean Accuracy: 0.6191012202888074
K-Fold Cross-Validation Scores: [0.6649283144868419, 0.811862102201339, 0.6945663585824685, 0.5964613345725547, 0.5755322627837731, 0.6073045214368084, 0.5171339563862928, 0.5754462057451679, 0.5605067038433074, 0.5872704428495207]
