In [21]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB  
from sklearn.model_selection import GridSearchCV,cross_val_score, train_test_split
from sklearn.metrics import accuracy_score


train_df = pd.read_csv("digit-train.csv")

test_df = pd.read_csv("digit-test.csv")

train_df.isnull().sum()

train_df.head()


Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Separating the target variable (label) from the features (pixel values)
X = train_df.drop(columns=["label"])
y = train_df["label"]

X.head()


Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Spliting data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
# Hyperparameter tuning 
dt_pm_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}


In [25]:

decision_tree = DecisionTreeClassifier()
dt_gird = GridSearchCV(decision_tree, param_grid=dt_pm_grid, cv=3)
dt_gird.fit(X_train, y_train)

In [26]:
# Get the best Decision Tree model
dt_best = dt_gird.best_estimator_


In [27]:

# Hyperparameter tuning - Multinomial Naïve Bayes
nb_pm_grid = {
    'alpha': [0.01, 0.02, 0.05, 0.1,0.5]
}


In [28]:

multinomial_naive_bayes = MultinomialNB()
nb_grid = GridSearchCV(multinomial_naive_bayes, param_grid=nb_pm_grid, cv=3)
nb_grid.fit(X_train, y_train)


In [29]:
# Get the best Multinomial Naïve Bayes model
best_multinomial_naive_bayes = nb_grid.best_estimator_


In [30]:
# model evaluation 
dt_pred = dt_best.predict(X_valid)
mnb_pred = best_multinomial_naive_bayes.predict(X_valid)

dt_acc = accuracy_score(y_valid, dt_pred)
mnb_acc = accuracy_score(y_valid, mnb_pred)


In [31]:
print("Decision Tree Validation Accuracy:", dt_acc)
print("Multinomial Naïve Bayes Validation Accuracy:", mnb_acc)


Decision Tree Validation Accuracy: 0.7261904761904762
Multinomial Naïve Bayes Validation Accuracy: 0.8285714285714286


In [32]:
cross_val_scores_mnb = cross_val_score(multinomial_naive_bayes, X_train, y_train, cv=3)
multinomial_naive_bayes_accuracy = cross_val_scores_mnb.mean()
print("Multinomial Naïve Bayes 3-Fold CV Accuracy:", multinomial_naive_bayes_accuracy)

Multinomial Naïve Bayes 3-Fold CV Accuracy: 0.828466636878165


In [33]:
cross_val_scores_dt = cross_val_score(decision_tree, X_train, y_train, cv=3)
decision_tree_accuracy = cross_val_scores_dt.mean()
print("Decision Tree 3-Fold CV Accuracy:", decision_tree_accuracy)


Decision Tree 3-Fold CV Accuracy: 0.7254329971488148


In [34]:
print("Training Dataset Columns:", X_train.columns)
print("Testing Dataset Columns:", test_df.columns)

# aligning column names
X_test = test_df[X_train.columns]  

Training Dataset Columns: Index(['pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6',
       'pixel7', 'pixel8', 'pixel9',
       ...
       'pixel774', 'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779',
       'pixel780', 'pixel781', 'pixel782', 'pixel783'],
      dtype='object', length=784)
Testing Dataset Columns: Index(['label', 'pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5',
       'pixel6', 'pixel7', 'pixel8',
       ...
       'pixel774', 'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779',
       'pixel780', 'pixel781', 'pixel782', 'pixel783'],
      dtype='object', length=785)


In [35]:
# Load the testing dataset

y_test_df = test_df["label"]
test_df = test_df.drop(columns=["label"])


# predictions using the best Decision Tree model
dt_test_pred = dt_best.predict(test_df)

# predictions using the best Multinomial Naïve Bayes model
mnb_test_pred = best_multinomial_naive_bayes.predict(test_df)


In [36]:
# Compare the results from the two algorithms on testing data
dt_test_acc = accuracy_score(y_test_df, dt_test_pred)
mnb_test_acc = accuracy_score(y_test_df, mnb_test_pred)


In [37]:
print("Decision Tree Testing Accuracy:", dt_test_acc)
print("Multinomial Naïve Bayes Testing Accuracy:", mnb_test_acc)

Decision Tree Testing Accuracy: 0.7653644592663172
Multinomial Naïve Bayes Testing Accuracy: 0.8168175321581705
