# Optical Character Recognition

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
# Read the training and test CSV files into two DataFrames.
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('Letters_train.csv', 'Letters_test.csv')
)

## Predicting whether a letter is B or not

### Baseline Model

In [3]:
# Binary target: 'Yes' when the letter is B, 'No' otherwise.
# A vectorised comparison replaces the per-row lambda.
train_data['isB'] = np.where(train_data['letter'] == 'B', 'Yes', 'No')
test_data['isB'] = np.where(test_data['letter'] == 'B', 'Yes', 'No')

# The baseline always predicts one class. Learn that class from the training
# labels (their most frequent value) instead of hard-coding 'No', then score
# it on the test labels.
baseline_1_class = train_data['isB'].mode()[0]
baseline_1_acc = (test_data['isB'] == baseline_1_class).mean()
print(f'Baseline Test Accuracy: {baseline_1_acc:.4f}')

Baseline Test Accuracy: 0.7743


### Logistic Regression

In [5]:
# Columns excluded from the predictors: 'Unnamed: 0' (presumably the row index
# saved with the CSV -- confirm), the raw letter, and the derived binary target.
non_feature_cols = ['Unnamed: 0', 'letter', 'isB']

X_train = train_data.drop(columns=non_feature_cols)
X_test = test_data.drop(columns=non_feature_cols)
y_train = train_data['isB']
y_test = test_data['isB']

# Fit on the training split, then predict labels for the test split.
logistic_model = LogisticRegression(max_iter=1000, random_state=2023)
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

model_1b_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Logistic Regression Test Accuracy: {model_1b_acc:.4f}')

Logistic Regression Test Accuracy: 0.9401


### AUC

In [6]:
# predict_proba returns one column per class, ordered as in `classes_`.
# Look up the 'Yes' column explicitly instead of assuming it sits at index 1.
positive_class_idx = list(logistic_model.classes_).index('Yes')
y_pred_proba = logistic_model.predict_proba(X_test)[:, positive_class_idx]

# Pass a boolean target so the positive class is unambiguous and does not
# depend on how the string labels happen to sort.
model_1b_auc = roc_auc_score(y_test == 'Yes', y_pred_proba)

print(f'Logistic Regression Test AUC: {model_1b_auc:.4f}')

Logistic Regression Test AUC: 0.9785


### Cross-validated CART

To select `ccp_alpha`, I ran a standard grid search over 100 evenly spaced candidate values between 0.0001 and 0.02. The training data was split into five equal folds, and each fold took a turn as the validation set while the tree was trained on the other four, i.e. 5-fold cross-validation. The `ccp_alpha` chosen was the one with the highest average validation accuracy across the five folds. This finds the sweet spot where the tree is complex enough to capture the structure in the data without becoming so complex that it fails to generalize to new data.

In [7]:
# Candidate pruning strengths: 100 evenly spaced values from 0.0001 to 0.02.
ccp_alpha_values = np.linspace(0.0001, 0.02, 100)
param_grid = {'ccp_alpha': ccp_alpha_values}

# 5-fold cross-validated grid search over ccp_alpha, scored by accuracy.
cart_model = DecisionTreeClassifier(random_state=2023)
cart_cv = GridSearchCV(
    estimator=cart_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
cart_cv.fit(X_train, y_train)

# Keep the winning tree and its alpha, then score the tree on the test split.
best_cart_model = cart_cv.best_estimator_
model_1d_best_ccp_alpha = cart_cv.best_params_['ccp_alpha']
y_pred_cart = best_cart_model.predict(X_test)
model_1d_acc = accuracy_score(y_true=y_test, y_pred=y_pred_cart)

print(f'CV CART Test Accuracy: {model_1d_acc:.4f}')
print(f'Best ccp_alpha: {model_1d_best_ccp_alpha:.4f}')

CV CART Test Accuracy: 0.9401
Best ccp_alpha: 0.0007


### Random Forest


In [None]:
# Random forest with only the seed specified; fit on the binary target.
random_forest_model = RandomForestClassifier(random_state=2023)
random_forest_model.fit(X_train, y_train)

# Score the forest's label predictions on the test split.
y_pred_rf = random_forest_model.predict(X_test)
model_1e_acc = accuracy_score(y_true=y_test, y_pred=y_pred_rf)

print(f'Random Forest Test Accuracy: {model_1e_acc:.4f}')

Random Forest Test Accuracy: 0.9840


### Performance Comparison

Comparing logistic regression, CART, and Random Forest on the test set, Random Forest is clearly the most accurate (0.9840). Logistic regression and CART have identical test accuracy (0.9401), so Random Forest is the best of the three models for this task.

In [None]:
# One row per binary-task model, pairing its name with its test accuracy.
performance_comparison = pd.DataFrame(
    [
        ('Logistic Regression', model_1b_acc),
        ('CART', model_1d_acc),
        ('Random Forest', model_1e_acc),
    ],
    columns=['Model', 'Accuracy'],
)

performance_comparison

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.940107
1,CART,0.940107
2,Random Forest,0.983957


## Predicting whether a letter is A, B, P, or R

### Baseline Model

In [9]:
# Multiclass targets are the raw letter labels.
y_train_multiclass = train_data['letter']
y_test_multiclass = test_data['letter']

# Majority-class baseline: predict the most frequent training letter for
# every test row.
most_frequent_class = y_train_multiclass.mode()[0]
baseline_predictions = [most_frequent_class for _row in range(len(y_test_multiclass))]

baseline_2_acc = accuracy_score(y_true=y_test_multiclass, y_pred=baseline_predictions)
print(f'Baseline Test Accuracy: {baseline_2_acc:.4f}')

Baseline Test Accuracy: 0.2439


### LDA

In [10]:
# Linear discriminant analysis with default settings on the letter target.
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train_multiclass)

# Score the predicted letters on the test split.
y_pred_lda = lda_model.predict(X_test)
model_2b_acc = accuracy_score(y_true=y_test_multiclass, y_pred=y_pred_lda)

print(f'LDA Test Accuracy: {model_2b_acc:.4f}')

LDA Test Accuracy: 0.9102


### Cross-validated CART

To tune the `ccp_alpha` parameter of the CART model for the multi-class classification problem, I used a grid search with 5-fold cross-validation over the same grid of candidate values as in the binary task. The training data was divided into five subsets, and the model was trained on four of them and validated on the fifth, rotating through all five. Each `ccp_alpha` value was scored by its average accuracy across the validation sets, and the value with the highest average accuracy was selected for the final model. This approach guards against a tree that is too complex (which would risk overfitting) or too simple (which would not capture sufficient detail), aiming for a balance that generalizes well to unseen data.

In [11]:
# Cross-validated pruning search on the multiclass letter target. `param_grid`
# is the ccp_alpha grid built in the binary CART cell earlier in the notebook.
cart_model_multiclass = DecisionTreeClassifier(random_state=2023)
cart_cv_multiclass = GridSearchCV(cart_model_multiclass, param_grid, cv=5, scoring='accuracy')
cart_cv_multiclass.fit(X_train, y_train_multiclass)

# Keep the winning alpha and the tree fitted with it, then score on the test split.
model_2c_best_ccp_alpha = cart_cv_multiclass.best_params_['ccp_alpha']
best_cart_model_multiclass = cart_cv_multiclass.best_estimator_
y_pred_cart_multiclass = best_cart_model_multiclass.predict(X_test)
model_2c_acc = accuracy_score(y_test_multiclass, y_pred_cart_multiclass)

print(f'CART Test Accuracy: {model_2c_acc:.4f}')
# Report the selected alpha too, matching the binary CART cell's output.
print(f'Best ccp_alpha: {model_2c_best_ccp_alpha:.4f}')

CART Test Accuracy: 0.9294


### Vanilla Bagging

In [None]:
# Bagging expressed as a random forest allowed to consider every predictor at
# each split: max_features equals the number of columns in X_train.
n_predictors = X_train.shape[1]
vanilla_bagging_model = RandomForestClassifier(random_state=2023, max_features=n_predictors)
vanilla_bagging_model.fit(X_train, y_train_multiclass)

# Score the predicted letters on the test split.
y_pred_vanilla_bagging = vanilla_bagging_model.predict(X_test)
model_2d_acc = accuracy_score(y_true=y_test_multiclass, y_pred=y_pred_vanilla_bagging)

print(f'No CV Random Forest Test Accuracy: {model_2d_acc:.4f}')

No CV Random Forest Test Accuracy: 0.9476


### Cross-validated Random Forest

To tune the Random Forest model, I ran a grid search with 5-fold cross-validation over `max_features`, trying every value from 1 up to the total number of features. This parameter controls how many features are considered when splitting each node in the decision trees. Cross-validation partitioned the training set into five distinct subsets, training the model on four of them and validating on the remaining one, rotating through all five. The `max_features` value with the highest average validation accuracy was selected, suggesting that this setting lets the ensemble of trees generalize best from the training data to unseen data. The optimal value of `max_features` indicates that using a small subset of features at each split can enhance the model's diversity and predictive power.

In [None]:
# Candidate max_features values: every integer from 1 to the number of predictors.
max_features_range = np.arange(1, X_train.shape[1] + 1, 1)
param_grid_rf = {'max_features': max_features_range}

# 5-fold cross-validated grid search over max_features, scored by accuracy.
rf_cv_model = GridSearchCV(RandomForestClassifier(random_state=2023), param_grid_rf, cv=5, scoring='accuracy')
rf_cv_model.fit(X_train, y_train_multiclass)

# Keep the winning setting and the forest fitted with it, then score on the test split.
model_2e_best_max_features = rf_cv_model.best_params_['max_features']
best_rf_model = rf_cv_model.best_estimator_
y_pred_rf_cv = best_rf_model.predict(X_test)
model_2e_acc = accuracy_score(y_test_multiclass, y_pred_rf_cv)

print(f'CV Random Forest Test Accuracy: {model_2e_acc:.4f}')
# Show the selected value so the accompanying discussion of it can be checked.
print(f'Best max_features: {model_2e_best_max_features}')

CV Random Forest Test Accuracy: 0.9765


### Gradient Boosting Classifier

In [None]:
# Gradient boosting with 200 estimators and max_leaf_nodes=10, seed fixed.
gbc_model = GradientBoostingClassifier(
    n_estimators=200,
    max_leaf_nodes=10,
    random_state=2023,
)
gbc_model.fit(X_train, y_train_multiclass)

# Score the predicted letters on the test split.
y_pred_gbc = gbc_model.predict(X_test)
model_2f_acc = accuracy_score(y_true=y_test_multiclass, y_pred=y_pred_gbc)

print(f'GBC Test Accuracy: {model_2f_acc:.4f}')

GBC Test Accuracy: 0.9701
