# 1. Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc,precision_recall_curve,f1_score,confusion_matrix
from sklearn.metrics import accuracy_score, precision_score,recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from datetime import datetime


import warnings
warnings.filterwarnings(action="ignore")

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the training data from train.csv and print its (rows, columns) shape
df = pd.read_csv('train.csv')
print(df.shape)

In [None]:
# Load the test data from test.csv and print its (rows, columns) shape
df1 = pd.read_csv('test.csv')
print(df1.shape)

# 2. Exploration

In [None]:
# Largest per-column count of missing values; 0 means no column contains a NaN
df.isna().sum().max()

In [None]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Preview the first 5 rows to sanity-check the column layout
df.head()

In [None]:
# Print the DataFrame summary (index range, column dtypes, memory usage)
df.info()

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Plot the Histogram for digit occurrences
def plot_label_counts(df):
    """Draw a bar chart of how many rows carry each value of ``df['label']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``label`` column.

    Returns
    -------
    None
        The chart is rendered with ``plt.show()``.
    """
    # Number of rows per distinct label
    label_counts = df['label'].value_counts()

    plt.bar(label_counts.index, label_counts.values, color='navy')

    # Put one tick under every bar; the default locator only labels some of
    # the digits, which makes individual bars hard to identify.
    plt.xticks(label_counts.index)

    plt.title('Number of Occurrences of Each Digit')
    plt.xlabel('Digit')
    plt.ylabel('Frequency')

    plt.show()


plot_label_counts(df)

In [None]:
# Plot the histogram for pixel-distribution
def plot_pixel_histogram(df):
    """Plot a 256-bin histogram of every pixel value in ``df``.

    All columns other than ``label`` are treated as pixel data and pooled
    into a single distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``label`` column plus the pixel columns.
    """
    # Pool every non-label cell into one flat 1-D array
    pixels_only = df.drop('label', axis=1)
    pixel_values = pixels_only.to_numpy().ravel()

    # Draw on the current axes, exactly where the pyplot shortcuts would draw
    ax = plt.gca()
    ax.hist(pixel_values, bins=256, color='blue', alpha=0.7)
    ax.set_title('Histogram of Pixel Values')
    ax.set_xlabel('Pixel Value (0-255)')
    ax.set_ylabel('Frequency')

    plt.show()


plot_pixel_histogram(df)

In [None]:
# Display Samples Digits

# Select the rows and reshape them to 28x28 images
def display_samples(df, num_samples=5):
    """Show a grid of randomly sampled images, one row of the grid per label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column; every other column is treated as pixel
        data. Each row must hold 784 pixel values, because it is reshaped to
        a 28x28 image.
    num_samples : int, default 5
        Number of images to draw per label. If a label has fewer rows than
        this, all of its rows are shown and the remaining cells stay blank.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Iterate over the labels that actually occur instead of assuming they
    # are exactly 0..n-1.
    labels = sorted(df['label'].unique())
    # Select pixels by name so the position of the 'label' column is irrelevant.
    pixel_columns = df.columns.drop('label')

    # squeeze=False keeps `axs` two-dimensional even for a single row/column,
    # so axs[row, col] indexing always works.
    fig, axs = plt.subplots(
        len(labels), num_samples,
        figsize=(num_samples, len(labels)),
        squeeze=False,
    )

    for row, label in enumerate(labels):
        rows_for_label = df.loc[df['label'] == label, pixel_columns]
        # Never request more rows than exist, which would make .sample() raise.
        digits = rows_for_label.sample(min(num_samples, len(rows_for_label)))

        for col, ax in enumerate(axs[row]):
            ax.axis('off')
            if col < len(digits):
                ax.imshow(digits.iloc[col].values.reshape(28, 28), cmap=plt.cm.binary)

    plt.show()

display_samples(df)

# 3. Modelling

In [None]:
# Separate the target column from the feature columns
y = df['label']
X = df.drop(columns='label')

# Standardise the features: learn the scaling statistics, then apply them.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split below, so validation rows influence the scaling statistics - consider
# fitting on the training split only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
# Hold out 20% of the scaled rows for validation (fixed seed for a repeatable split)
X_train_scaled, X_val_scaled, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# Display name followed by the four splits, in the argument order the
# modelling function expects
plain_set = [
    'Random Forest Classifier without PCA',
    X_train_scaled,
    X_val_scaled,
    y_train,
    y_val,
]

In [None]:
# Initialize PCA - 0.95 keeps the minimum number of components that retain 95% of the variance
pca = PCA(0.95)

# Record the start time
start = datetime.now()

# Fit PCA on the scaled features and project them onto the retained components
X_pca = pca.fit_transform(X_scaled)

# Elapsed time; kept as a timedelta so later cells can still use it
pca_time = datetime.now() - start
# Print a plain number of seconds: formatting the timedelta itself would
# yield "H:MM:SS.ffffff seconds", which is misleading.
print(f"PCA took {pca_time.total_seconds():.2f} seconds")

# Output the number of components used
print(f"Number of components: {pca.n_components_}")

In [None]:
# Split the PCA-reduced features (X_pca, not X_scaled - otherwise the "with PCA"
# run would train on exactly the same data as the run without PCA).
# The same test_size and random_state as the plain split select the same rows.
X_pca_train_scaled, X_pca_val_scaled, y_pca_train, y_pca_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)
pca_set = ['Random Forest Classifier with PCA', X_pca_train_scaled, X_pca_val_scaled, y_pca_train, y_pca_val]

In [None]:
## Defining Random Forest Classifier

def random_forest_classifier(X_train, X_val, y_train, y_val, param_grid=None, cv=5):
    """Tune a random forest by cross-validated grid search and score it on held-out data.

    Parameters
    ----------
    X_train, y_train
        Features and labels used for the grid search (and the final fit).
    X_val, y_val
        Held-out features and labels used only for the final accuracy.
    param_grid : dict, optional
        Grid passed to ``GridSearchCV``. Defaults to the grid defined below.
    cv : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(best_parameters, best_score, rc_time, accuracy)``:
        the best parameter combination, its cross-validation score, the
        time spent in the grid search as a ``datetime.timedelta``, and the
        accuracy of the best model on ``X_val``.
    """
    if param_grid is None:
        param_grid = {
            # number of trees in the forest; must be >= 1 (0 makes every fit fail)
            'n_estimators': [50, 100, 250],
            # maximum depth of the tree
            'max_depth': [None, 50, 100],
            # minimum number of samples required to split an internal node
            'min_samples_split': [5, 10, 15],
            # features considered per split; 'auto' is omitted because it was
            # an alias of 'sqrt' for classifiers and is rejected by current
            # scikit-learn releases
            'max_features': ['sqrt', 'log2'],
            # to make output consistent across multiple function calls
            'random_state': [42],
        }

    # Record the start time
    start = datetime.now()

    # Exhaustive search over the grid, scored by accuracy, using all CPU cores
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=param_grid,
        cv=cv,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Time spent searching (includes GridSearchCV's final refit)
    rc_time = datetime.now() - start

    best_parameters = grid_search.best_params_
    best_score = grid_search.best_score_

    # GridSearchCV refits the best parameter combination on all of X_train by
    # default, so reuse that model instead of training an identical one again.
    rfc_best = grid_search.best_estimator_

    # Evaluate the tuned model on the validation data
    y_val_pred = rfc_best.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)

    return best_parameters, best_score, rc_time, accuracy


In [None]:
# Each entry is [display name, X_train, X_val, y_train, y_val]
sets = [plain_set, pca_set]

for value in sets:
    best_parameters, best_score, rc_time, accuracy = random_forest_classifier(*value[1:])
    print(value[0], ':')
    # rc_time is a timedelta; '%f' needs a real number, so convert to seconds
    # first (formatting the timedelta directly raises TypeError).
    print('Modelling-Time: %f seconds' % rc_time.total_seconds())
    # Print the best parameters found by the grid search
    print("Best Parameters: ", best_parameters)
    # Print the cross-validation score of the best parameters
    print("Best Score: ", best_score)
    # Print the accuracy on the held-out validation set
    print('Validation Accuracy:', accuracy)