In [None]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Load the training split; row 0 of the CSV supplies the column names.
train_df = pd.read_csv(
    "../input/sign_mnist_train.csv",
    header=0,
)

In [None]:
# Pull the target column out as a NumPy array before it is dropped from the frame.
labels = train_df.loc[:, "label"].values

In [None]:
# Remove the target column in place so only the remaining (pixel) columns stay.
# NOTE: this mutates train_df, so re-running this cell without reloading raises KeyError.
train_df.drop(columns=["label"], inplace=True)

In [None]:
# Build both views of the training pixels with vectorized reshapes instead of
# per-row Python loops:
#   images      -> (n_samples, 28, 28), one 2-D picture per row of train_df
#   images_flat -> (n_samples, 784), the flattened form fed to the scaler
# Passing the explicit row count (rather than -1) makes the reshape fail loudly
# if a row does not hold exactly 28 * 28 values, and keeps the shapes
# well-defined even for an empty frame.
images = np.array(train_df).reshape(len(train_df), 28, 28)
# .copy() keeps images_flat independent of `images`, as in the original cell.
images_flat = images.reshape(len(images), 28 * 28).copy()

In [None]:
scaler = StandardScaler()
images_scaled = scaler.fit_transform(images_flat)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(images_scaled, labels, test_size=0.25, random_state=42)

In [None]:
# Reduce the flattened pixel vectors to 120 principal components.
# random_state is pinned so that runs are repeatable when scikit-learn selects a
# randomized SVD solver for this problem size.
pca = PCA(n_components=120, random_state=42)

In [None]:
# 20-tree random forest. Bootstrap sampling and feature sub-sampling are random,
# so random_state is pinned to make the reported accuracies reproducible across
# Restart & Run All.
forest = RandomForestClassifier(n_estimators=20, random_state=42)

In [None]:
# Chain dimensionality reduction and the classifier. The step names ("pca",
# "forest") are the prefixes used for "<step>__<param>" keys in the grid search.
pipe = Pipeline(
    steps=[
        ("pca", pca),
        ("forest", forest),
    ]
)

In [None]:
# Fit PCA and then the forest on the training portion of the split.
pipe.fit(X=X_train, y=y_train)

### Calculate accuracy for the separated test set

In [None]:
# Score the pipeline on the held-out part of the train/validation split.
test_predictions = pipe.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# `precision` holds an accuracy percentage; the variable name is kept unchanged.
precision = accuracy_score(y_test, test_predictions) * 100
print("Accuracy with RandomForest: {0:.6f}%".format(precision))

# Predict on test images

In [None]:
# Load the separate test split; row 0 of the CSV supplies the column names.
test_df = pd.read_csv(
    "../input/sign_mnist_test.csv",
    header=0,
)

In [None]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

In [None]:
# Extract the targets as a NumPy array, then strip the column in place so only
# the remaining (pixel) columns stay in test_df.
# NOTE: the in-place drop means re-running this cell without reloading raises KeyError.
test_labels = test_df.loc[:, "label"].values
test_df.drop(columns=["label"], inplace=True)

In [None]:
# Rebuild `images` / `images_flat` from the TEST frame (this overwrites the
# training arrays of the same names) using vectorized reshapes instead of
# per-row Python loops:
#   images      -> (n_samples, 28, 28)
#   images_flat -> (n_samples, 784)
# Passing the explicit row count (rather than -1) makes the reshape fail loudly
# if a row does not hold exactly 28 * 28 values, and keeps the shapes
# well-defined even for an empty frame.
images = np.array(test_df).reshape(len(test_df), 28, 28)
# .copy() keeps images_flat independent of `images`, as in the original cell.
images_flat = images.reshape(len(images), 28 * 28).copy()

In [None]:
# Apply the already-fitted scaler to the test pixels (transform only, no refit).
test_images_scaled = scaler.transform(X=images_flat)

### Calculate accuracy for the original test set

In [None]:
# Score the pipeline on the separate test CSV.
test_predictions = pipe.predict(test_images_scaled)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# `precision` holds an accuracy percentage; the variable name is kept unchanged.
precision = accuracy_score(test_labels, test_predictions) * 100
print("Accuracy with RandomForest: {0:.6f}%".format(precision))

# Using a Pipeline & GridSearch

In [None]:
# Search space: keys follow the "<pipeline step>__<parameter>" naming, so they
# address the "pca" and "forest" steps of `pipe`.
param_dict = {
    "pca__n_components": [60, 80, 100],
    "forest__n_estimators": [20, 30, 40, 50],
}

# Exhaustive search over every combination above; verbose=2 logs each fit.
estimator = GridSearchCV(estimator=pipe, param_grid=param_dict, verbose=2)

In [None]:
# Run the grid search on the training portion of the split.
estimator.fit(X=X_train, y=y_train)

In [None]:
# Report the parameter combination the search selected.
best_params = estimator.best_params_
print("The best parameters: {0}".format(best_params))

In [None]:
# Copy the winning settings onto the original pipeline object.
# The trailing ';' suppresses the notebook's repr output.
tuned_params = dict(estimator.best_params_)
pipe.set_params(**tuned_params);

In [None]:
# Refit the pipeline on the training portion using its current parameters.
# The trailing ';' suppresses the notebook's repr output.
pipe.fit(X=X_train, y=y_train);

# Calculate accuracy with the new pipeline

In [None]:
# Score the refitted pipeline on the HELD-OUT part of the split. The original
# cell predicted on X_train / y_train, i.e. the very rows the pipeline had just
# been fitted on, which measures memorisation rather than generalisation.
test_predictions = pipe.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# `precision` holds an accuracy percentage; the variable name is kept unchanged.
precision = accuracy_score(y_test, test_predictions) * 100
print("Accuracy with RandomForest: {0:.6f}%".format(precision))

In [None]:
# Score the refitted pipeline on the separate test CSV.
test_predictions = pipe.predict(test_images_scaled)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# `precision` holds an accuracy percentage; the variable name is kept unchanged.
precision = accuracy_score(test_labels, test_predictions) * 100
print("Accuracy with RandomForest: {0:.6f}%".format(precision))