In [None]:
# essential tools
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

# Load the training CSV. The first column is taken as the target (y) and
# every remaining column as an independent feature (X).
data = pd.read_csv('/kaggle/input/Kannada-MNIST/train.csv')
y = data.iloc[:, 0]
X = data.iloc[:, 1:]

# Load the test CSV. The first column is set aside as the row ids and the
# remaining columns are the features to predict on.
test_data = pd.read_csv('/kaggle/input/Kannada-MNIST/test.csv')
test_ids = test_data.iloc[:, 0]
X_test = test_data.iloc[:, 1:]

# Stack the train and test rows into one frame so the PCA below is fitted on
# both sets, then discard the 'id' column and renumber the rows.
all_data = pd.concat([data, test_data], axis=0, ignore_index=True)
all_data = all_data.drop(columns=['id']).reset_index(drop=True)

# Split the stacked frame into its first column and all remaining columns.
# Only X_full is used to fit the PCA; y_full is not referenced again in the
# visible cells.
y_full = all_data.iloc[:, 0]
X_full = all_data.iloc[:, 1:]

# A float n_components together with svd_solver='full' asks PCA to keep as
# many components as are needed to explain that fraction of the variance.
principal_components = PCA(svd_solver='full', n_components=0.95)

# Fit the PCA on the stacked features and report wall-clock timings.
pca_start = datetime.now()
principal_components.fit(X_full)
pca_end = datetime.now()
print(f'\nPCA Fitting - Start Time: {pca_start}')
print(f'PCA Fitting - End Time: {pca_end}')
print(f'PCA - Time to Fit: {pca_end-pca_start}')

# Project the training features onto the fitted principal components.
# (This step is not timed.)
X_pca = principal_components.transform(X)

# Create a new RandomForestClassifier instance, fit it on the PCA-transformed
# training data, and report wall-clock timings.
randomforest2 = RandomForestClassifier(n_estimators=10,
                                      max_features='sqrt',
                                      bootstrap=True,
                                      n_jobs=1,
                                      random_state=0)
rf2_start=datetime.now()
# Fit on X_pca so the model is trained in the same PCA space as the
# X_test_pca matrix that is later passed to predict().
randomforest2.fit(X_pca, y)
rf2_end=datetime.now()
print(f'\nRandom Forest w/ PCA Fitting - Start Time: {rf2_start}')
print(f'Random Forest w/ PCA Fitting - End Time: {rf2_end}')
print(f'Random Forest w/ PCA - Time to Fit: {rf2_end-rf2_start}')

# Cross-validate the forest on the PCA-transformed training data (default
# cv splitting) and time the whole scoring step, including the averaging.
rf2_score_start = datetime.now()
cv_scores = cross_val_score(estimator=randomforest2, X=X_pca, y=y)
avg_cv_score = cv_scores.mean()
rf2_score_end = datetime.now()

# Report scoring timings.
print(f'\nRandom Forest Scoring - Start Time: {rf2_score_start}')
print(f'Random Forest Scoring - End Time: {rf2_score_end}')
print(f'Random Forest Scoring - Time to Fit: {rf2_score_end-rf2_score_start}')

# Project the test features with the already-fitted PCA transformer and
# report how long the transform took.
test_pca_transform_start = datetime.now()
X_test_pca = principal_components.transform(X=X_test)
test_pca_transform_end = datetime.now()
print(f'\nPCA Test Set Transform - Start Time: {test_pca_transform_start}')
print(f'PCA Test Set Transform - End Time: {test_pca_transform_end}')
print(f'PCA Test Set Transform - Time to Transform: {test_pca_transform_end-test_pca_transform_start}')

# Predict labels for the PCA-transformed test set with the fitted forest and
# report how long prediction took.
rf2_test_predict_start = datetime.now()
test_predictions2 = randomforest2.predict(X=X_test_pca)
rf2_test_predict_end = datetime.now()
print(f'\nRandom Forest w/ PCA Predict - Start Time: {rf2_test_predict_start}')
print(f'Random Forest w/ PCA Predict - End Time: {rf2_test_predict_end}')
print(f'Random Forest w/ PCA - Time to Predict: {rf2_test_predict_end-rf2_test_predict_start}')

# Show the per-fold cross-validation scores and their mean.
print(f'\nCross-validation scores: {cv_scores}')
print(f'Average CV Score: {avg_cv_score}')

# Build the submission table: the test ids in an 'id' column next to the
# predicted labels in a 'label' column. Note that test_predictions2 is
# rebound here from the raw prediction array to this DataFrame.
test_predictions2 = test_ids.to_frame(name='id').assign(label=test_predictions2)

# Write the submission file without the row index.
test_predictions2.to_csv('submission.csv',
                         index=False,
                         header=['id', 'label'])

In [None]:
!pip install reshape

In [None]:
from sklearn.cluster import KMeans

# Cluster the PCA-transformed training features into 10 groups.
# random_state is fixed (same seed as the random forest) so the centroid
# initialisation, and therefore the cluster labels, are reproducible when the
# notebook is re-run.
kmeans = KMeans(n_clusters=10, random_state=0)
X_clustered = kmeans.fit_predict(X_pca)