In [1]:
#import packages and image data
import numpy as np
import math
import matplotlib.pyplot as plt
import scipy.io as spio
import scipy.sparse.linalg as ll
import sklearn.preprocessing as skpp
import pandas as pd
from scipy import sparse
import sklearn.utils.graph_shortest_path as sk
import scipy.sparse.linalg as ll
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#import the images
from PIL import Image
import zipfile
from zipfile import ZipFile
from scipy.stats import multivariate_normal as mvn

#import Mars orbital images
# specifying the zip file name
file_name = "map-proj.zip"
#create a data array of images where each row is a vectorized version of an image
archive = zipfile.ZipFile('map-proj.zip', 'r')
imagefile = archive.open(archive.namelist()[0])
img=Image.open(imagefile)
numpydata = np.asarray(img)
data=numpydata.reshape(1, 227*227) #create first row of data
# create the rest of the rows of data
with ZipFile(file_name, 'r') as zip:
    for i in range(1,3823):
        imagefile = zip.open(zip.namelist()[i])
        img=Image.open(imagefile)
        numpydata = np.asarray(img)
        imgarray=numpydata.reshape(1, 227*227)
        data=np.concatenate((data,imgarray), axis=0)

#remove the rows 791, 951, 3809 because they have no labels
data = np.delete(data,[791,951,3809],0)
# archive = zipfile.ZipFile('map-proj.zip', 'r')
# imgfile = archive.open('map-proj/PSP_009754_2205_RED-0031.jpg')
# imgfile

# pic=data[0,:]
# pic=np.reshape(pic,(227,227))
# pic = (pic).astype(np.uint8)
# im = Image.fromarray(pic,mode='L')

#im.show()
#im.save('result.png')

#create a dictionary that maps label to category name
class_map = {0: 'other',
             1: 'crater',
             2: 'dark_dune',
             3: 'streak',
             4: 'bright_dune',
             5: 'impact',
             6: 'edge'}

# create reverse dictionary
reverse_class_map = {v: k for k,v in class_map.items()}

#upload the labels as pd dataframe
labels_df = pd.read_csv('labels-map-proj.txt', sep=" ", header=None)
labels_df.columns = ["name",'label']
labels_df

#create an np array of labels that corresponds to the rows in data (labels_df is in different order from data)
#also print out the three rows that have no label
labels_list=[]
for i in range(3823):
    name = archive.namelist()[i].split('map-proj/')[1] 
    if name in set(labels_df['name']):
        labels_list.append([int(labels_df.loc[labels_df['name']==name]['label'])])
    else:
        print(i)
labels=np.array(labels_list)

#create dataset consisting of 'crater', 'dark dune', 'streak', and 'bright dune' only
indices_main=np.where(labels==[1,2,3,4])[0]
data_main=data[indices_main]
labels_main=labels[indices_main] #create corresponding labels for data_main

#create dataset for only the 'other' category
indices_other=np.where(labels==0)[0]
data_other=data[indices_other]
labels_other=labels[indices_other] #create corresponding labels for data_other

#create dataset for only the 'crater' category
indices_crater=np.where(labels==1)[0]
data_crater=data[indices_crater]
labels_crater=labels[indices_crater] #create corresponding labels for data_crater

#create dataset for only the 'dark dune' category
indices_dark=np.where(labels==2)[0]
data_dark=data[indices_dark]
labels_dark=labels[indices_dark] #create corresponding labels for data_dark

#create dataset for only the 'bright dune' category
indices_bright=np.where(labels==4)[0]
data_bright=data[indices_bright]
labels_bright=labels[indices_bright] #create corresponding labels for data_bright

#create dataset for only the 'streak' category
indices_streak=np.where(labels==3)[0]
data_streak=data[indices_streak]
labels_streak=labels[indices_streak] #create corresponding labels for data_streak


791
951
3809


In [2]:
#Define X and Y using data_main
X = data_main
labels_main=np.reshape(labels_main,(970,))
Y = labels_main

#Splitting the dataset into the Training set and Test set
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state = 2)

In [3]:
#scale by 255
X_train=X_train/255
X_test=X_test/255

In [4]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

In [5]:
#perform logistic regression

#Fitting Logistic Regression to the Training set
classifier = LogisticRegression(max_iter=20000)
classifier.fit(X_train,Y_train)

#Predicting the Test set results
Y_pred = classifier.predict(X_test)

#Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
print(cm.trace()/len(Y_test))
print(cm)

from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred))

0.634020618556701
[[56 11  4  2]
 [16 57  0 11]
 [ 0  1  2  1]
 [14  6  5  8]]
              precision    recall  f1-score   support

           1       0.65      0.77      0.70        73
           2       0.76      0.68      0.72        84
           3       0.18      0.50      0.27         4
           4       0.36      0.24      0.29        33

    accuracy                           0.63       194
   macro avg       0.49      0.55      0.49       194
weighted avg       0.64      0.63      0.63       194



In [6]:
#Perform cross validation to find the optimal number of neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)
  
# defining parameter range
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', return_train_score=False,verbose=1)
  
# fitting the model for grid search
grid_search=grid.fit(X_train, Y_train) 
# print optimal number of neighbors
print(grid_search.best_params_)
# print corresponding training accuracy
accuracy = grid_search.best_score_ *100
print("Accuracy for our training dataset with tuning is : {:.2f}%".format(accuracy) )

Fitting 10 folds for each of 30 candidates, totalling 300 fits
{'n_neighbors': 3}
Accuracy for our training dataset with tuning is : 71.90%


In [7]:
#Perform KNN using k=3 neighbors
from sklearn.neighbors import KNeighborsClassifier
classifierKNN = KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski', p =2) #euclidean metric
classifierKNN.fit(X_train,Y_train)

#Predicting the Test set results
Y_pred = classifierKNN.predict(X_test)

#Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
print(cm.trace()/len(Y_test))
print(cm)

from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred))

0.7268041237113402
[[67  2  3  1]
 [15 68  1  0]
 [ 1  0  3  0]
 [17  6  7  3]]
              precision    recall  f1-score   support

           1       0.67      0.92      0.77        73
           2       0.89      0.81      0.85        84
           3       0.21      0.75      0.33         4
           4       0.75      0.09      0.16        33

    accuracy                           0.73       194
   macro avg       0.63      0.64      0.53       194
weighted avg       0.77      0.73      0.69       194



In [8]:
#Perform cross validation to find the optimal C value for SVM
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
svc = SVC()
C_range = list(np.arange(0.01,.1,.01))
parameters = {'kernel':['linear'], 'C':C_range}
  
grid = GridSearchCV(svc, parameters, cv=10, scoring='accuracy', verbose=1)
  
# fitting the model for grid search
grid_search=grid.fit(X_train[0:300], Y_train[0:300]) #use first 2000 training points for faster computation
# print optimal C value
print(grid_search.best_params_)
# print corresponding training accuracy
accuracy = grid_search.best_score_ *100
print("Accuracy for our training dataset with tuning is : {:.2f}%".format(accuracy) )
# print best estimator
print(grid.best_estimator_)

Fitting 10 folds for each of 9 candidates, totalling 90 fits
{'C': 0.01, 'kernel': 'linear'}
Accuracy for our training dataset with tuning is : 66.67%
SVC(C=0.01, kernel='linear')


In [9]:
#SVM using optimal C
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear', C=.01, decision_function_shape='ovr')
svclassifier.fit(X_train, Y_train)  

#Predicting the Test set results
Y_pred = svclassifier.predict(X_test)

#Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
print(cm.trace()/len(Y_test))
print(cm)

from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred))

0.6804123711340206
[[57 11  4  1]
 [18 60  2  4]
 [ 0  1  2  1]
 [11  6  3 13]]
              precision    recall  f1-score   support

           1       0.66      0.78      0.72        73
           2       0.77      0.71      0.74        84
           3       0.18      0.50      0.27         4
           4       0.68      0.39      0.50        33

    accuracy                           0.68       194
   macro avg       0.57      0.60      0.56       194
weighted avg       0.70      0.68      0.68       194



In [10]:
#Perform cross validation to find the optimal C value for kernel SVM
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
svc = SVC()
C_range = list(np.arange(1,5,.5))
parameters = {'kernel':['rbf'], 'C':C_range}
  
grid = GridSearchCV(svc, parameters, cv=10, scoring='accuracy', verbose=1)
  
# fitting the model for grid search
grid_search=grid.fit(X_train[0:300], Y_train[0:300]) #use first 2000 training points for faster computation
# print optimal C value
print(grid_search.best_params_)
# print corresponding training accuracy
accuracy = grid_search.best_score_ *100
print("Accuracy for our training dataset with tuning is : {:.2f}%".format(accuracy) )
# print best estimator
print(grid.best_estimator_)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
{'C': 4.0, 'kernel': 'rbf'}
Accuracy for our training dataset with tuning is : 78.67%
SVC(C=4.0)


In [11]:
#Kernel SVM; use gaussian rbf kernel and optimal C value

svclassifier = SVC(kernel='rbf', C=4)
svclassifier.fit(X_train, Y_train)  # use only first 5000 training data points

#Predicting the Test set results
Y_pred = svclassifier.predict(X_test)

#Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
print(cm.trace()/len(Y_test))
print(cm)

from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred))

0.7989690721649485
[[58  7  5  3]
 [ 9 74  0  1]
 [ 1  0  3  0]
 [ 5  8  0 20]]
              precision    recall  f1-score   support

           1       0.79      0.79      0.79        73
           2       0.83      0.88      0.86        84
           3       0.38      0.75      0.50         4
           4       0.83      0.61      0.70        33

    accuracy                           0.80       194
   macro avg       0.71      0.76      0.71       194
weighted avg       0.81      0.80      0.80       194



In [12]:
#Neural networks
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='adam', alpha=1e-5,
                     hidden_layer_sizes=(20,10), max_iter=1000,random_state=1)

clf.fit(X_train, Y_train)
#Predicting the Test set results
Y_pred = clf.predict(X_test)

#Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
print(cm.trace()/len(Y_test))
print(cm)

from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred))

0.7061855670103093
[[53 15  4  1]
 [11 68  0  5]
 [ 1  0  2  1]
 [ 9  9  1 14]]
              precision    recall  f1-score   support

           1       0.72      0.73      0.72        73
           2       0.74      0.81      0.77        84
           3       0.29      0.50      0.36         4
           4       0.67      0.42      0.52        33

    accuracy                           0.71       194
   macro avg       0.60      0.61      0.59       194
weighted avg       0.71      0.71      0.70       194

