# Final Project - PHY 256

### Serena Flint

### Data Preparation

##### Imports

In [1]:
import csv
import os 

##### Reading Data from .CSV Files

In [2]:
#CSV file name to be read in
file = "TrainData.csv"

fields = []  #never filled in this cell; the header row ends up in rows[0]
rows = []

#adds file data to rows[] (header row included)
#newline='' is the csv module's recommended way to open a file for csv.reader,
#so that line endings inside quoted fields are parsed correctly
with open(file, 'r', newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    rows.extend(csvreader)

    #this is the limiting number of training cases from known galaxy classifications
    print("Total number of rows:", csvreader.line_num)

print("\nArray Formatting:")
print(rows[0])

#no explicit close() needed: the with-block has already closed the file

Total number of rows: 61579

Array Formatting:
['GalaxyID', 'Class1.1', 'Class1.2', 'Class1.3', 'Class2.1', 'Class2.2', 'Class3.1', 'Class3.2', 'Class4.1', 'Class4.2', 'Class5.1', 'Class5.2', 'Class5.3', 'Class5.4', 'Class6.1', 'Class6.2', 'Class7.1', 'Class7.2', 'Class7.3', 'Class8.1', 'Class8.2', 'Class8.3', 'Class8.4', 'Class8.5', 'Class8.6', 'Class8.7', 'Class9.1', 'Class9.2', 'Class9.3', 'Class10.1', 'Class10.2', 'Class10.3', 'Class11.1', 'Class11.2', 'Class11.3', 'Class11.4', 'Class11.5', 'Class11.6']


##### Binning


In [3]:
#number of labelled galaxies (taken right after the header row) used for training
TRAIN_SIZE = 5000

#possible bins
bin1A = [] #elliptical
bin1B = [] #spiral (never filled here; spirals are split into bin2A / bin2B)
bin2A = [] #bar/no bar
bin2B = [] #bar/no bar

#rows[0] is the header, so data starts at index 1; slicing (instead of a fixed
#range) also avoids an IndexError if the file has fewer than TRAIN_SIZE rows
for row in rows[1:TRAIN_SIZE + 1]:
    galaxy_id = row[0]
    #csv.reader yields strings, so convert to float before comparing;
    #comparing the raw strings would be lexicographic, not numeric
    if float(row[1]) > float(row[2]):      #Class1.1 vs Class1.2
        bin1A.append(galaxy_id)
    elif float(row[4]) > float(row[5]):    #Class2.1 vs Class2.2
        bin2A.append(galaxy_id)
    else:
        bin2B.append(galaxy_id)

print("Number Elliptical:", len(bin1A))
print("Number Spiral A:", len(bin2A))
print("Number Spiral B:", len(bin2B))

print("\nTotal Training Size:", (len(bin1A)+len(bin2A)+len(bin2B)))

Number Elliptical: 2200
Number Spiral A: 517
Number Spiral B: 2283

Total Training Size: 5000


##### Moving Files to Correct Bins

In [4]:
#new path headers
path1A = "bin1A/"
path2A = "bin2A/"
path2B = "bin2B/"

#moving files from the dataset into their proper bins
#Each file is handled on its own so that one missing image does not stop the
#remaining images (and the remaining bins) from being moved.
missing_count = 0
for dest_path, galaxy_ids in ((path1A, bin1A), (path2A, bin2A), (path2B, bin2B)):
    #make sure the destination folder exists; otherwise os.rename raises
    #FileNotFoundError, which would be misreported as "already transferred"
    os.makedirs(dest_path, exist_ok=True)
    for galaxy_id in galaxy_ids:
        file_name = galaxy_id + ".jpg"
        try:
            os.rename("dataset/photos/" + file_name, dest_path + file_name)
        except FileNotFoundError:
            missing_count += 1

#report once, with the same message as before, if any source file was absent
if missing_count:
    print("File(s) Not Found")
    print("Images likely already transferred!")


File(s) Not Found
Images likely already transferred!


### Algorithm & Clustering

##### Imports

In [5]:
#machine learning libraries
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import to_categorical
from keras.preprocessing import image
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

#plotting and visualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from tqdm import tqdm

Using TensorFlow backend.


##### Loading Images

In [6]:
#every training image goes into one list, in the order bin1A, bin2A, bin2B
training_images = []

for folder, galaxy_ids in (("bin1A/", bin1A), ("bin2A/", bin2A), ("bin2B/", bin2B)):
    for galaxy_id in galaxy_ids:
        #target_size resizes images
        picture = image.load_img(folder + galaxy_id + ".jpg", target_size=(224,224,3), grayscale = False)
        pixels = image.img_to_array(picture)
        #divide the pixel values by 255 before storing
        training_images.append(pixels/255)

X = np.array(training_images)

##### Creating Label Array & Validation Set

Since the dataset with galaxy images has a more complicated solution set, I'm manually creating a solution set suited more to our needs.

In [7]:
#one label per image, built bin by bin: bin1A first, then bin2A, then bin2B
labels = ["1"] * len(bin1A)    #1A
labels += ["2"] * len(bin2A)   #2A
labels += ["2"] * len(bin2B)   #2B

y = to_categorical(labels)
print("Total Labels:", len(labels)) #Sanity Check

#hold out 20% of the samples; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

Total Labels: 5000


##### Defining the Model

In [9]:
#layers of the network, listed in the order they are stacked:
#two 3x3 convolutions, a 2x2 max-pool, dropout, then a dense head with a
#3-way softmax output
cnn_layers = [
    Conv2D(32, kernel_size=(3, 3),activation='relu',input_shape=(224,224,3)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax'),
]

model = Sequential()
for layer in cnn_layers:
    model.add(layer)

model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

###### Training

In [10]:
#fit the model for 3 epochs on the training split, passing the held-out
#split (X_test, y_test) as validation data
model.fit(X_train, y_train, epochs=3, validation_data=(X_test, y_test))

Instructions for updating:
Use tf.cast instead.
Train on 4000 samples, validate on 1000 samples
Epoch 1/3
Epoch 2/3


Epoch 3/3




<keras.callbacks.History at 0x18a31107fd0>

Since the above result is a little congested, here is a summary.

<strong>Epoch 1</strong>
<br>Runtime: 1091s
Starting Loss/Accuracy: 1.0854  /  50%
Ending Loss/Accuracy:   0.5640  /  71.30%

<strong>Epoch 2</strong>
<br>Runtime: 919s
Starting Loss/Accuracy: 0.5390  /  75%
Ending Loss/Accuracy:   0.5555  /  74.20%

<strong>Epoch 3</strong>
<br>Runtime: 907s
Starting Loss/Accuracy: 0.5067  /  81%
Ending Loss/Accuracy:   0.5456  /  72.80%

### Testing our Algorithm

Since our dataset is so large and we are only using the first 5000 images, I picked 10 random images from near the end of the folder to use as test data.

##### Reading Test Data

In [27]:
#creating a list of the images in the test directory
testing_images = []
testing_images = os.listdir("test")

testing_temp = testing_images
testing_IDs = []

#creating a list of the image IDs to look up real values later
for i in range(0, len(testing_temp)):
    testing_IDs.append(testing_temp[i].replace(".jpg",""))

##### Loading Images

In [30]:
#image arrays for the test set, in the same order as testing_images
to_load = []

for file_name in testing_images:
    picture = image.load_img("test/"+file_name, target_size=(224,224,3), grayscale = False)
    pixels = image.img_to_array(picture)
    #same division by 255 as applied to the training images
    to_load.append(pixels/255)

test = np.array(to_load)

##### Predictions!

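Remember, 1 corresponds to elliptical, and 2 and 3 correspond to spirals. Note that both spiral bins (2A and 2B) were given the label 2 during training, so the model can only predict 1 or 2, while the actual values below use 2 and 3 for the two spiral bins.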

In [32]:
#Sequential.predict_classes is not available in newer Keras/TensorFlow
#releases; taking the argmax over the softmax outputs yields the same class
#indices and works on both old and new versions
prediction = np.argmax(model.predict(test), axis=-1)

print(prediction)

[1 2 1 2 2 1 2 2 2 2]


##### Actual Values

In [40]:
val_arr = []

#index the CSV rows by GalaxyID once, instead of scanning every row for
#every test image (assumes GalaxyIDs are unique - TODO confirm)
rows_by_id = {row[0]: row for row in rows}

for galaxy_id in testing_IDs:
    row = rows_by_id.get(galaxy_id)
    if row is None:
        #no matching row: nothing is appended, so val_arr will be shorter
        #than the prediction array - say so instead of skipping silently
        print("No known classification for", galaxy_id)
        continue

    #csv.reader yields strings, so convert to float before comparing;
    #comparing the raw strings would be lexicographic, not numeric
    if float(row[1]) > float(row[2]):      #Class1.1 vs Class1.2
        val_arr.append(1)
    elif float(row[4]) > float(row[5]):    #Class2.1 vs Class2.2
        val_arr.append(2)
    else:
        val_arr.append(3)

print(val_arr)
    

[1, 1, 1, 3, 3, 1, 1, 1, 3, 3]


### Displaying Results