<a href="https://colab.research.google.com/github/prachimudholkar04/catandog_ML/blob/main/final_submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries
import os
import pandas as pd
import numpy as np

import cv2
from skimage import feature
from skimage.feature import hog

from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA

# libraries for data plotting
import seaborn as sns
import matplotlib.pyplot as plt 

# library for evaluation
from sklearn.metrics import accuracy_score,confusion_matrix

# libraries for ML algorithms
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from random import randint
from random import seed

RANDOM_SEED = 100

In [None]:
# mount your Google Drive
# Colab-only step: the data paths used by the later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE(review): every os.path.join(data_folder, ...) call in this notebook passes an
# absolute path as the second argument, and os.path.join drops all earlier components
# when it meets an absolute one, so this value never ends up in a path that is read.
data_folder = '/content/drive/MyDrive/Applied AI/Image_processing/zipped/data'

In [None]:
# load training data
# NOTE(review): the second argument is absolute, so os.path.join returns it unchanged
# and data_folder is ignored; the CSV is read from .../unzipped/train.csv.
df_train = pd.read_csv(os.path.join(data_folder, '/content/drive/MyDrive/Applied AI/Image_processing/unzipped/train.csv'))

# summarise the details
print(f'Number of entries: {len(df_train)}')
df_train.head()

Number of entries: 10000


Unnamed: 0,id,label
0,1,cat
1,2,dog
2,3,cat
3,4,cat
4,5,cat


In [None]:

# load testing data
# NOTE(review): the second argument is absolute, so os.path.join returns it unchanged
# and data_folder is ignored; the CSV is read from .../unzipped/test.csv.
df_test = pd.read_csv(os.path.join(data_folder, '/content/drive/MyDrive/Applied AI/Image_processing/unzipped/test.csv'))

# summarise the details
print(f'Number of entries: {len(df_test)}')
df_test.head()
     

Number of entries: 1000


Unnamed: 0,id
0,1
1,2
2,3
3,4
4,5


In [None]:

# parameters
# ids - list of image ids
# folder_path - path to image folder
# dim - dimensions to resize images
def load_images(ids, folder_path, dim):
  """Load `<id>.jpg` for every id and return the images resized to `dim`.

  Parameters
  ----------
  ids : iterable
      Image ids; each one is formatted into the file name `<id>.jpg`.
  folder_path : str
      Folder that contains the image files.
  dim : tuple
      Target size as (width, height), the order `cv2.resize` expects.

  Returns
  -------
  list
      One image per id, in the same order as `ids`.

  Raises
  ------
  OSError
      If `cv2.imread` could not read an image. It returns None for a missing
      or undecodable file instead of raising, so the failure is surfaced here
      with the offending path.
  """
  images = []
  for image_id in ids:
    image_path = os.path.join(folder_path, f'{image_id}.jpg')
    img = cv2.imread(image_path)
    if img is None:
      raise OSError(f'Could not read image: {image_path}')

    # img.shape starts with (height, width) while dim is (width, height),
    # hence the crossed indices; only resize when the size actually differs.
    if img.shape[0] != dim[1] or img.shape[1] != dim[0]:
      img = cv2.resize(img, dim)
    images.append(img)
  return images

In [None]:
# every image is resized to this size by load_images
base_dim = (200, 200)

# load train images
# NOTE(review): the folder argument is absolute, so os.path.join ignores data_folder here.
train_image_folder = os.path.join(data_folder, '/content/drive/MyDrive/Applied AI/Image_processing/unzipped/train_images')
train_images = load_images(df_train['id'],train_image_folder, base_dim)
print(f'Number of training images loaded: {len(train_images)}')

# load test images
# NOTE(review): same as above, data_folder is ignored because the second argument is absolute.
test_image_folder = os.path.join(data_folder, '/content/drive/MyDrive/Applied AI/Image_processing/unzipped/test_images')
test_images = load_images(df_test['id'],test_image_folder, base_dim)
print(f'Number of testing images loaded: {len(test_images)}')
     

Number of training images loaded: 10000
Number of testing images loaded: 1000


In [None]:

# method to plot confusion matrix
def plot_confusion_matrix(matrix):
    """Render a 2x2 confusion matrix with matplotlib and show it.

    The current figure is cleared first. Rows are labelled 'Actual', columns
    'Predicted', both with tick labels '0' and '1'. Each cell is annotated as
    TN / FP (first row) and FN / TP (second row) followed by its value.

    matrix must be indexable as matrix[row][col] for row, col in {0, 1}.
    """
    class_labels = ['0', '1']
    cell_names = [['TN', 'FP'], ['FN', 'TP']]
    tick_positions = np.arange(len(class_labels))

    plt.clf()
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Set2_r)
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.xticks(tick_positions, class_labels)
    plt.yticks(tick_positions, class_labels)

    # plt.text takes (x, y), i.e. (column, row)
    for row, row_names in enumerate(cell_names):
        for col, cell_name in enumerate(row_names):
            plt.text(col, row, cell_name + " = " + str(matrix[row][col]))
    plt.show()

# method to calculate evaluation results
def evaluate(actuals, predictions):
  """Return the accuracy of `predictions` measured against `actuals`.

  Only the accuracy score is computed and returned; no confusion matrix is
  produced by this function.
  """
  return accuracy_score(actuals, predictions)

In [None]:
def build_rf_model(X_train, X_val, y_train, y_val):
  """Fit a 50-tree random forest classifier on the training split and return it.

  Parameters
  ----------
  X_train, y_train
      Training features and labels passed to `fit`.
  X_val, y_val
      Accepted so existing callers keep working; not used by this function.

  Returns
  -------
  RandomForestClassifier
      The fitted classifier.
  """
  # "sqrt" is what the "auto" alias meant for classifiers; scikit-learn deprecated
  # "auto" in 1.1 and rejects it from 1.3 on, so spell the setting out explicitly.
  clf = RandomForestClassifier(n_estimators=50, max_features="sqrt", random_state=RANDOM_SEED)
  clf.fit(X_train, y_train)
  return clf

In [None]:
# Disabled experiment: feature extractor for model "m3" (flattened Canny edge map of the
# grayscale image). The whole cell is a single string literal, so the function is never defined.
'''
# method to get image features
def get_features_m3(images):
  features_list = []
  for img in images:
    # image preprocessing
    img_grayscaled = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # feature extraction
    edges_canny = cv2.Canny(img_grayscaled, 100, 200) 
    features = edges_canny.flatten()

    features_list.append(features)

  features_list = np.array(features_list)
  return features_list
  '''

In [None]:
# Disabled: m3 feature extraction on the training images (string literal, nothing in it executes).
'''
# feature extraction
features_train = get_features_m3(train_images)
print(features_train.shape)
'''

In [None]:
# Disabled: train/validation split for the m3 features (string literal, nothing in it executes).
'''
# data split for train and validation
X_train, X_val, y_train, y_val = train_test_split(features_train, df_train['label'], test_size=0.3, random_state=RANDOM_SEED)
'''

In [None]:
# Disabled: training of model m3 (string literal, nothing in it executes).
'''
# train model
m3 = build_rf_model(X_train, X_val, y_train, y_val)
'''

In [None]:
# Disabled: validation of model m3 (string literal, nothing in it executes).
'''
# make predictions on validation data
y_pred = m3.predict(X_val)

# evaluate model
accuracy = evaluate(y_val, y_pred)
print(f'Accuracy: {accuracy}')
# plot_confusion_matrix(confusion_matrix)
'''

In [None]:
# Disabled: test-set prediction with model m3 (string literal, nothing in it executes).
# NOTE(review): inside the string the features_test extraction is commented out while
# m3.predict(features_test) is not, so re-enabling this cell as-is would depend on a
# features_test left over from some other cell.
'''
# feature extraction - test data
#features_test = get_features_m3(test_images)
#print(features_test.shape)

# get model predictions
predictions = m3.predict(features_test)
print(predictions)
#df_test['prediction3'] = predictions
print(predictions)
df_test.head()
'''

In [None]:
# method to get image features
def get_features_m4(images):
  """Compute a HOG feature vector for every image.

  Each image is converted from BGR to grayscale, resized to 64x128
  (width x height as passed to `cv2.resize`), and described with HOG using
  9 orientations, 8x8-pixel cells and 2x2-cell blocks.

  Parameters
  ----------
  images : iterable
      BGR images, as accepted by `cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)`.

  Returns
  -------
  numpy.ndarray
      The per-image feature vectors stacked with `np.array`, one row per image.
  """
  features_list = []
  for img in images:
    # image preprocessing
    img_grayscaled = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # always bring the image to the fixed HOG window size
    img_resized = cv2.resize(img_grayscaled, (64, 128))

    # feature extraction; the HOG visualisation image was never used, so it is
    # no longer requested and hog returns the feature vector only
    features = hog(img_resized, orientations=9, pixels_per_cell=(8, 8),
                   cells_per_block=(2, 2))

    features_list.append(features)

  return np.array(features_list)

In [None]:
# feature extraction
# HOG features for every training image; the printed shape is (n_images, n_features)
features_train = get_features_m4(train_images)
print(features_train.shape)

(10000, 3780)


In [None]:
# data split for train and validation
# 30% of the rows go to validation (test_size=0.3); random_state=RANDOM_SEED makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(features_train, df_train['label'], test_size=0.3, random_state=RANDOM_SEED)

In [None]:
# train model
# X_val / y_val are passed along, but build_rf_model only fits on X_train / y_train
m4 = build_rf_model(X_train, X_val, y_train, y_val)

In [None]:
# make predictions on validation data
y_pred = m4.predict(X_val)

# evaluate model
accuracy = evaluate(y_val, y_pred)
print(f'Accuracy: {accuracy}')
# NOTE(review): evaluate() returns only the accuracy, and `confusion_matrix` here would be the
# imported sklearn function rather than a computed matrix, so the call below cannot simply be
# un-commented; a matrix has to be computed first.
# plot_confusion_matrix(confusion_matrix)

Accuracy: 0.6883333333333334


In [None]:
# feature extraction - test data
features_test = get_features_m4(test_images)
print(features_test.shape)

# get model predictions (printed once)
predictions = m4.predict(features_test)
print(predictions)

# store the predicted label next to each test id and preview the result
df_test['prediction'] = predictions
df_test.head()

(1000, 3780)
['dog' 'cat' 'cat' 'cat' 'dog' 'dog' 'dog' 'dog' 'cat' 'cat' 'cat' 'dog'
 'dog' 'dog' 'cat' 'dog' 'cat' 'dog' 'dog' 'cat' 'dog' 'dog' 'dog' 'cat'
 'cat' 'cat' 'cat' 'cat' 'cat' 'dog' 'dog' 'dog' 'dog' 'dog' 'cat' 'cat'
 'dog' 'dog' 'dog' 'cat' 'dog' 'dog' 'dog' 'cat' 'cat' 'cat' 'cat' 'cat'
 'dog' 'dog' 'cat' 'dog' 'cat' 'dog' 'dog' 'cat' 'cat' 'cat' 'cat' 'cat'
 'dog' 'dog' 'cat' 'dog' 'dog' 'cat' 'cat' 'cat' 'cat' 'cat' 'cat' 'cat'
 'cat' 'cat' 'dog' 'cat' 'cat' 'dog' 'dog' 'dog' 'dog' 'dog' 'dog' 'cat'
 'cat' 'dog' 'cat' 'cat' 'dog' 'dog' 'cat' 'dog' 'cat' 'dog' 'dog' 'cat'
 'cat' 'cat' 'dog' 'cat' 'cat' 'cat' 'cat' 'dog' 'dog' 'dog' 'dog' 'dog'
 'dog' 'dog' 'dog' 'cat' 'dog' 'dog' 'cat' 'cat' 'dog' 'dog' 'dog' 'dog'
 'cat' 'cat' 'cat' 'dog' 'cat' 'dog' 'dog' 'cat' 'dog' 'cat' 'dog' 'dog'
 'cat' 'cat' 'dog' 'cat' 'cat' 'dog' 'cat' 'cat' 'dog' 'cat' 'cat' 'cat'
 'cat' 'cat' 'dog' 'cat' 'cat' 'dog' 'cat' 'dog' 'dog' 'dog' 'cat' 'dog'
 'cat' 'cat' 'dog' 'cat' 'cat' 'cat' '

Unnamed: 0,id,prediction
0,1,dog
1,2,cat
2,3,cat
3,4,cat
4,5,dog


In [None]:
# write df_test (ids with their predictions) to submission.json as JSON Lines:
# orient='records' with lines=True emits one JSON object per row, one row per line
df_test.to_json('submission.json',orient = 'records',lines = True)

In [None]:
# preview the first rows of the frame that was written to submission.json
df_test.head()

Unnamed: 0,id,prediction
0,1,dog
1,2,cat
2,3,cat
3,4,cat
4,5,dog
