In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import cv2
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the read-only input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Extracting data from zip files**

In [None]:
import zipfile
import glob

# Collect every zip archive shipped with the competition dataset and show what was found.
zip_pattern = '/kaggle/input/dogs-vs-cats/*.zip'
files_zip_ext = glob.glob(zip_pattern)
print(files_zip_ext)
def extract_data_from_zip(file_path, dest_dir="/kaggle/temp/data"):
    """Extract every member of a zip archive into ``dest_dir``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.zip`` archive to unpack.
    dest_dir : str or path-like, optional
        Directory the archive is extracted into. Defaults to
        ``"/kaggle/temp/data"``, the location the rest of the notebook reads from.

    Raises
    ------
    zipfile.BadZipFile
        If ``file_path`` is not a valid zip archive.
    """
    # The context manager guarantees the archive handle is closed even if extraction fails.
    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.extractall(dest_dir)

# Unpack each discovered archive (train and test) into the temp data directory.
for archive_path in files_zip_ext:
    extract_data_from_zip(archive_path)

In [None]:
# Count the extracted files in each split before reporting them.
train_count = len(os.listdir('/kaggle/temp/data/train'))
test_count = len(os.listdir('/kaggle/temp/data/test1'))
print(f"Train size: {train_count}")
print(f"Test size: {test_count}")

# **Generating labels and collecting file names**

In [None]:
# Make the extracted data directory the working directory so that the relative
# 'train' and 'test1' paths used by the following cells resolve.
os.chdir('/kaggle/temp/data')

In [None]:
def gen_path(dir_):
    """Return the names of the entries in ``dir_`` in sorted order.

    ``os.listdir`` returns entries in arbitrary order; sorting makes the listing
    (and therefore the seeded train/validation split built from it) reproducible.
    """
    return sorted(os.listdir(dir_))


train_x = gen_path('train')
test_x = gen_path('test1')

# Label encoding: 1 = dog, 0 = cat, read from the filename prefix before the first '.'.
# Labels are derived from train_x itself so they are guaranteed to be aligned with it.
train_y = [1 if name.split('.')[0] == 'dog' else 0 for name in train_x]

In [None]:
# Pair each training filename with its label in a single table.
frame_columns = {'filename': train_x, 'category': train_y}
df = pd.DataFrame(frame_columns)

print(df.tail())

# Class-balance check: distribution of the label column.
sns.displot(df, x='category')

# **Visualization of first image**

In [None]:
def visualize(img_path):
    """Read the image stored at ``img_path`` and draw it on a new 8x8 figure."""
    pixels = mpimg.imread(img_path)

    # A fresh figure per call keeps successive images from overwriting each other.
    plt.figure(figsize=(8, 8))
    plt.imshow(pixels)

In [None]:
# Show the first training image; df stores bare file names, so prefix the 'train' folder.
first_name = df['filename'].iloc[0]
visualize(f"train/{first_name}")

# **Preprocessing images**

In [None]:
# Convert to gray, resize, normalize and center a single image
def preprocess(img_path, size=28):
    """Load an image and turn it into a centered, flattened feature row.

    Parameters
    ----------
    img_path : str
        Path of the image file to load.
    size : int, optional
        Edge length in pixels of the square the image is resized to (default 28).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size * size)`` holding the grayscale pixel values
        scaled by 1/255 and shifted so that the image's own mean is zero.

    Raises
    ------
    OSError
        If the file cannot be read or decoded as an image.
    """
    gray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        # cv2.imread reports failure by returning None rather than raising; fail
        # here with the offending path instead of with an opaque resize error.
        raise OSError(f"Could not read image: {img_path!r}")

    # resize the image to a fixed square so every sample has the same length
    gray = cv2.resize(gray, (size, size))

    # normalize pixel values by 255
    normalized = gray.flatten() / 255.0

    # global centering: subtract this image's mean intensity
    centered = normalized - normalized.mean()

    return centered.reshape(1, size * size)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Hold out 20% of the labelled files for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42,
)

In [None]:
 # Seed the classifier so its internal data shuffling gives repeatable results between runs.
 sgd_clf = linear_model.SGDClassifier(random_state=42)

In [None]:
# Generate batches with preprocessing
# X -> list of file paths and y -> labels (1 - dog, 0 - cat)

def gen_batches(X, y=None, batch_size=200, image_size=784):
    """Yield preprocessed image batches, optionally paired with their labels.

    Parameters
    ----------
    X : iterable of str
        Image paths; each one is passed to ``preprocess``.
    y : sequence, optional
        Labels aligned with ``X``. Must support slicing (list or NumPy array).
        When omitted, only the feature batches are yielded.
    batch_size : int, optional
        Maximum number of images per batch (default 200).
    image_size : int, optional
        Number of features per preprocessed image (default 784).

    Yields
    ------
    numpy.ndarray or tuple
        ``data`` of shape ``(n, image_size)`` when ``y`` is None, otherwise
        ``(data, targets)``. ``n`` equals ``batch_size`` except for the final
        batch, which holds whatever samples remain.
    """
    def pack(rows, start, stop):
        data = np.asarray(rows).reshape(len(rows), image_size)
        # Compare against None instead of relying on truthiness: a NumPy label
        # array has no unambiguous truth value and would raise here.
        if y is not None:
            return data, y[start:stop]
        return data

    rows = []
    start = 0  # index in X (and y) of the first sample of the current batch
    for stop, path in enumerate(X, start=1):
        rows.append(preprocess(path))
        if len(rows) == batch_size:
            yield pack(rows, start, stop)
            rows, start = [], stop

    # Flush the remainder so that no samples are silently dropped when
    # the number of inputs is not a multiple of batch_size.
    if rows:
        yield pack(rows, start, stop)

In [None]:
# Training
# Work from inside the train folder because X_train holds bare file names.
os.chdir('/kaggle/temp/data/train/')

# partial_fit needs the complete set of classes, since a single batch may not contain all of them.
categories = np.unique(y_train)
for batch_x, batch_y in gen_batches(X_train, y_train):
    sgd_clf.partial_fit(batch_x, batch_y, classes=categories)

In [None]:
# Predict the image at position 205 of the training table
# (df holds bare file names, so this relies on the working directory being the train folder)
img_path = df['filename'].iloc[205]
visualize(img_path)

pred = sgd_clf.predict(preprocess(img_path))
# Labels were encoded as 1 = dog, 0 = cat when train_y was built.
print(f"It thinks it's a {'dog' if pred[0] == 1 else 'cat'}")

In [None]:
def display_metrics(y_test, y_predicted, target_names=('Cat', 'Dog')):
    """Print a labelled confusion matrix and a classification report.

    Parameters
    ----------
    y_test : array-like
        True integer labels.
    y_predicted : array-like
        Predicted integer labels, aligned with ``y_test``.
    target_names : sequence of str, optional
        Display name for each integer label, indexed by label value. The
        default matches this notebook's encoding (0 = cat, 1 = dog).
    """
    target_names = list(target_names)
    # Pin the label order explicitly so the names always line up with the
    # matrix rows/columns, even if a class is absent from the data.
    labels = list(range(len(target_names)))

    matrix = confusion_matrix(y_test, y_predicted, labels=labels)
    # Rows are the true classes, columns the predicted classes.
    outcome = pd.DataFrame(matrix, index=target_names, columns=target_names)

    print("CONFUSION MATRIX")
    print(outcome)

    report = classification_report(y_test, y_predicted, labels=labels,
                                   target_names=target_names)
    print("CLASSIFICATION REPORT")
    print(report)

In [None]:
# Metrics for SGDClassifier
# np.concatenate joins the per-batch prediction arrays into one flat array even
# when the batches differ in length, which np.asarray(...).flatten() cannot do.
y_predicted = np.concatenate([sgd_clf.predict(b) for b in gen_batches(X_test)])
display_metrics(y_test, y_predicted)

# **Try an LGBM model**

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Gradient-boosted decision trees with a binary objective.
lgb = LGBMClassifier(
    objective='binary',
    boosting='gbdt',
)

In [None]:
# Preprocess every training image and stack the rows into one (n_samples, 784) matrix.
X_train_full = np.asarray(
    [preprocess(path) for path in X_train]
).reshape(len(X_train), 784)

In [None]:
# Same preprocessing for the held-out images, giving an (n_samples, 784) matrix.
X_test_full = np.asarray(
    [preprocess(path) for path in X_test]
).reshape(len(X_test), 784)

In [None]:
# Fit LightGBM on the full in-memory training matrix (the SGD model above was
# instead trained incrementally, batch by batch).
lgb.fit(X_train_full, y_train)

In [None]:
# Metrics for LGBMClassifier on the held-out split.
# Note: this rebinds y_predicted, which previously held the SGD predictions.
y_predicted = lgb.predict(X_test_full)
display_metrics(y_test, y_predicted)

# Make submission

In [None]:
# Move into the test image folder: test_x holds bare file names, so preprocess()
# needs them to resolve relative to this directory.
os.chdir('/kaggle/temp/data/test1')

In [None]:
# Sanity check: display the current working directory after the chdir above.
os.getcwd()

In [None]:
# Preprocess every test image, keeping the rows in the same order as test_x.
test = np.asarray(
    [preprocess(path) for path in test_x]
).reshape(len(test_x), 784)

In [None]:
# Predicted labels for the test images, in the same order as test_x.
# Note: this rebinds `categories`, which earlier held the class list used for SGD training.
categories = lgb.predict(test)

In [None]:
# Change to the Kaggle working directory so that the files written by the
# following cells land in the location that is preserved as notebook output.
os.chdir('/kaggle/working/')

In [None]:
# Take each id from its file name rather than from the enumeration index:
# predictions follow the order of test_x, which comes from a directory listing
# and is not guaranteed to be in numeric id order.
# NOTE(review): assumes test files are named "<numeric id>.<ext>" — confirm against the dataset.
image_ids = [int(os.path.splitext(name)[0]) for name in test_x]
submission = (
    pd.DataFrame({'id': image_ids, 'label': categories})
    .sort_values('id')
    .reset_index(drop=True)
)
submission.to_csv('submission.csv', index=False)

# Dump model

In [None]:
import lightgbm
# Display the installed LightGBM version; presumably recorded so the dumped
# model can later be reloaded with a matching library version.
lightgbm.__version__

In [None]:
from joblib import dump, load

In [None]:
# Persist the fitted LightGBM model to a file in the current working directory.
dump(lgb, 'lgb_model.mdl')

In [None]:
# Reload the file written above to confirm it can be read back.
# joblib.load unpickles its input, so only use it on files you trust.
lgbm_load = load('lgb_model.mdl')