# Disclaimer
This is supposed to be a baseline notebook for those starting with image classification. Please don't expect SoTA results :D

# Creating Training DF

We will create a dataframe containing the training image ID, label, and path

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training table and preview its first rows.
train_csv_path = r'/kaggle/input/cassava-leaf-disease-classification/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
import json 

# Read the label-number -> disease-name mapping shipped with the dataset.
label_map_path = r'/kaggle/input/cassava-leaf-disease-classification/label_num_to_disease_map.json'
with open(label_map_path) as map_file:
    raw_label_map = json.load(map_file)

# JSON object keys are always strings; convert them to ints so the dict can be
# used to map the numeric `label` column of df_train.
label_map = {}
for label_key, disease_name in raw_label_map.items():
    label_map[int(label_key)] = disease_name
label_map

In [None]:
# Add a human-readable `disease` column by looking each numeric label up in label_map.
disease_names = df_train['label'].map(label_map)
df_train = df_train.assign(disease=disease_names)
df_train

In [None]:
import glob
# Collect every training JPEG, in sorted order, and report how many were found.
train_image_pattern = r'/kaggle/input/cassava-leaf-disease-classification/train_images/*.jpg'
train_path = sorted(glob.glob(train_image_pattern))
print(len(train_path))

In [None]:
# Attach each image's file path by matching on file name instead of by position.
# A positional assignment only works if the CSV rows happen to be in the same
# order as the sorted directory listing; any mismatch would silently pair
# labels with the wrong images.
# NOTE(review): assumes `image_id` holds the file name including its extension
# (the submission cell builds image_id that way) — the check below fails loudly if not.
path_by_name = {os.path.basename(p): p for p in train_path}
df_train['path'] = df_train['image_id'].map(path_by_name)

# Stop here if any labelled image has no file on disk.
missing_ids = df_train.loc[df_train['path'].isna(), 'image_id']
if not missing_ids.empty:
    raise FileNotFoundError(
        f'{len(missing_ids)} image_id(s) in train.csv have no matching file in train_images, '
        f'e.g. {missing_ids.iloc[0]}'
    )
df_train

In [None]:
# Count the images in each disease class and draw the counts as a bar chart.
images_per_disease = df_train.groupby('disease').size()
images_per_disease.plot.bar()

This is a very imbalanced dataset

# First look at the Images

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

In [None]:
# Open the image stored in the row labelled 0 and display it.
first_image_path = df_train.loc[0, 'path']
img = Image.open(first_image_path)
img

In [None]:
# PIL reports an image's size as a (width, height) tuple.
width, height = img.size
(width, height)

Each image is of size 800x600 pixels

# Taking a subsample of the set to prevent RAM overflow

In [None]:
from tqdm.notebook import tqdm #to monitor progress
# Seed NumPy's global random number generator so the random steps that follow
# give the same results on every run.
SEED = 42
np.random.seed(SEED)

In [None]:
# Work on a small, class-balanced subsample so the image arrays fit in RAM.
SAMPLES_PER_CLASS = 100

# Draw the same number of rows from every class. The per-class frames are
# collected in a list and concatenated once: DataFrame.append no longer exists
# in pandas 2.x, and growing a frame inside a loop copies it on every iteration.
per_class_samples = []
for class_label in tqdm(df_train.label.unique()):
    class_rows = df_train[df_train.label == class_label]
    # Never ask for more rows than the class has.
    n_rows = min(SAMPLES_PER_CLASS, len(class_rows))
    per_class_samples.append(class_rows.sample(n_rows))

df_samp = pd.concat(per_class_samples, ignore_index=True)

# Show the per-class row counts of the subsample.
df_samp.groupby(by='disease').count()

In [None]:
from sklearn.utils import shuffle

# Randomise the row order of the subsample, then renumber the rows from 0.
shuffled_rows = shuffle(df_samp)
df_samp = shuffled_rows.reset_index(drop=True)

# Train-Test split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the remaining columns.
y = df_samp['label']
X = df_samp.drop(columns='label')

# Hold out 30% of the rows for validation, stratified on the label.
split_parts = train_test_split(X, y, test_size=0.3, stratify=y)
X_train, X_valid, y_train, y_valid = split_parts

# Report the size of each part: frame shape first, then number of targets.
for features, targets in ((X_train, y_train), (X_valid, y_valid)):
    print(features.shape)
    print(len(targets))

# Reducing image size and saving images as arrays

In [None]:
# Target size passed to PIL's resize, which expects (width, height).
target_width, target_height = 200, 150
compressed_size = (target_width, target_height)

In [None]:
def load_resized_images(paths, size):
    """Load the images at `paths`, downscale each to `size` and stack them.

    Parameters
    ----------
    paths : iterable of str
        Image file paths, processed in order.
    size : tuple of int
        Target (width, height) passed to PIL's ``resize``.

    Returns
    -------
    numpy.ndarray
        One entry per path; each entry is the resized image as an RGB pixel array.
    """
    images = []
    for path in tqdm(paths):
        # The context manager closes the file handle as soon as the pixels are read.
        with Image.open(path) as im:
            # convert('RGB') guarantees three channels, so every image has the same shape.
            # Image.LANCZOS is the same filter the removed Image.ANTIALIAS alias pointed to.
            resized = im.convert('RGB').resize(size, Image.LANCZOS)
            images.append(np.asarray(resized))
    return np.array(images)


train_array = load_resized_images(X_train.path, compressed_size)
valid_array = load_resized_images(X_valid.path, compressed_size)

print(train_array.shape)
print(valid_array.shape)

In [None]:
# Show the first five training images side by side, titled with the disease
# name and labelled with the image id.
plt.figure(figsize=(20,12))

train_preview = train_array[:5]
for idx, picture in tqdm(enumerate(train_preview)):
    ax = plt.subplot(1, 5, idx + 1)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.grid(False)
    ax.imshow(picture)
    ax.set_title(X_train.disease.iloc[idx])
    ax.set_xlabel(X_train.image_id.iloc[idx])

plt.show()

In [None]:
# Show the first five validation images side by side, titled with the disease
# name and labelled with the image id.
plt.figure(figsize=(20,12))

valid_preview = valid_array[:5]
for idx, picture in tqdm(enumerate(valid_preview)):
    ax = plt.subplot(1, 5, idx + 1)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.grid(False)
    ax.imshow(picture)
    ax.set_title(X_valid.disease.iloc[idx])
    ax.set_xlabel(X_valid.image_id.iloc[idx])

plt.show()

In [None]:
# Summarise the training array: number of images, overall shape, per-image shape.
n_train_images = len(train_array)
train_shape = train_array.shape
train_image_shape = train_array[0].shape

print(f'Length of the training array is {n_train_images}')
print(f'Shape of the training array is {train_shape}')
print(f'Shape of each training image array is {train_image_shape}')

The training input array currently has one row per training image (350 rows with the sampling above: 100 images per class × 5 classes, 70% of which are kept for training), with each row containing the image array.

Since a LogReg model accepts each input as a row array, we need to reshape our image arrays to a single row having `150`x`200`x`3` values.

The shape of the input array will then become `(350, 150*200*3)`

In [None]:
# Flatten every image into a single feature row: (n_images, height*width*channels).
# reshape(..., -1) infers the row length and returns a new array object instead of
# changing the shape in place, so re-running this cell is harmless (the in-place
# resize indexed shape[3], which no longer exists after the first run).
train_array = train_array.reshape(len(train_array), -1)

print(f'New length of the training array is {len(train_array)}')
print(f'New shape of the training array is {train_array.shape}')
print(f'New shape of each training image array is {train_array[0].shape}')

In [None]:
# Flatten every image into a single feature row: (n_images, height*width*channels).
# reshape(..., -1) infers the row length and returns a new array object instead of
# changing the shape in place, so re-running this cell is harmless (the in-place
# resize indexed shape[3], which no longer exists after the first run).
valid_array = valid_array.reshape(len(valid_array), -1)

print(f'New length of the validation array is {len(valid_array)}')
print(f'New shape of the validation array is {valid_array.shape}')
print(f'New shape of each validation image array is {valid_array[0].shape}')

# Model Instantiation and Training

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression settings: balanced class weights, verbose solver output,
# and n_jobs=-1 for parallelism.
lr_settings = {
    'class_weight': 'balanced',
    'verbose': 5,
    'n_jobs': -1,
}
lr = LogisticRegression(**lr_settings)

In [None]:
# Train the classifier on the flattened training images and their labels.
lr.fit(X=train_array, y=y_train)

# Prediction and Evaluation

In [None]:
# Predict a label for every validation image and display the predictions.
preds = lr.predict(X=valid_array)
preds

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, f1_score
import seaborn as sns

# Use one label list for both the matrix and the axis ticks. It is the sorted
# union of true and predicted labels, so tick labels always line up with the
# matrix rows/columns even if a class shows up only in the predictions.
class_labels = np.union1d(y_valid, preds).tolist()
conf_matrix = confusion_matrix(y_valid, preds, labels=class_labels)

sns.heatmap(conf_matrix, annot=True, square=True, fmt='g',
            xticklabels=class_labels, yticklabels=class_labels, cbar=False)

plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Per-class precision / recall / F1 on the validation set.
print(classification_report(y_valid, preds))

An accuracy of 54% is definitely not bad for a simple logreg model trained on just 4.3% of the training set, and with the training images downscaled by a factor of 4 along each side, i.e. 16× fewer pixels (random guessing would give an accuracy of 20%).

Let's see how we can improve this accuracy in further versions of this notebook.
Would love to have your suggestions too!

# Submission

In [None]:
# Collect every test JPEG in sorted order, report the count and list the paths.
test_image_pattern = r'/kaggle/input/cassava-leaf-disease-classification/test_images/*.jpg'
test_path = sorted(glob.glob(test_image_pattern))
print(len(test_path))
test_path

In [None]:
# Display the first test image.
first_test_path = test_path[0]
Image.open(first_test_path)

In [None]:
# Load and downscale the test images the same way as the training images.
test_images = []
for path in tqdm(test_path):
    # The context manager closes the file handle as soon as the pixels are read.
    with Image.open(path) as im:
        # convert('RGB') guarantees three channels; Image.LANCZOS is the same
        # filter the removed Image.ANTIALIAS alias pointed to.
        resized = im.convert('RGB').resize(compressed_size, Image.LANCZOS)
        test_images.append(np.asarray(resized))
test_array = np.array(test_images)

# Flatten each image into one feature row. reshape(..., -1) infers the row
# length and, unlike the in-place resize, does not depend on the array still
# being 4-dimensional.
test_array = test_array.reshape(len(test_array), -1)

In [None]:
# Predict a label for every test image and display the predictions.
submission = lr.predict(X=test_array)
submission

In [None]:
# The image id is the file name, i.e. everything after the last '/' in the path.
test_image_ids = [path.rsplit('/', 1)[-1] for path in test_path]

# Pair each image id with its predicted label.
submission_columns = {'image_id': test_image_ids, 'label': submission}
submission_df = pd.DataFrame(submission_columns)
submission_df

In [None]:
# Write the predictions, without the index column, to the submission file.
submission_file = '/kaggle/working/submission.csv'
submission_df.to_csv(submission_file, index=False)