# Severstal: Steel Defect detection - Competition Background
Competitors are asked to use machine/deep learning algorithms to detect faulty areas in a set of steel images. There are 4 classes of defects in the images, and each piece of steel may or may not have defects on its surface. A faulty piece may contain one or more defects. 

The labels (classes) are given in _train.csv_. Note that there are 4 rows per image, one for each defect class (classes 1 to 4). If a defect of a given class is present, the corresponding **EncodedPixels** column has a non-null value. The actual image files are also provided.  

# 1. Import and explore data

In [None]:
# import required libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing import image
from pathlib import Path
import os
import glob  # used for loading multiple files

In [None]:
# get the list of subdirectories to find out where the data files are 
print(os.listdir("../input"))

In [None]:
# load the train labels and sample submission files into dataframes
# (both paths are relative to the notebook's working directory)
raw_train = pd.read_csv('../input/severstal-steel-defect-detection/train.csv')
sample_submission = pd.read_csv('../input/severstal-steel-defect-detection/sample_submission.csv')

In [None]:
# create a copy of the train dataframe into the labels dataframe
labels = raw_train.copy()
labels.tail(4)

In [None]:
# split the ImageId_ClassId column (values of the form "<name>.jpg_<digit>")
# into image_id and class_id
import re

# The dot before "jpg" is escaped so that it only matches a literal "." (an
# unescaped "." matches any character). expand=False makes str.extract return
# a Series rather than a one-column DataFrame.
labels['class_id'] = labels['ImageId_ClassId'].str.extract(r'\.jpg_(\d)', expand=False)  # the digit right after the underscore
labels['class_id'] = labels['class_id'].astype(int)                                      # change the class_id data type to numeric
labels['image_id'] = labels['ImageId_ClassId'].str.extract(r'(\w+\.jpg)_\d', expand=False)  # the file name before the underscore

# keep only the columns needed from here on; .copy() gives an independent
# dataframe so that adding columns later does not trigger SettingWithCopyWarning
labels = labels[['image_id','EncodedPixels','class_id']].copy()
labels.tail(4)

In [None]:
# create a flag for the faulty images - a defect is present where EncodedPixels is not NaN
# (vectorized null check instead of a row-by-row apply; 1 = defect present, 0 = none)
labels['has_defects'] = labels['EncodedPixels'].notnull().astype(int)
labels.tail(4)

In [None]:
# number of defects per image = sum of the has_defects flags over that image's rows

# as_index=False keeps image_id as a regular column, ready to be joined on
defects = (
    labels.groupby("image_id", as_index=False)['has_defects']
    .sum()
    .rename(columns={"has_defects": "no_of_defects"})
)

# attach the per-image count to every row of labels (left join on image_id)
labels = labels.merge(defects, on='image_id', how='left')
labels.tail(4)

**Caution:**

Do not sum the no_of_defects column to calculate the total number of defects per image_id: it is a single scalar per image_id, but because there are several records per image_id the same value is repeated on each of them.

In [None]:
# plot the number of images per class (only rows that actually have a defect mask)
# The series is passed as `x=`: recent seaborn releases treat the first
# positional argument as `data`, so a positional series is no longer read as x.
sns.countplot(x=labels.class_id[labels.EncodedPixels.notnull()])

In [None]:
labels.class_id[labels.EncodedPixels.notnull()].value_counts().sort_values() # 7,095

In [None]:
# one row per image: keep each (image_id, no_of_defects) pair only once
dedup_labels = labels.loc[:, ['image_id', 'no_of_defects']].drop_duplicates()
dedup_labels['no_of_defects'].value_counts().sort_values() # 6,666

In [None]:
# plot the number of images per number of defects
# (keyword `x=` because recent seaborn releases take `data` as the first positional argument)
sns.countplot(x=dedup_labels.no_of_defects)

# 2. Load the image files from the zip directory

In [None]:
# get a list of items in the /input/severstal-steel-defect-detection directory
print(os.listdir("../input/severstal-steel-defect-detection"))

In [None]:
# get a list of images in the /input/severstal-steel-defect-detection/train_images directory
# print(os.listdir("../input/severstal-steel-defect-detection/train_images"))

In [None]:
# set the variables for the paths of train and test image directories
train_img_path = '../input/severstal-steel-defect-detection/train_images/'
test_img_path = '../input/severstal-steel-defect-detection/test_images/'

In [None]:
 # number of image files in the train_img_path directory
# each True from isfile counts as 1, so the sum is the number of regular files
print(sum(os.path.isfile(os.path.join(train_img_path, entry)) for entry in os.listdir(train_img_path)))

## 2.1. Import and display sample image from the train set

In [None]:
import cv2

# `img` keeps the array exactly as OpenCV loads it (channels in BGR order)
img = cv2.imread(train_img_path+'6dcbc2c43.jpg')
# matplotlib expects RGB, so convert for display only
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.show()

In [None]:
img.shape

In [None]:
# function to plot n images that contain a defect of class class_id
# adopted from https://www.kaggle.com/bonhart/simple-cnn-on-pytorch-for-beginers
def metal_plot(class_id,n):
    """Show up to ``n`` training images that have a defect of class ``class_id``.

    Reads the module-level ``labels`` dataframe and ``train_img_path``. The
    images are taken from the last ``n`` matching rows of ``labels`` and drawn
    side by side in a single row of subplots.

    Args:
        class_id: defect class to select (compared against ``labels['class_id']``).
        n: number of subplot columns / maximum number of images to draw.
    """
    # squeeze=False always yields a 2-D array of Axes, so n == 1 works as well
    fig, ax = plt.subplots(1, n, figsize=(15, 30), squeeze=False)

    has_class = (labels['class_id'] == class_id) & (labels['has_defects'] == 1)
    sample_ids = labels.loc[has_class, 'image_id'][-n:]

    for axis, image_id in zip(ax[0], sample_ids):
        path = os.path.join(train_img_path, image_id)
        # OpenCV loads BGR; convert to the RGB order matplotlib expects
        axis.imshow(cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB))

In [None]:
# class 1 sample images
metal_plot(1,3)

In [None]:
# class 2 sample images
metal_plot(2,3)

In [None]:
# class 3 sample images
metal_plot(3,3)

In [None]:
# class 4 sample images
metal_plot(4,3)

In [None]:
# plot metals with different number of defects
def def_vs_no_def(no_of_defects,n):
    """Show up to ``n`` distinct training images that have ``no_of_defects`` defects.

    Reads the module-level ``labels`` dataframe and ``train_img_path``.

    Args:
        no_of_defects: value to match against ``labels['no_of_defects']``.
        n: number of subplot columns / maximum number of images to draw.
    """
    # squeeze=False always yields a 2-D array of Axes, so n == 1 works as well
    fig, ax = plt.subplots(1, n, figsize=(15, 30), squeeze=False)

    matching_ids = labels.loc[labels['no_of_defects'] == no_of_defects, 'image_id']
    # labels holds several rows per image, so without de-duplication the last
    # n rows would mostly point at the same image
    sample_ids = matching_ids.drop_duplicates()[-n:]

    for axis, image_id in zip(ax[0], sample_ids):
        path = os.path.join(train_img_path, image_id)
        # OpenCV loads BGR; convert to the RGB order matplotlib expects
        axis.imshow(cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB))

In [None]:
# no defects
def_vs_no_def(0,3)

In [None]:
# with 1 defect
def_vs_no_def(1,3)

In [None]:
# with 2 defects
def_vs_no_def(2,3)

In [None]:
# with 3 defects
def_vs_no_def(3,3)

## 2.2. Load the training images

In [None]:
# os.listdir("../input/severstal-steel-defect-detection/train_images")

In [None]:
import glob

folders = glob.glob(train_img_path)

# collect every .jpg path under the matched folder(s); sorted() makes the
# order reproducible because glob returns matches in arbitrary order
imagenames_list = sorted(
    img_file for folder in folders for img_file in glob.glob(folder+'*.jpg')
)

# read every image as a single-channel grayscale array
# (the loop variable is deliberately not called `image`, which would shadow
# the keras.preprocessing `image` module imported at the top of the notebook)
read_images = [cv2.imread(img_file, cv2.IMREAD_GRAYSCALE) for img_file in imagenames_list]


In [None]:
# imagenames_list

In [None]:
# the images were read as grayscale, so use a gray colormap instead of the default false-colour one
plt.imshow(read_images[160], cmap='gray')

In [None]:
read_images[160].shape

In [None]:
read_images[160]

## 2.3. Alternative approach for loading images

In [None]:
from tensorflow.python.keras.preprocessing.image import load_img,img_to_array
from tensorflow.python.keras.applications.resnet50 import preprocess_input

In [None]:
image_size = 256
def read_and_prep_images(img_paths,img_height=image_size,img_width=image_size):
    """Load the images at ``img_paths`` and run them through ``preprocess_input``.

    Args:
        img_paths: iterable of image file paths.
        img_height: height passed to ``load_img`` as part of ``target_size``.
        img_width: width passed to ``load_img`` as part of ``target_size``.

    Returns:
        The result of ``preprocess_input`` applied to the stacked image arrays.
    """
    target = (img_height, img_width)
    loaded = [load_img(path, target_size=target) for path in img_paths]
    # stack the per-image arrays into one batch before preprocessing
    batch = np.array(list(map(img_to_array, loaded)))
    return preprocess_input(batch)

In [None]:
from os.path import join
# the sample images are read from the training image directory
image_dir = train_img_path
# full paths for a hard-coded list of image file names, each joined onto image_dir
img_paths = [join(image_dir, filename) for filename in 
                           ['7bb25cc94.jpg', '2eb516639.jpg', '390e9ea29.jpg', 'fc20db1e0.jpg', '5238bc100.jpg', 'cff9230ae.jpg'
                            , '8088f6b20.jpg', '5b3685c8c.jpg', 'd7939330f.jpg', 'cd4a71d17.jpg', '180478e66.jpg', '20b5096a5.jpg'
                            , 'f3a5aa94c.jpg', 'ea56440ac.jpg', 'c487b1ce1.jpg', 'a6f761c3f.jpg', '3f400c81f.jpg', 'ed1c6be8d.jpg'
                            , '7025a90c1.jpg', '58a9d89c8.jpg', '74bbe241c.jpg', '8ad6b411a.jpg', '0181695f9.jpg', 'a2a8ba02d.jpg'
                            , 'f4296a45d.jpg', '89eec1aae.jpg', 'cc7920c72.jpg', '519e11f0b.jpg', 'f81b617ec.jpg', 'e90bfe49b.jpg'
                            , 'df917bee3.jpg', '2acd6db1e.jpg', '5172a46ee.jpg', '0ddbc9fb5.jpg', 'ac1a64a23.jpg', 'dc59b5377.jpg'
                            , 'bc67d17de.jpg', '22ee0a368.jpg', 'ead245f1f.jpg', 'fdc83849e.jpg', '0ba2d403f.jpg', '49a4b51fa.jpg'
                            , '165a55d5c.jpg', '661c42b97.jpg', '6dbd47d4f.jpg', '1f45f2491.jpg', '4bb7b1660.jpg', 'b3ae9675d.jpg'
                            , '56ba7c882.jpg', 'df4d01acb.jpg', '2e12e1c6a.jpg', 'd3ef4bac1.jpg', '83bd40de8.jpg', '19fd40586.jpg'
                            , 'dae3c563a.jpg', '5663a9e34.jpg', '04c3aade7.jpg', '9a8475c90.jpg', '1ecfcc78b.jpg', '28a1ea8c2.jpg'
                            , 'babdf889d.jpg', '74211b046.jpg', '1065b4d64.jpg', 'cc3a294d4.jpg', 'ae41ecb3f.jpg', 'f375d814f.jpg'
                            , '4d9973900.jpg', '47f5c8e07.jpg', 'cdf44eab9.jpg', '2d18eccdd.jpg', '00c6060db.jpg', 'd91c205e6.jpg'
                            , '3a0e5cad8.jpg', 'b3cea5fb4.jpg', '541707319.jpg', 'ea17260aa.jpg', 'd808c5310.jpg', '3e1ed281b.jpg'
                            , '195e36565.jpg', 'a88757126.jpg', '0e15479f7.jpg', '1f5af3611.jpg', 'f32454873.jpg', '974171041.jpg'
                            , '5432fd9e1.jpg', 'df79cce6c.jpg', '9c2dc4bfc.jpg', 'ce7b7ac0b.jpg', 'f71f9c14b.jpg', '938a15be6.jpg'
                            , '05c05ea43.jpg', '554cbd6bf.jpg', '8fb078599.jpg', 'cd38fd93a.jpg', 'e63ad114c.jpg', 'bae58dc36.jpg'
                           ]]

In [None]:
train_images_df = read_and_prep_images(img_paths,img_height=image_size,img_width=image_size)
train_images_df[1]

## 2.4. Mapping the image files to the image_ids in the training set

In [None]:
raw_train.head(4)

In [None]:
# from keras.preprocessing.image import ImageDataGenerator

# train_datagen = ImageDataGenerator(
#     rescale=1./255,
#     validation_split=0.15,
#     shear_range=0.2,
#     zoom_range=0.2,
#     horizontal_flip=True
# )

# train_generator = train_datagen.flow_from_dataframe(
#     raw_train,
#     directory=train_img_path,
#     subset='training',
#     x_col='image_id',
#     y_col='class_id',
#     target_size=(256, 1600),
#     class_mode='sparse'
#     )