Predicting the location and type of defects found in steel manufacturing. 
Images are named with a unique ImageId. 
Each image may have no defects, a defect of a single class, or defects of multiple classes (ClassId = [1, 2, 3, 4]).

File descriptions:

* train_images/ - folder of training images
* test_images/ - folder of test images (you are segmenting and classifying these images)
* train.csv - training annotations which provide segments for defects (ClassId = [1, 2, 3, 4])
* sample_submission.csv - a sample submission file in the correct format; note that each ImageId has 4 rows, one for each of the 4 defect classes

#1 : IMPORTING DATA

In [None]:
import os
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt # Import matplotlib for data visualisation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import pandas_profiling as pp
import seaborn as sns



In [None]:
# Show the top-level entries of the input directory.
input_entries = os.listdir("../input")
print(input_entries)

# Walk the Kaggle input tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Load the training annotations and the sample submission template.
train = pd.read_csv(
    '../input/train.csv',
)
sample_df = pd.read_csv(
    "../input/sample_submission.csv",
)

# Preview the first rows of both tables; the notebook displays them as a tuple.
(train.head(), sample_df.head())

In [None]:
# Flag the rows whose EncodedPixels value is present (not null).
train['defect'] = train['EncodedPixels'].notnull()

# Split the combined identifier by position: the last character is taken as
# the class id, and the last two characters are dropped to get the image id.
combined_id = train['ImageId_ClassId']
train['ClassId'] = combined_id.str[-1:]
train['ImageId'] = combined_id.str[:-2]

# Keep only the derived columns plus the mask, in a fixed order.
kept_columns = ['ImageId','ClassId','defect','EncodedPixels']
train = train[kept_columns]
train.head()

In [None]:
# Pair each row's class id with its encoded pixels, then collect all the
# (ClassId, EncodedPixels) pairs of every image into one list per ImageId.
# The pairs are built column-wise with zip instead of a row-wise
# DataFrame.apply: this avoids constructing a Series per row and also works
# when the frame is empty.
train['ClassId_EncodedPixels'] = pd.Series(
    list(zip(train['ClassId'], train['EncodedPixels'])),
    index=train.index,
    dtype=object,
)
grouped_EncodedPixels = train.groupby('ImageId')['ClassId_EncodedPixels'].apply(list)
grouped_EncodedPixels

In [None]:
# Sum the boolean 'defect' flag per image to get the number of defect rows
# each image has, then expose it as a two-column frame (ImageId, NumDef).
defects_per_image = train.groupby(['ImageId'])['defect'].sum()
defects_per_image = defects_per_image.astype('uint8')
train_def = defects_per_image.to_frame(name='NumDef').reset_index()
train_def

## How many defect classes does each image have?

In [None]:
# Count how many images have each NumDef value and draw it as a bar chart.
labels, counts = np.unique(train_def.NumDef, return_counts=True)
ax = plt.gca()
ax.bar(labels, counts, align='center')
ax.set_xticks(labels)
ax.set_title('Num. Defects by Images')
# Annotate every bar with its count. The text is anchored at the bar's own
# x position (the label value) rather than at the loop index, so it stays
# aligned even when the labels are not exactly 0, 1, 2, ...
for label, count in zip(labels, counts):
    ax.text(x=float(label) - 0.1, y=count + 50, s=count, size=12, color='r')
plt.show()

# Check image data

In [None]:

from PIL import Image

# Tally how many training images there are for each img.size value.
train_size_dict = defaultdict(int)
train_path = Path("../input/train_images/")

for img_name in train_path.iterdir():
    # Open inside a context manager so the file handle is released right
    # away instead of staying open for every image in the folder.
    with Image.open(img_name) as img:
        train_size_dict[img.size] += 1

In [None]:
# Display the tally of training images, keyed by img.size.
train_size_dict

In [None]:
# Tally how many test images there are for each img.size value.
test_size_dict = defaultdict(int)
test_path = Path("../input/test_images/")

for img_name in test_path.iterdir():
    # Open inside a context manager so the file handle is released right
    # away instead of staying open for every image in the folder.
    with Image.open(img_name) as img:
        test_size_dict[img.size] += 1

In [None]:
# Display the tally of test images, keyed by img.size.
test_size_dict

In [None]:
# Build a baseline submission: reuse the rows of the sample file but replace
# every EncodedPixels value with an empty string while parsing.
blank_pixels = {'EncodedPixels': lambda e: ''}
submissionCSV = pd.read_csv('../input/sample_submission.csv', converters=blank_pixels)

# Show the first rows, then write the file without the index column.
preview = submissionCSV.head()
print(preview)
submissionCSV.to_csv('submission.csv', index=False)