# Data Gathering and Cleaning


Data is collected from Google Images and manually from YouTube, Twitter and Instagram. After collecting images for both classes, `Swiggy` and `Other`, the images are passed to a Faster R-CNN network to crop the persons out of them. The cropped person images are then manually cross-verified.

In [None]:
#import required library
import os

import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split

from bs4 import BeautifulSoup
from urllib import request
from io import BytesIO

import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision import transforms

from PIL import Image
import cv2
from imutils.object_detection import non_max_suppression
from imutils import paths
import imutils



## Data Gathering
In this section we collect data from the Google Images web page. As Google Images can return a variety of images that may or may not be relevant, manual cross-verification needs to be done.

Data was collected from the following sources:

1. Google Images
2. Twitter (Videos with #Swiggydelivary, #Zomatodelivary, etc. and then manual crop in certain interval)
3. Instagram (Images and Videos with #Swiggydelivary, #Zomatodelivary, etc.)
4. Facebook (Images with #Swiggydelivary, #Zomatodelivary, etc.)

Images of delivery persons from other food delivery services, like Zomato and Dunzo, were collected to be used as negative samples for training.

### Download from Google Images

In [None]:
# Fetch the Google Images result page for the query "zomato delivery boy shirt"
# and parse it; a browser-like User-Agent header is sent with the request.
url = "https://www.google.com/search?q=zomato+delivery+boy+shirt&client=firefox-b-d&sxsrf=ALeKk02vh7T2tqrnRYi907Aa1igdlxg78Q:1597761930173&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiGi6KC_6TrAhX8yDgGHUenBb0Q_AUoAXoECA4QAw&biw=1366&bih=654"
header = {"User-Agent": "Mozilla/5.0"}
html = BeautifulSoup(
    request.urlopen(request.Request(url, headers=header)),
    "html.parser",
)

In [None]:
# Read every <img> tag of the parsed page and save the picture behind its
# 'data-src' link as "<position>.jpg" in the current working directory.
# Any tag that fails (missing attribute, download or decode error) is reported and skipped.
for position, tag in enumerate(html.find_all("img")):
    try:
        raw_bytes = request.urlopen(tag.attrs['data-src']).read()
        picture = Image.open(BytesIO(raw_bytes))
        picture.save(f"{position}.jpg")
    except Exception as err:
        print(f"Error : {err}")
    


In [None]:
# Grouping above code together
def download_images(path:str, url:str):
    """
    Download images from a Google Images result page and save them as JPEG files.

    Files are named "<n>.jpg", where n is the next number not yet used in `path`,
    so several calls can share one directory without overwriting each other.

    Input: path(str) :: Directory where to save images (created if it does not exist)
           url(str) :: Url (Google Images result page)
    Output: Int :: Number of saved Images
    """
    header = {'User-Agent': "Mozilla/5.0"}  # browser-like header sent with the page request
    # request the html page; the response is closed as soon as it has been parsed
    with request.urlopen(request.Request(url, headers=header)) as page:
        html = BeautifulSoup(page, 'html.parser')

    if path:
        os.makedirs(path, exist_ok=True)

    num_of_images = 0  # number of successfully saved images
    next_index = 0  # candidate number for the next file name

    for img in html.find_all("img", {"class": "rg_i Q4LuWd"}):  # find all image tags
        try:
            with request.urlopen(img.attrs['data-src']) as response:
                raw_bytes = response.read()
            # JPEG cannot store alpha/palette modes, so normalise to RGB before saving
            picture = Image.open(BytesIO(raw_bytes)).convert("RGB")

            # advance to the first file name that is still free in the target directory
            while os.path.exists(os.path.join(path, f"{next_index}.jpg")):
                next_index += 1
            picture.save(os.path.join(path, f"{next_index}.jpg"))
            next_index += 1
            num_of_images += 1
        except Exception as e:
            # best effort: the link is missing or the image could not be
            # downloaded/decoded/saved -- report it and go on with the next tag
            print(f"Error : {e}")

    # return number of saved images
    return num_of_images
    

In [None]:
# Google Images result-page URLs, one per search.
# TODO: the URLs are not part of this notebook -- paste them here before running the cell.
url_swiggy = ""
url_zomato = ""
url_dunzo = ""
url_foodpanda = ""
url_monkey = ""
url_cat = ""
url_dog = ""
url_person = ""

# (label used in the report, target directory, result-page URL)
# The "other" directory name matches the one read by the person-extraction cell below.
download_jobs = [
    ("Swiggy", "../data/multilabel-classification/images/swiggy", url_swiggy),
    ("zomato", "../data/multilabel-classification/images/other", url_zomato),
    ("dunzo", "../data/multilabel-classification/images/other", url_dunzo),
    ("foodpanda", "../data/multilabel-classification/images/other", url_foodpanda),
    ("monkey", "../data/multilabel-classification/images/non-human/", url_monkey),
    ("cat", "../data/multilabel-classification/images/non-human/", url_cat),
    ("dog", "../data/multilabel-classification/images/non-human/", url_dog),
    ("person", "../data/multilabel-classification/images/other", url_person),
]

for label, target_dir, page_url in download_jobs:
    if not page_url:
        # nothing to request until the URL placeholder above has been filled in
        print(f"No URL set for {label} images, skipping download")
        continue
    saved = download_images(target_dir, page_url)
    print(f"Number of {label} images downloaded: {saved}")


Images of swiggy, zomato, dunzo, foodpanda, animals(monkey, cat, dog) are extracted.

## Person Crop
We assume that a test image will contain only a single person per frame/image. It will boost accuracy if the training samples also contain such images.

In [None]:
# load and preprocess image for model

def load_image(img_path:str):
    """
    Read an image file and convert it into a batched tensor for a PyTorch model.
    Input : img_path(str) :: path to image file
    Output : torch.Tensor :: ToTensor() result of the RGB image, restricted to its
             first three channels, with a batch dimension added in front
    """
    to_tensor = transforms.Compose([transforms.ToTensor()])  # PIL image -> tensor
    rgb_image = Image.open(img_path).convert('RGB')  # force three colour channels
    tensor = to_tensor(rgb_image)
    # keep at most the first three channels (drops an alpha channel if there were one),
    # then add the batch dimension
    return tensor[:3, :, :].unsqueeze(0)

In [None]:
# load pretrained Faster R-CNN (ResNet-50 FPN) model
fasterrcnn = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# check if GPU is available
use_cuda = torch.cuda.is_available()

# transfer the model to the GPU if available
if use_cuda:
    fasterrcnn = fasterrcnn.cuda()

# set model to evaluation mode (inference only)
fasterrcnn.eval()

In [None]:
def croped_person(path_to_image:str):
    """
    Extract and crop persons from the given image path.

    Runs the module-level `fasterrcnn` detector on the image, keeps detections
    with label 1 (used here as the person class) and score > 0.9, merges
    overlapping boxes with non-max suppression and cuts each remaining box
    out of the image.

    Input : path_to_image(str) :: path to the image from which to extract persons
    Output : List :: List of cropped person images (arrays as read by cv2.imread);
             empty when no detection passes the filters
    """
    # load image as a batched tensor
    img = load_image(path_to_image)
    # transfer image to gpu if available
    if use_cuda:
        img = img.cuda()

    # inference only, so no autograd graph is needed
    with torch.no_grad():
        detections = fasterrcnn(img)[0]

    # One boolean mask over ALL detections: person label and score > 0.9.
    # Filtering in two index-based steps would yield positions relative to the
    # person-only subset, which then select the wrong rows of 'boxes'.
    keep = (detections['labels'] == 1) & (detections['scores'] > 0.9)

    # extracted boxes
    boxes = detections['boxes'][keep].cpu().numpy()

    # apply non max suppression
    boxes = non_max_suppression(boxes, probs=None, overlapThresh=0.80)

    # read image to crop
    img = cv2.imread(path_to_image)

    # each box is indexed as (x1, y1, x2, y2); image rows are y, columns are x
    return [img[rect[1]:rect[3], rect[0]:rect[2]] for rect in boxes]

### Extract person from Swiggy and other dataset.

In [None]:
# crop the persons out of every image in the swiggy directory
swiggy_persons = []
path = "../data/multilabel-classification/images/swiggy/"
for img in os.listdir(path):
    path_img = path+img
    swiggy_persons.append(croped_person(path_img))

# save extracted persons: one entry (list of crops) per source image.
# The crops have different shapes, so they are stored in an object array that is
# filled slot by slot instead of letting numpy try to build a regular n-d array.
swiggy_array = np.empty(len(swiggy_persons), dtype=object)
for slot, crops in enumerate(swiggy_persons):
    swiggy_array[slot] = crops
np.save('../data/swiggy.npy', swiggy_array)

In [None]:
# crop the persons out of every image in the other directory
other_persons = []
path = "../data/multilabel-classification/images/other/"
for img in os.listdir(path):
    path_img = path+img
    other_persons.append(croped_person(path_img))

# save extracted persons: one entry (list of crops) per source image.
# The crops have different shapes, so they are stored in an object array that is
# filled slot by slot instead of letting numpy try to build a regular n-d array.
other_array = np.empty(len(other_persons), dtype=object)
for slot, crops in enumerate(other_persons):
    other_array[slot] = crops
# same location the checkpoint cell loads from
np.save('../data/other.npy', other_array)

## Checkpoint 
You can continue from here after a notebook restart or in a new session (after running the imports).

In [None]:
# Reload the saved person crops for both classes.
# allow_pickle=True is passed because the files are loaded as pickled object arrays.
swiggy = np.load(
    '../data/swiggy.npy',
    allow_pickle=True,
)
other = np.load(
    '../data/other.npy',
    allow_pickle=True,
)

In [None]:
# save image 
def write_img(path:str, matrix:list):
    """
    Save every cropped person image as "<path>/<n>.jpg", numbered from 0.

    Input : path(str) :: Directory in which to save the images (created if missing)
            matrix(list/array) :: one entry per source image, each entry being
                                  the list of person crops found in that image
    Output : None
    """
    # cv2.imwrite only returns False when the directory is missing, so create it first
    if path:
        os.makedirs(path, exist_ok=True)

    i = 0  # running file name
    for image in matrix:
        # for each source image
        for person in image:
            # for each person detected in the image
            if person is None or person.size == 0:
                # nothing to encode for an empty crop
                continue
            if not cv2.imwrite(f"{path}/{i}.jpg", person):
                print(f"Error : could not write {path}/{i}.jpg")
            i += 1
            

In [None]:
# Write the person crops of each class into its own output directory.
for out_dir, person_crops in (
    ('../data/classification/swiggy', swiggy),
    ('../data/classification/other', other),
):
    write_img(out_dir, person_crops)

Persons were extracted from each image, and false detections were manually removed/cleaned.

### Logo
As the Swiggy logo can also be an important feature, being able to identify the Swiggy logo in an image would be an important turning point. I have hand-cropped and labelled the Swiggy logo in every image.

## Creating CSV file for images
For training a multi-label classifier, it is convenient to have a CSV file with image paths and labels. Two labels suffice: one for `Human` and one for `Swiggy`. The lower value (0) of each label denotes `No-Human` and `Other` respectively.

In [7]:
# Root directory of the image data set. Every entry of `classes` is a
# sub-directory of it, named "<human|non-human>/<swiggy|other>".
data_path = "../data/multilabel-classification/images/"
classes = [
    "human/swiggy",
    "human/other",
    "non-human/swiggy",
    "non-human/other",
]

In [8]:
# Collect, for every file of every class directory, its path and its
# [Human, Swiggy] label pair derived from the two parts of the directory name.
mapping = {
    'human': 1,
    'non-human': 0,
    'swiggy': 1,
    'other': 0,
}

list_of_images = []
list_of_tags = []

for cls in classes:
    class_dir = f"{data_path}{cls}"
    for file_name in os.listdir(class_dir):
        # a fresh label list per image, e.g. 'human/other' -> [1, 0]
        pair = [mapping[part] for part in cls.split('/')]
        list_of_images.append(f"{class_dir}/{file_name}")
        list_of_tags.append(pair)

print("Sample 5 Items")
list(zip(list_of_images[:5], list_of_tags[:5]))
            


Sample 5 Items


[('../data/multilabel-classification/images/human/swiggy/163.jpg', [1, 1]),
 ('../data/multilabel-classification/images/human/swiggy/65.jpg', [1, 1]),
 ('../data/multilabel-classification/images/human/swiggy/150.jpg', [1, 1]),
 ('../data/multilabel-classification/images/human/swiggy/221.jpg', [1, 1]),
 ('../data/multilabel-classification/images/human/swiggy/158.jpg', [1, 1])]

In [9]:
# Build the labels table: the two lists become the rows 'path' and 'labels',
# and transposing turns them into columns with one row per image.
df = pd.DataFrame(
    [list_of_images, list_of_tags],
    index=['path', 'labels'],
).transpose()

df.sample(10)

Unnamed: 0,path,labels
299,../data/multilabel-classification/images/human...,"[1, 0]"
365,../data/multilabel-classification/images/human...,"[1, 0]"
254,../data/multilabel-classification/images/human...,"[1, 0]"
46,../data/multilabel-classification/images/human...,"[1, 1]"
761,../data/multilabel-classification/images/non-h...,"[0, 0]"
168,../data/multilabel-classification/images/human...,"[1, 1]"
308,../data/multilabel-classification/images/human...,"[1, 0]"
27,../data/multilabel-classification/images/human...,"[1, 1]"
559,../data/multilabel-classification/images/non-h...,"[0, 1]"
800,../data/multilabel-classification/images/non-h...,"[0, 0]"


In [10]:
# Spread the [Human, Swiggy] pair over one column per label:
# position 0 of the pair -> 'human', position 1 -> 'swiggy'.
for position, column in enumerate(('human', 'swiggy')):
    df[column] = df['labels'].apply(lambda pair, position=position: pair[position])

df.sample(10)

Unnamed: 0,path,labels,human,swiggy
768,../data/multilabel-classification/images/non-h...,"[0, 0]",0,0
351,../data/multilabel-classification/images/human...,"[1, 0]",1,0
189,../data/multilabel-classification/images/human...,"[1, 1]",1,1
429,../data/multilabel-classification/images/human...,"[1, 0]",1,0
646,../data/multilabel-classification/images/non-h...,"[0, 0]",0,0
450,../data/multilabel-classification/images/non-h...,"[0, 1]",0,1
65,../data/multilabel-classification/images/human...,"[1, 1]",1,1
210,../data/multilabel-classification/images/human...,"[1, 1]",1,1
250,../data/multilabel-classification/images/human...,"[1, 0]",1,0
130,../data/multilabel-classification/images/human...,"[1, 1]",1,1


In [11]:
# Shuffle the dataframe rows and renumber the index from 0.
# A fixed seed keeps the row order (and therefore the saved labels file)
# the same every time the cell is run on the same input table.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df.head()

Unnamed: 0,path,labels,human,swiggy
0,../data/multilabel-classification/images/human...,"[1, 0]",1,0
1,../data/multilabel-classification/images/human...,"[1, 0]",1,0
2,../data/multilabel-classification/images/non-h...,"[0, 1]",0,1
3,../data/multilabel-classification/images/non-h...,"[0, 0]",0,0
4,../data/multilabel-classification/images/human...,"[1, 0]",1,0


In [12]:
# Write the labels table to disk as CSV, leaving out the index column.
df.to_csv(
    path_or_buf='../data/multilabel-classification/labels.csv',
    index=False,
)

In [15]:
# Random split of the labels into train (70%), val and test (the remaining 30% split in half).
# The splits are seeded, so for a given row order of df the same rows end up in
# the same file on every run and the test set does not change between experiments.

train_size = 0.7
split_seed = 42

# `holdout` is used instead of `other`, which already names the cropped-person
# array loaded in the checkpoint cell
train, holdout = train_test_split(df, train_size=train_size, random_state=split_seed)
val, test = train_test_split(holdout, test_size=0.5, random_state=split_seed)

train.to_csv('../data/multilabel-classification/train.csv', index=False)
val.to_csv('../data/multilabel-classification/val.csv', index=False)
test.to_csv('../data/multilabel-classification/test.csv', index=False)

### Done !!
In this Notebook we have 
* Collected data
* Extracted Person from Images
* Extracted logo from Images
* Created labels csv for multi label classification
* Split Dataset into train, val, test