# Объединяем сегментацию и распознавание в одно работающее целое

In [1]:
!pip install segmentation-models-pytorch

In [2]:
import sys
import os
import numpy as np
import random

from tqdm import tqdm
import cv2
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import segmentation_models_pytorch as smp
import matplotlib.pyplot as plt


In [3]:
config = {
    "model": {
        "name": "Unet",
        "parameters": {
            "encoder_name": "resnet34",
            "encoder_weights": None,
            "in_channels": 3,
            "classes": 1,
        }
    },
    "seed": 42,
    "transforms": A.Compose([
        A.Resize(width=512, height=512),
        A.Normalize(),
        ToTensorV2(),
    ]),

    "post-transforms": A.Resize(width=3000, height=3000),
    "threshold": 0.7,
}

#'../input/tryyyy/data/data/train_segmentation/images/0_0_eng.jpg'

In [4]:
"""model_state = torch.load_state_dict(torch.load('../input/tryyyy/weights/model-segmentation.pth'))
print(model_state)

"""
model_path = "../input/tryyyy/weights/model-segmentation.pth"
model = getattr(smp, config["model"]["name"])(**config["model"]["parameters"])
model_state = torch.load(model_path, map_location=torch.device('cpu'))["model_state"]
# print(model_state.keys())

In [5]:
#A.transforms.CLAHE


import albumentations as A

transform = A.Compose([
    A.transforms.CLAHE(clip_limit=(1.0, 2.0))], p=1)

image = '../input/tryyyy/data/data/train_segmentation/images/0_0_eng.jpg'
image = cv2.imread(image)
plt.figure(figsize=(15, 15))
plt.imshow(image)
plt.show()
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Augment an image
transformed = transform(image=image)
transformed_image = transformed["image"]




plt.figure(figsize=(15, 15))
plt.imshow(transformed_image)
plt.show()

In [9]:
class Predictor:
    def __init__(self, model, device="cpu", threshold=0.7):
        self.model = model
        self.device = torch.device(device)
        self.threshold = threshold

    def __call__(self, loader):
        predictions = []

        self.model.to(self.device)
        with torch.no_grad():
            for batch in tqdm(loader, position=0, leave=True, desc="Predicting: "):
                inputs = batch.to(self.device)
                batch_predictions = self.model(inputs)
                batch_predictions = batch_predictions.to("cpu").detach()
                batch_predictions = torch.where(batch_predictions > self.threshold, 1, 0)
                batch_predictions = batch_predictions.permute(0, 2, 3, 1).numpy()
                predictions.extend(batch_predictions)

        predictions = np.array(predictions)
        return predictions


class SegmentationDataset(Dataset):
    def __init__(self, pathes, masks=None, transforms=None):
        self.pathes = pathes
        self.masks = masks
        self.transforms = transforms

    def __len__(self):
        return len(self.pathes)

    def __getitem__(self, index):
        image_path = self.pathes[index]
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.masks:
            mask = self.masks[index].astype(np.int8)

        if self.transforms:
            if self.masks:
                transformed = self.transforms(image=image, mask=mask)
                image = transformed["image"]
                mask = transformed["mask"]
            else:
                transformed = self.transforms(image=image)
                image = transformed["image"]

        if self.masks:
            return image, mask

        return image

def main():
    #test_image_path, output_path = sys.argv[1:]
    test_image_path = '../input/tryyyy/data/data/train_segmentation/images/0_1_eng.jpg'
    output_path = './try'
    #test_images_filenames = os.listdir(test_image_path)
    test_images_pathes = [test_image_path]
    #test_images_pathes = [os.path.join(test_image_path, file) for file in test_images_filenames]
    shapes = np.array([np.asarray(cv2.imread(path)).shape[1::-1] for path in test_images_pathes])

    print(shapes)


    dataset = SegmentationDataset(pathes=test_images_pathes, transforms=config["transforms"])
    loader = DataLoader(dataset=dataset,
                        batch_size=9,
                        shuffle=False,
                        num_workers=0,
                        pin_memory=True,
                        drop_last=False)

    print(f"Loaded dataset with: {len(dataset)} samples")

    model_path = "../input/tryyyy/weights/model-segmentation.pth"
    model = getattr(smp, config["model"]["name"])(**config["model"]["parameters"])
    model_state = torch.load(model_path, map_location=torch.device('cpu'))["model_state"]
    model.load_state_dict(model_state)
    print(f"Loaded {model_path}")
    model.eval()

    predictor = Predictor(model=model, device="cpu", threshold=config["threshold"])
    predictions = predictor(loader)

    resized_predictions = []
    for prediction, shape in zip(predictions, shapes):
        prediction = prediction.astype(np.int8)
        resized_prediction = cv2.resize(prediction,shape, interpolation=cv2.INTER_NEAREST)
        resized_predictions.append(resized_prediction)

    resized_predictions = np.array(resized_predictions)
    results = {k: v for k, v in zip(test_images_pathes, resized_predictions)}
    np.savez_compressed(output_path, **results)

    print(f"Saved results: {output_path}")
    return results

#хочу бви (-_-)
if __name__ == "__main__":
    image_path = '../input/tryyyy/data/data/train_segmentation/images/0_0_eng.jpg'
    img = cv2.imread(image_path)
    mask = main()
    for contour in mask:
        cv2.drawContours(img, np.array([contour]), -1, (0, 255, 0), 2)

    plt.figure(figsize=(15, 15))
    plt.imshow(img)
    plt.show()

In [10]:
def get_contours_from_mask(mask, min_area=20):
    mask = torch.from_numpy(mask)
    contours, hierarchy = cv2.findContours(mask.astype(np.uint8),
                                           cv2.RETR_LIST,
                                           cv2.CHAIN_APPROX_SIMPLE)
    contour_list = []
    for contour in contours:
        if cv2.contourArea(contour) >= min_area:
            contour_list.append(contour)
    return contour_list


if __name__ == "__main__":
    image_path = '../input/tryyyy/data/data/train_segmentation/images/0_0_eng.jpg'
    img = cv2.imread(image_path)
    mask = main()
    print(mask)
    contours = get_contours_from_mask(mask[image_path])
    img = cv2.imread(image_path)

    for contour in contours:
        cv2.drawContours(img, np.array([contour]), -1, (0, 255, 0), 2)

    plt.figure(figsize=(15, 15))
    plt.imshow(img)
    plt.show()

In [11]:

image_path = '../input/tryyyy/data/data/train_segmentation/images/0_1_eng.jpg'
img = cv2.imread(image_path)

    
fig = plt.figure(figsize=(20, 20))


ax1 = fig.add_subplot(1, 3, 2)
ax1.imshow(mask[image_path].squeeze())
ax1.set_title("Ground Truth")



In [12]:
def crop_img_by_polygon(img, polygon):
    # https://stackoverflow.com/questions/48301186/cropping-concave-polygon-from-image-using-opencv-python
    pts = np.array(polygon)
    rect = cv2.boundingRect(pts)
    x,y,w,h = rect
    croped = img[y:y+h, x:x+w].copy()
    pts = pts - pts.min(axis=0)
    mask = np.zeros(croped.shape[:2], np.uint8)
    cv2.drawContours(mask, [pts], -1, (255, 255, 255), -1, cv2.LINE_AA)
    dst = cv2.bitwise_and(croped, croped, mask=mask)
    return dst

image_path = '../input/tryyyy/data/data/train_segmentation/images/0_1_eng.jpg'
img = cv2.imread(image_path)

for contour in contours:
    if contour is not None:
        crop = crop_img_by_polygon(img, contour)
        #print(crop)
        


# Щас будем творить вторую часть OCR

In [14]:
import torch
import torch.nn as nn
import torchvision

import numpy as np
import cv2
import os
import sys
import json

In [15]:
OOV_TOKEN = '<OOV>'
CTC_BLANK = '<BLANK>'


def get_char_map(alphabet):
    """Make from string alphabet character2int dict.
    Add BLANK char fro CTC loss and OOV char for out of vocabulary symbols."""
    char_map = {value: idx + 2 for (idx, value) in enumerate(alphabet)}
    char_map[CTC_BLANK] = 0
    char_map[OOV_TOKEN] = 1
    return char_map


class Tokenizer:
    """Class for encoding and decoding string word to sequence of int
    (and vice versa) using alphabet."""

    def __init__(self, alphabet):
        self.char_map = get_char_map(alphabet)
        self.rev_char_map = {val: key for key, val in self.char_map.items()}

    def encode(self, word_list):
        """Returns a list of encoded words (int)."""
        enc_words = []
        for word in word_list:
            enc_words.append(
                [self.char_map[char] if char in self.char_map
                 else self.char_map[OOV_TOKEN]
                 for char in word]
            )
        return enc_words

    def get_num_chars(self):
        return 150#len(self.char_map)

    def decode(self, enc_word_list):
        """Returns a list of words (str) after removing blanks and collapsing
        repeating characters. Also skip out of vocabulary token."""
        dec_words = []
        for word in enc_word_list:
            word_chars = ''
            for idx, char_enc in enumerate(word):
                # skip if blank symbol, oov token or repeated characters
                if (
                    char_enc != self.char_map[OOV_TOKEN]
                    and char_enc != self.char_map[CTC_BLANK]
                    # idx > 0 to avoid selecting [-1] item
                    and not (idx > 0 and char_enc == word[idx - 1])
                ):
                    word_chars += self.rev_char_map[char_enc]
            dec_words.append(word_chars)
        return dec_words

### 2.2. Базовые трансформы модели

Здесь мы задаем базовые трансформы для инференса OCR

In [16]:
from albumentations.pytorch.transforms import ToTensorV2

class Normalize:
    def __call__(self, img):
        img = img.astype(np.float32) / 255
        return img


class MoveChannels:
    """Move the channel axis to the zero position as required in pytorch."""

    def __init__(self, to_channels_first=True):
        self.to_channels_first = to_channels_first

    def __call__(self, image):
        if self.to_channels_first:
            return np.moveaxis(image, -1, 0)
        else:
            return np.moveaxis(image, 0, -1)


class ImageResize:
    def __init__(self, height, width):
        self.height = height
        self.width = width

    def __call__(self, image):
        image = cv2.resize(image, (self.width, self.height),
                           interpolation=cv2.INTER_LINEAR)
        return image


def get_val_transforms(height, width):
    transforms = torchvision.transforms.Compose([
        ImageResize(height, width),
        MoveChannels(to_channels_first=True),
        Normalize(),
        ToTensorV2(),
    ])
    return transforms

### 2.3. Здесь определяем саму модель - CRNN

In [None]:
#torch.load('../input/tryyyy/model-crnn_Tema2.ckpt')

### 2.4. Определяем класс для использования OCR-модели на инференсе

In [64]:
# Конфиг для модели OCR

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config_json = {
    "alphabet": ''' !"%\'()*+,-./0123456789:;<=>?ABCDEFGHIJKLMNOPRSTUVWXY[]_abcdefghijklmnopqrstuvwxyz|}ЁАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШЩЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё№''',
    "save_dir": "./experiments/test",
    "num_epochs": 100,
    "image": {
        "width": 256,
        "height": 64
    },
    "train": {
        "root_path": "../input/tryyyy/data/data/train_recognition/images",
        "json_path": "./train_labels_splitted.json",
        "batch_size": 128
    },
    "val": {
        "root_path": "../input/tryyyy/data/data/train_recognition/images",
        "json_path": "./val_labels_splitted.json",
        "batch_size": 128
    }
}

In [68]:
def get_resnet34_backbone(pretrained=True):
    m = torchvision.models.resnet34(pretrained=True)
    input_conv = nn.Conv2d(3, 64, 7, 1, 3)
    blocks = [input_conv, m.bn1, m.relu,
              m.maxpool, m.layer1, m.layer2, m.layer3]
    return nn.Sequential(*blocks)


class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout=0.15):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers,
            dropout=dropout, batch_first=True, bidirectional=True)

    def forward(self, x):
        out, _ = self.lstm(x)
        return out


class CRNN(nn.Module):
    def __init__(
        self, number_class_symbols, time_feature_count=256, lstm_hidden=256,
        lstm_len=2,
    ):
        super().__init__()
        self.feature_extractor = get_resnet34_backbone(pretrained=True)
        self.avg_pool = nn.AdaptiveAvgPool2d(
            (time_feature_count, time_feature_count))
        self.bilstm = BiLSTM(time_feature_count, lstm_hidden, lstm_len)
        self.classifier = nn.Sequential(
            nn.Linear(lstm_hidden * 2, time_feature_count),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(time_feature_count, number_class_symbols),
        )

    def forward(self, x):
        x = self.feature_extractor(x)
        b, c, h, w = x.size()
        x = x.view(b, c * h, w)
        x = self.avg_pool(x)
        x = x.transpose(1, 2)
        x = self.bilstm(x)
        x = self.classifier(x)
        x = nn.functional.log_softmax(x, dim=2).permute(1, 0, 2)
        return x

In [70]:
class InferenceTransform:
    def __init__(self, height, width):
        self.transforms = get_val_transforms(height, width)

    def __call__(self, images):
        transformed_images = list()
        for image in images:
            image = self.transforms(image=image)
            transformed_images.append(image['image'])
        transformed_tensor = torch.stack(transformed_images, 0)
        return transformed_tensor


class OcrPredictor:
    def __init__(self, model_path, config, device='cuda'):
        self.tokenizer = Tokenizer(config['alphabet'])
        self.device = torch.device(device)
        # load model
        self.model = CRNN(number_class_symbols=self.tokenizer.get_num_chars())

        self.model.load_state_dict(torch.load(model_path))
        self.model.to(self.device)

        self.transforms = InferenceTransform(
            height=config['image']['height'],
            width=config['image']['width'],
        )

    def __call__(self, images):
        if isinstance(images, (list, tuple)):
            one_image = False
        elif isinstance(images, np.ndarray):
            images = [images]
            one_image = True
        else:
            raise Exception(f"Input must contain np.ndarray, "
                            f"tuple or list, found {type(images)}.")

        images = self.transforms(images)
        pred = predict(images, self.model, self.tokenizer, self.device)

        if one_image:
            return pred[0]
        else:
            return pred

In [71]:
predictor = OcrPredictor(
    model_path='../input/weight/model-7-0.2085.ckpt',
    config=config_json
)

In [60]:


pred_json = {}

# img_path = "../input/tryyyy/data/data/train_recognition/images/0.png"
# img = cv2.imread(img_path)
# pred = predictor(img)

# img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# plt.imshow(img)
# plt.show()

# print('Prediction: ', pred)
# print('True: ', val_img[1])


print_images = True
for val_img in val_data_splitted[20:25]:
    img = cv2.imread(f'../input/tryyyy/data/data/train_recognition/images/{val_img[0]}')
    
    pred = predictor(img)
    pred_json[val_img[0]] = pred

    if print_images:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.imshow(img)
        plt.show()
        print('Prediction: ', predictor(img))
        print('True: ', val_img[1])

### Применим модель

In [59]:
import pandas as pd

train_csv = pd.read_csv('../input/tryyyy/data/data/train_recognition/labels.csv')
train_csv = train_csv.sample(frac = 1)

train_data = dict(train_csv[['file_name','text']].values)

train_data = [(k, v) for k, v in train_data.items()]
print('train len', len(train_data))

split_coef = 0.75
train_len = int(len(train_data)*split_coef)

train_data_splitted = train_data[:train_len]
val_data_splitted = train_data[train_len:]

print('train len after split', len(train_data_splitted))
print('val len after split', len(val_data_splitted))



In [41]:
with open('./train_labels_splitted.json', 'w') as f:
    json.dump(dict(train_data_splitted), f)
    
with open('./val_labels_splitted.json', 'w') as f:
    json.dump(dict(val_data_splitted), f)

RuntimeError: Error(s) in loading state_dict for CRNN:
	size mismatch for classifier.3.weight: copying a param with shape torch.Size([151, 256]) from checkpoint, the shape in current model is torch.Size([150, 256]).
	size mismatch for classifier.3.bias: copying a param with shape torch.Size([151]) from checkpoint, the shape in current model is torch.Size([150]).

In [None]:
#['state_dict']
temp = torch.load('../input/tryyyy/model-crnn_Tema2.ckpt')
print(list(temp.values())[0].shape)

In [None]:
ckpt = torch.load("../input/tryyyy/model-crnn_Tema2.ckpt")

"""for key in ckpt:
    print(key)"""

In [None]:
from PIL import ImageFont, ImageDraw, Image

def get_image_visualization(img, pred_data, fontpath, font_koef=50):
    h, w = img.shape[:2]
    font = ImageFont.truetype(fontpath, int(h/font_koef))
    empty_img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(empty_img)

    for prediction in pred_data['predictions']:
        polygon = prediction['polygon']
        pred_text = prediction['text']
        cv2.drawContours(img, np.array([polygon]), -1, (0, 255, 0), 2)
        x, y, w, h = cv2.boundingRect(np.array([polygon]))
        draw.text((x, y), pred_text, fill=0, font=font)

    vis_img = np.array(empty_img)
    vis = np.concatenate((img, vis_img), axis=1)
    return vis



def crop_img_by_polygon(img, polygon):
    # https://stackoverflow.com/questions/48301186/cropping-concave-polygon-from-image-using-opencv-python
    pts = np.array(polygon)
    rect = cv2.boundingRect(pts)
    x,y,w,h = rect
    croped = img[y:y+h, x:x+w].copy()
    pts = pts - pts.min(axis=0)
    mask = np.zeros(croped.shape[:2], np.uint8)
    cv2.drawContours(mask, [pts], -1, (255, 255, 255), -1, cv2.LINE_AA)
    dst = cv2.bitwise_and(croped, croped, mask=mask)
    return dst



class PiepleinePredictor:
    def __init__(self, segm_model_path, ocr_model_path, ocr_config):
        model_path = "../input/tryyyy/weights/model-segmentation.pth"
        model = getattr(smp, config["model"]["name"])(**config["model"]["parameters"])
        model_state = torch.load(model_path, map_location=torch.device('cpu'))["model_state"]
        model.load_state_dict(model_state)
        model.eval()

        self.segm_predictor = Predictor(model=model)
        self.ocr_predictor = OcrPredictor(
            model_path='../input/tryyyy/weights/model-ocr.ckpt',
            config=config_json)
        
    def __call__(self, img):
        output = {'predictions': []}
        contours = self.segm_predictor(img)
        for contour in contours:
            if contour is not None:
                crop = crop_img_by_polygon(img, contour)
                print(crop)
                pred_text = self.ocr_predictor(crop)
                output['predictions'].append(
                    {
                        'polygon': [[int(i[0][0]), int(i[0][1])] for i in contour],
                        'text': pred_text
                    }
                )
        return output
    
    


In [None]:

pipeline_predictor = PiepleinePredictor(
    segm_model_path='../input/tryyyy/weights/model-segmentation.pth',
    ocr_model_path='../input/tryyyy/weights/model-ocr.ckpt',
    ocr_config=config_json
)
    
image_path = '../input/tryyyy/data/data/train_segmentation/images/0_0_eng.jpg'

img = cv2.imread(image_path)
output = pipeline_predictor(img)

vis = get_image_visualization(img, output, 'font.otf')

plt.figure(figsize=(20, 20))
plt.imshow(vis)
plt.show()

RuntimeError: Error(s) in loading state_dict for CRNN:
	size mismatch for classifier.3.weight: copying a param with shape torch.Size([114, 256]) from checkpoint, the shape in current model is torch.Size([151, 256]).
	size mismatch for classifier.3.bias: copying a param with shape torch.Size([114]) from checkpoint, the shape in current model is torch.Size([151]).