In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
from PIL import Image

In [2]:
# Load the pretrained model
model = models.resnet18(pretrained=True)
# Use the model object to select the desired layer
layer = model._modules.get('avgpool')

In [3]:
# Set model to evaluation mode
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [4]:
scaler = transforms.Scale((224, 224))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
to_tensor = transforms.ToTensor()

  "please use transforms.Resize instead.")


In [5]:
def get_vector(image_name):
    # 1. Load the image with Pillow library
    img = Image.open(image_name)
    # 2. Create a PyTorch Variable with the transformed image
    t_img = Variable(normalize(to_tensor(scaler(img))).unsqueeze(0))
    # 3. Create a vector of zeros that will hold our feature vector
    #    The 'avgpool' layer has an output size of 512
    my_embedding = torch.zeros(512)
    # 4. Define a function that will copy the output of a layer
    def copy_data(m, i, o):
        my_embedding.copy_(o.data.squeeze())
    # 5. Attach that function to our selected layer
    h = layer.register_forward_hook(copy_data)
    # 6. Run the model on our transformed image
    model(t_img)
    # 7. Detach our copy function from the layer
    h.remove()
    # 8. Return the feature vector
    return my_embedding

In [6]:
import os, shutil
folder = '../data/copy/spam/'
for the_file in os.listdir(folder):
    file_path1 = os.path.join(folder, the_file)
#     print(file_path)
    if os.path.isfile(file_path1):
        for the_file in os.listdir(folder):
            file_path2 = os.path.join(folder, the_file)
            if file_path1 != file_path2:
                pic_one_vector = get_vector(file_path1)
                pic_two_vector = get_vector(file_path2)
                cos = nn.CosineSimilarity(dim=1, eps=1e-6)
                cos_sim = cos(pic_one_vector.unsqueeze(0),
                              pic_two_vector.unsqueeze(0))
    #             print(cos_sim.data.tolist()[0])
                if cos_sim.data[0] >= 0.95:
                    print(file_path1,file_path2,' found duplicate')
                    try:
                        os.unlink(file_path2)
                    except Exception as e:
                        print(e)

../data/copy/spam/115.jpg ../data/copy/spam/extra_937.jpg  found duplicate
../data/copy/spam/630.jpg ../data/copy/spam/extra_670.jpg  found duplicate
../data/copy/spam/630.jpg ../data/copy/spam/extra_887.jpg  found duplicate
../data/copy/spam/796.jpg ../data/copy/spam/extra_178.jpg  found duplicate
../data/copy/spam/796.jpg ../data/copy/spam/extra_485.jpg  found duplicate
../data/copy/spam/extra_529.jpg ../data/copy/spam/extra_530.jpg  found duplicate
../data/copy/spam/extra_555.jpg ../data/copy/spam/extra_561.jpg  found duplicate
../data/copy/spam/extra_555.jpg ../data/copy/spam/extra_562.jpg  found duplicate
../data/copy/spam/extra_914.jpg ../data/copy/spam/extra_993.jpg  found duplicate
../data/copy/spam/extra_914.jpg ../data/copy/spam/656.jpg  found duplicate
../data/copy/spam/extra_980.jpg ../data/copy/spam/extra_265.jpg  found duplicate
../data/copy/spam/extra_980.jpg ../data/copy/spam/extra_266.jpg  found duplicate
../data/copy/spam/extra_425.jpg ../data/copy/spam/extra_449.jpg 

In [7]:
folder = '../data/copy/posts/'
for the_file in os.listdir(folder):
    file_path1 = os.path.join(folder, the_file)
#     print(file_path)
    if os.path.isfile(file_path1):
        for the_file in os.listdir(folder):
            file_path2 = os.path.join(folder, the_file)
            if file_path1 != file_path2:
                pic_one_vector = get_vector(file_path1)
                pic_two_vector = get_vector(file_path2)
                cos = nn.CosineSimilarity(dim=1, eps=1e-6)
                cos_sim = cos(pic_one_vector.unsqueeze(0),
                              pic_two_vector.unsqueeze(0))
    #             print(cos_sim.data.tolist()[0])
                if cos_sim.data[0] >= 0.95:
                    print(file_path1,file_path2,' found duplicate')
                    try:
                        os.unlink(file_path2)
                    except Exception as e:
                        print(e)

../data/copy/posts/463.jpg ../data/copy/posts/559.jpg  found duplicate
../data/copy/posts/463.jpg ../data/copy/posts/578.jpg  found duplicate
../data/copy/posts/463.jpg ../data/copy/posts/814.jpg  found duplicate
../data/copy/posts/463.jpg ../data/copy/posts/818.jpg  found duplicate
../data/copy/posts/463.jpg ../data/copy/posts/885.jpg  found duplicate
../data/copy/posts/463.jpg ../data/copy/posts/91.jpg  found duplicate
../data/copy/posts/463.jpg ../data/copy/posts/819.jpg  found duplicate
../data/copy/posts/463.jpg ../data/copy/posts/938.jpg  found duplicate
../data/copy/posts/463.jpg ../data/copy/posts/959.jpg  found duplicate
../data/copy/posts/463.jpg ../data/copy/posts/96.jpg  found duplicate
../data/copy/posts/463.jpg ../data/copy/posts/982.jpg  found duplicate
../data/copy/posts/471.jpg ../data/copy/posts/392.jpg  found duplicate
../data/copy/posts/652.jpg ../data/copy/posts/376.jpg  found duplicate
../data/copy/posts/654.jpg ../data/copy/posts/506.jpg  found duplicate
../data/

KeyboardInterrupt: 