In [122]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp

# parse features and labels for train and test data
def parse_data(fpath):
    return pd.read_csv(fpath, header=None)
#     features = df.iloc[:,1:]
#     labels = df.iloc[:,0]
#     print(features, labels)
#     return np.array(features), np.array(labels)

train_feat_path = 'data/features_train/features_resnet1000_train.csv'
test_feat_path = 'data/features_test/features_resnet1000_test.csv'

train_features = parse_data(train_feat_path)
test_features = parse_data(test_feat_path)

In [123]:
import glob
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

# preprocess descriptions to remove noises
def preprocess_descriptions(fpath):
    lmt = WordNetLemmatizer()
    stemmer = PorterStemmer()
    descriptions = []
    
    for fname in glob.glob(fpath):
        file = open(fname, 'r')
        desc = file.read()
        desc = np.char.lower(desc)

        # replace punctuations in each set of descriptions
        desc = re.sub('[^\w\s]', ' ' , str(desc))

        words = []
        for i, word in enumerate(desc.split()):
            # if not stopword, lemmatize and stem word
            if word not in stopwords.words('english'):
                word = lmt.lemmatize(word)
                words.append(stemmer.stem(word))
        description = ' '.join(words)
        image_idx = int(fname.split('/')[-1].split('.')[0])
        descriptions.insert(image_idx, description)
    return descriptions

train_desc_fpath = 'data/descriptions_train/*.txt'
test_desc_fpath = 'data/descriptions_test/*.txt'

train_desc = preprocess_descriptions(train_desc_fpath)
test_desc = preprocess_descriptions(test_desc_fpath)

In [119]:
print (train_desc)

['skateboard put show use picnic tabl stage skateboard pull trick top picnic tabl man ride skateboard top tabl skate boarder trick picnic tabl person ride skateboard picnic tabl crowd watch', 'bowl soup carrot shrimp noodl healthi food bowl readi eat soup carrot shrimp sit next chopstick tasti bowl ramen serv someon enjoy bowl asian noodl soup shrimp carrot', 'teddi bear cloth hang cloth line outsid window stuf toy hang laundri line item left air dri cloth teddi bear pin outdoor cloth line stuf bear hang window', 'cat ground shoe kitten play shoe lace pair blue shoe kitten play shoelac floor grey tabbi kitten play navi blue shoe string shoe gray tiger cat walk across brick floor', 'man walk across street busi intersect ice cream truck drive man walk behind ice cream truck man cross street near icecream truck man walk behind concess bu', 'two woman walk togeth near tree pretti exot woman dress pretti robe orient woman costum onlook nearbi outdoor walkway woman geisha dress tourist area 

In [10]:
large_word_dict = {}
# create dictionary based on train set
def create_word_dict():
    
    #build a list of words from train descriptions
    for desc in train_desc:
        for word in desc.split():
            if word in large_word_dict:
                large_word_dict[word] += 1
            else:
                large_word_dict[word] = 1
                
    for desc in test_desc:
        for word in desc.split():
            if word in large_word_dict:
                large_word_dict[word] += 1
            else:
                large_word_dict[word] = 1
    
                        
create_word_dict()

{'skateboard': 1379, 'put': 120, 'show': 363, 'use': 383, 'picnic': 79, 'tabl': 3115, 'stage': 25, 'pull': 351, 'trick': 400, 'top': 2718, 'man': 7245, 'ride': 2106, 'skate': 342, 'boarder': 62, 'person': 2504, 'crowd': 389, 'watch': 565, 'bowl': 651, 'soup': 82, 'carrot': 259, 'shrimp': 14, 'noodl': 31, 'healthi': 13, 'food': 1336, 'readi': 391, 'eat': 1013, 'sit': 6637, 'next': 3755, 'chopstick': 10, 'tasti': 26, 'ramen': 3, 'serv': 220, 'someon': 294, 'enjoy': 108, 'asian': 77, 'teddi': 480, 'bear': 1108, 'cloth': 169, 'hang': 440, 'line': 463, 'outsid': 848, 'window': 828, 'stuf': 378, 'toy': 245, 'laundri': 5, 'item': 293, 'left': 90, 'air': 768, 'dri': 144, 'pin': 22, 'outdoor': 249, 'cat': 1872, 'ground': 455, 'shoe': 153, 'kitten': 99, 'play': 1717, 'lace': 4, 'pair': 325, 'blue': 1421, 'shoelac': 2, 'floor': 593, 'grey': 164, 'tabbi': 27, 'navi': 13, 'string': 39, 'gray': 195, 'tiger': 20, 'walk': 1831, 'across': 362, 'brick': 350, 'street': 3116, 'busi': 307, 'intersect': 225

In [None]:
from sklearn import preprocessing as pp

# build bag of words for test and train descriptions
word_dict = {}
def build_bag_of_words(path, thresh):

    index = 0
    
#     word_dict = large_word_dict
    
    for w in large_word_dict:
        if large_word_dict[w] > thresh:
            word_dict[w] = index
            index += 1  
    
    des_vec = {}
    
    for ind, desc in enumerate(path):
        # Lemmatize and remove stop words
        cur = [0.] * len(word_dict)
#         print (desc.split())
        for word in desc.split():
            if word in word_dict:
#            print ("success")
                cur[word_dict[word]] += 1.
#             try:
                
#             except:
#                 pass
        des_vec[ind] = cur
#         print (len(cur))
#         des_vec = pp.normalize(des_vec, norm='l2')
    for k, v in des_vec.items():
        post = pp.normalize(np.array(v).reshape(1, -1), norm='l2')
        des_vec[k] = post.flatten()
    return des_vec

train_desc_features = build_bag_of_words(train_desc, 0)
test_desc_features = build_bag_of_words(test_desc, 0)

In [171]:
print (len(train_desc_features)) 
# print (test_desc_features) 

In [172]:
# from sklearn import preprocessing as pp

# def post_process_descriptions(desc_feat):
#     for k, v in desc_feat:
#         post = pp.normalize(np.array(v).reshape(1, -1), norm='l2')
#         desc_feat[k] = post.flatten()

# post_process_descriptions(train_desc_features)
# post_process_descriptions(test_desc_features)

In [173]:
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier(n_neighbors=1)
train_input, train_output = [], []

for idx in train_desc_features:
    train_input.append(train_desc_features[idx])
    train_output.append(idx)
    
train_input = np.array(train_input)
train_output = np.array(train_output)

knn.fit(train_input, train_output)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [174]:
# build dictionary of train labels to train features
tf_dict = {}
def map_train_labels_to_features():
    tf = train_features.as_matrix()
    for i  in range(len(tf)):
        name = int(tf[i][0].split("/")[1].split(".")[0])
        tf_dict[name] = np.array(tf[i][1:], dtype=float)

map_train_labels_to_features()

In [175]:
import scipy as sp

def get_dist(feat1, feat2):
    dist = sp.spatial.distance.cdist(feat1.reshape(1, -1), feat2.reshape(1, -1), 'euclidean').flatten()
    return dist[0]


def get_dist2(feat1, feat2):
    dist = sp.spatial.distance.cdist(feat1.reshape(1, -1), feat2.reshape(1, -1), 'cosine').flatten()
    return dist[0]

In [176]:
import operator

def get_top_20(des):
    # for each test description feature, predict the most similar train image based on KNN
    desc = np.array(test_desc_features[des])
    
    pred = knn.predict(desc.reshape(1, -1))[0]
    print (pred)
#     pred_int = int(pred[0].split(".")[0])
    
    # get train desc feature from train image
    tf_train = tf_dict[pred]

    feat_score = {}
    res = []

    # for each test feature, compute distance to train feature to select closest 20
    for tf_test in test_features.as_matrix():
        name = tf_test[0,].split("/")[1]
        #print(tf_test[1:], tf_train)
        score = get_dist(tf_test[1:], tf_train)
        feat_score[name] = score
        
    sorted_feat_score = sorted(feat_score.items(), key=operator.itemgetter(1))
    for i in range(20):
        res.append(sorted_feat_score[i][0])
    return np.array(res)

In [195]:
number = get_top_20(3)
print (test_desc[3])
print (train_desc[8439])


8439
young man hold white guitar hero guitar stand front tv boy play guitar next pot plant monitor child us pretend guitar play video game kid video game guitar larg tv wall behind video game boy hold guitar control guitar hero
man guitar front microphon guy play guitar stage center man play guitar stage man wear white shirt black tie play guitar young man sing play guitar


In [191]:
from PIL import Image
import matplotlib.pyplot as plt


def display_sample_pred():
    for index in range(1):
        top = get_top_20(index)
        print(index, top)
        for image in top:
            img = Image.open('data/images_test/' + image)
            img = np.asarray(img)
            plt.imshow(img)
            plt.show()

# display_sample_pred()

In [None]:
import csv

def write_submissions(output_filename):
    output_file = open(output_filename, "w")
    writer = csv.writer(output_file)
    #write headers
    writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    # get top 20 images for each test description
    for index, value in test_desc_features:
        images = " ".join(get_top_20(index))
        writer.writerow([index + ".txt", images])

write_submissions('sample_submission.csv')