In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp

# Split a header-less CSV into a feature matrix and a label vector.
def parse_data(fpath):
    """Read the CSV at ``fpath`` and separate labels from features.

    The file is read without a header row. Column 0 is treated as the label
    column and every remaining column as a feature.

    Args:
        fpath: Path (or anything ``pandas.read_csv`` accepts) of the CSV file.

    Returns:
        A ``(features, labels)`` tuple of numpy arrays: ``features`` holds all
        columns from index 1 onwards, ``labels`` holds column 0.
    """
    table = pd.read_csv(fpath, header=None)
    label_column = np.array(table.iloc[:, 0])
    feature_columns = np.array(table.iloc[:, 1:])
    return feature_columns, label_column

# Relative paths of the train and test feature CSV files.
train_feat_path = 'data/features_train/features_resnet1000_train.csv'
test_feat_path = 'data/features_test/features_resnet1000_test.csv'

# Load each CSV as a (features, labels) pair of numpy arrays via parse_data.
train_features, train_labels = parse_data(train_feat_path)
test_features, test_labels = parse_data(test_feat_path)

In [6]:
import glob
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

# Load and normalise every description file matching a glob pattern.
def preprocess_descriptions(fpath):
    """Clean the text of each description file and return it in index order.

    For every file matched by ``fpath`` the text is lower-cased, punctuation
    is replaced by spaces, English stop words are dropped, and each remaining
    token is lemmatized and then stemmed.

    Args:
        fpath: Glob pattern selecting the description files. Each file name
            (without directory and extension) must be an integer image index,
            e.g. ``.../12.txt``.

    Returns:
        A list of cleaned description strings sorted by ascending image
        index. Gaps in the index sequence are not padded.

    Raises:
        ValueError: If a matched file name does not start with an integer.
    """
    lmt = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # Build the stop-word set once: re-reading the corpus list for every
    # token is slow, and set membership is O(1).
    stop_words = set(stopwords.words('english'))

    indexed_descriptions = []
    for fname in glob.glob(fpath):
        # `with` guarantees the handle is closed even if reading fails.
        with open(fname, 'r') as file:
            desc = file.read().lower()

        # replace punctuation in each set of descriptions with spaces
        desc = re.sub(r'[^\w\s]', ' ', desc)

        # if not a stop word, lemmatize and stem the word
        words = [
            stemmer.stem(lmt.lemmatize(word))
            for word in desc.split()
            if word not in stop_words
        ]

        # Accept both '/' and '\\' so paths returned by glob on Windows work.
        base_name = re.split(r'[\\/]', fname)[-1]
        image_idx = int(base_name.split('.')[0])
        indexed_descriptions.append((image_idx, ' '.join(words)))

    # glob gives no ordering guarantee, and list.insert() on a partially
    # filled list does not place items at their final position, so order
    # explicitly by the numeric image index instead.
    indexed_descriptions.sort(key=lambda pair: pair[0])
    return [description for _, description in indexed_descriptions]

# Glob patterns matching every .txt description file of each split.
train_desc_fpath = 'data/descriptions_train/*.txt'
test_desc_fpath = 'data/descriptions_test/*.txt'

# Cleaned description strings for each split, as returned by preprocess_descriptions.
train_desc = preprocess_descriptions(train_desc_fpath)
test_desc = preprocess_descriptions(test_desc_fpath)

In [10]:
# Token -> occurrence count; create_word_dict() fills this dict in place.
large_word_dict = {}
# Count token occurrences over the train AND test descriptions.
def create_word_dict(descriptions=None, word_counts=None):
    """Accumulate whitespace-separated token counts into a dictionary.

    Called without arguments it behaves as before: every token of the global
    ``train_desc`` and then ``test_desc`` is counted into the global
    ``large_word_dict``. Counts are added to whatever the target dictionary
    already holds, so calling this twice on the same data doubles the counts.

    Args:
        descriptions: Optional iterable of description strings to count.
            Defaults to the global train descriptions followed by the global
            test descriptions.
        word_counts: Optional dict to update in place. Defaults to the global
            ``large_word_dict``.

    Returns:
        None. The target dictionary is modified in place.
    """
    if descriptions is None:
        # Train first, then test, so first-seen key order is unchanged.
        sources = (train_desc, test_desc)
    else:
        sources = (descriptions,)
    if word_counts is None:
        word_counts = large_word_dict

    for source in sources:
        for desc in source:
            for word in desc.split():
                word_counts[word] = word_counts.get(word, 0) + 1
    
                        
# Populate large_word_dict from the preprocessed descriptions.
create_word_dict()

{'skateboard': 1379, 'put': 120, 'show': 363, 'use': 383, 'picnic': 79, 'tabl': 3115, 'stage': 25, 'pull': 351, 'trick': 400, 'top': 2718, 'man': 7245, 'ride': 2106, 'skate': 342, 'boarder': 62, 'person': 2504, 'crowd': 389, 'watch': 565, 'bowl': 651, 'soup': 82, 'carrot': 259, 'shrimp': 14, 'noodl': 31, 'healthi': 13, 'food': 1336, 'readi': 391, 'eat': 1013, 'sit': 6637, 'next': 3755, 'chopstick': 10, 'tasti': 26, 'ramen': 3, 'serv': 220, 'someon': 294, 'enjoy': 108, 'asian': 77, 'teddi': 480, 'bear': 1108, 'cloth': 169, 'hang': 440, 'line': 463, 'outsid': 848, 'window': 828, 'stuf': 378, 'toy': 245, 'laundri': 5, 'item': 293, 'left': 90, 'air': 768, 'dri': 144, 'pin': 22, 'outdoor': 249, 'cat': 1872, 'ground': 455, 'shoe': 153, 'kitten': 99, 'play': 1717, 'lace': 4, 'pair': 325, 'blue': 1421, 'shoelac': 2, 'floor': 593, 'grey': 164, 'tabbi': 27, 'navi': 13, 'string': 39, 'gray': 195, 'tiger': 20, 'walk': 1831, 'across': 362, 'brick': 350, 'street': 3116, 'busi': 307, 'intersect': 225

In [None]:
# Dump the whole token-count table (the output can be very long).
print(large_word_dict)

In [36]:
# build bag of words for test and train descriptions
# Token -> column index of the current vocabulary; rebuilt on every call to
# build_bag_of_words().
word_dict = {}
def build_bag_of_words(path, thresh):
    """Turn descriptions into L2-normalised bag-of-words count vectors.

    The vocabulary is every token in the global ``large_word_dict`` whose
    count is strictly greater than ``thresh``. It is stored in the global
    ``word_dict`` as ``token -> column index``, with indices assigned in the
    iteration order of ``large_word_dict``.

    Args:
        path: Iterable of description strings (despite the name, these are
            the texts themselves, not file paths). Tokens are obtained with
            ``str.split()``; tokens outside the vocabulary are ignored.
        thresh: Minimum count (exclusive) a token needs in
            ``large_word_dict`` to receive a column.

    Returns:
        A 2-D float array with one row per description and one column per
        vocabulary token, each row scaled to unit L2 norm by
        ``preprocessing.normalize``. For an empty ``path`` an array with zero
        rows and ``len(word_dict)`` columns is returned.
    """
    # Rebuild the vocabulary from scratch: without clearing, a call with a
    # higher threshold would keep stale tokens whose old indices collide with
    # the newly assigned ones and inflate the vector length.
    word_dict.clear()
    for w, count in large_word_dict.items():
        if count > thresh:
            word_dict[w] = len(word_dict)

    des_vec = []
    for desc in path:
        cur = [0] * len(word_dict)
        for word in desc.split():
            column = word_dict.get(word)
            if column is not None:
                cur[column] += 1
        des_vec.append(cur)

    if not des_vec:
        # normalize() rejects an empty list; return an empty matrix instead.
        return np.zeros((0, len(word_dict)))
    return pp.normalize(des_vec, norm='l2')

# Vectorise both splits with the same threshold: only tokens whose count in
# large_word_dict is greater than 10 receive a column.
train_desc_features = build_bag_of_words(train_desc, 10)
test_desc_features = build_bag_of_words(test_desc, 10)

In [37]:
# Inspect the L2-normalised bag-of-words vector of the first train description.
print (train_desc_features[0])

[ 0.4472136  0.1118034  0.1118034 ...,  0.         0.         0.       ]
