In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.figure
from matplotlib import mlab
import os
import json
import sys 
import nltk
import collections
from nltk.collocations import *


In [2]:
# Relative paths of the two JSON log files that are loaded with
# getJsonResults further down (one for the Vine set, one for the selfie set).
# NOTE(review): presumably DenseCap caption output -- confirm with the
# producer of ../Logs.
vineDenseCap = "../Logs/denseCap_vine.json"
SelfieDenseCap = "../Logs/denseCap_selfie.json"

In [9]:
def getJsonResults(filename):
    """Load a JSON file and return the value stored under its "results" key.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever the top-level "results" key holds. Callers in this notebook
        take len() of it and iterate over it.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    ValueError
        If the file content is not valid JSON.
    KeyError
        If the decoded document has no "results" key.
    """
    # The context manager guarantees the handle is closed even when
    # json.load raises on malformed input.
    with open(filename, 'r') as f:
        data = json.load(f)
    return data["results"]

def getCaptions(records):
    """Flatten the per-record caption lists into a single list.

    Parameters
    ----------
    records : iterable of dict
        Each record must have a 'captions' entry that can be iterated.

    Returns
    -------
    list
        Every caption of every record, in record order and then in the
        order they appear inside each record's 'captions' entry.
    """
    return [caption for record in records for caption in record['captions']]

def tokenizeList(captions):
    """Tokenize every caption and return all tokens as one flat list.

    Parameters
    ----------
    captions : iterable of str
        Sentences to split with NLTK's TreebankWordTokenizer.

    Returns
    -------
    list
        The tokens of all captions concatenated in input order. Sentence
        boundaries are not preserved.
    """
    treebank = nltk.tokenize.TreebankWordTokenizer()
    flat = []
    for caption in captions:
        flat.extend(treebank.tokenize(caption))
    return flat

def getNounsAdj(tokens):
    """Split a token list into nouns and adjectives using nltk.pos_tag.

    Only tokens whose tag is exactly 'NN' or exactly 'JJ' are kept; any
    other tag string (for example 'NNS') does not match and is dropped.

    Parameters
    ----------
    tokens : list of str
        Tokens to tag.

    Returns
    -------
    tuple of (list, list, list)
        ``(nouns, adjectives, combined)`` where ``combined`` holds the kept
        nouns and adjectives interleaved in their original token order.
    """
    nouns, adjectives, combined = [], [], []
    for word, pos in nltk.pos_tag(tokens):
        if pos == 'NN':
            nouns.append(word)
        elif pos == 'JJ':
            adjectives.append(word)
        else:
            # Neither a singular noun nor an adjective: not collected.
            continue
        combined.append(word)
    return nouns, adjectives, combined

def getANP(sentences):
    """Build one "<adjective>_<noun>" string per sentence.

    Each sentence is tokenized with TreebankWordTokenizer and tagged with
    nltk.pos_tag. The first token tagged exactly 'JJ' and the first token
    tagged exactly 'NN' are joined with an underscore; they need not be
    adjacent, and the adjective need not precede the noun in the sentence.
    A missing part is left empty, so results such as "_man", "white_" or
    "_" are possible.

    Parameters
    ----------
    sentences : iterable of str
        Captions to process.

    Returns
    -------
    list of str
        One joined string per input sentence, in input order.
    """
    treebank = nltk.tokenize.TreebankWordTokenizer()
    pairs = []
    for sentence in sentences:
        # Remember only the first word seen for each tag.
        first_by_tag = {}
        for word, pos in nltk.pos_tag(treebank.tokenize(sentence)):
            first_by_tag.setdefault(pos, word)
        pairs.append(first_by_tag.get('JJ', '') + "_" + first_by_tag.get('NN', ''))
    return pairs


In [4]:
# Load the "results" entry of each JSON log (see getJsonResults).
vineResults = getJsonResults(vineDenseCap)

selfieResults = getJsonResults(SelfieDenseCap)

In [5]:
# Sanity check: number of result records loaded from each log.
# Single-argument print(...) prints the same text on Python 2 and also runs
# on Python 3, where the bare print statement is a SyntaxError.
print(len(vineResults))
print(len(selfieResults))

1000
1000


In [6]:

# Flatten the captions of every Vine record into one list and report how many
# captions there are in total.
vineCaptions = getCaptions(vineResults)
# Single-argument print(...) works on both Python 2 and Python 3.
print(len(vineCaptions))

69726


In [7]:
# Flatten the captions of every selfie record into one list and report how
# many captions there are in total.
selfieCaptions = getCaptions(selfieResults)
# Single-argument print(...) works on both Python 2 and Python 3.
print(len(selfieCaptions))

68622


In [11]:
# One "<adjective>_<noun>" string per Vine caption (first 'JJ' token and first
# 'NN' token of the caption; either side is empty when no such token exists).
ANP_Vine = getANP(vineCaptions)

In [12]:
# One "<adjective>_<noun>" string per selfie caption (first 'JJ' token and
# first 'NN' token of the caption; either side is empty when no such token
# exists).
ANP_Selfie = getANP(selfieCaptions)

In [15]:
# Count how often each "<adjective>_<noun>" string occurs in each dataset.
Vine_AdjNoun_counts = collections.Counter(ANP_Vine)
Selfie_AdjNoun_counts = collections.Counter(ANP_Selfie)

In [16]:
# Show the adjective_noun frequency table for the selfie captions.
# Single-argument print(...) works on both Python 2 and Python 3.
print(Selfie_AdjNoun_counts)

Counter({u'white_wall': 3419, u'_man': 3338, u'brown_hair': 3215, u'white_sky': 2970, u'_woman': 2537, u'white_shirt': 2192, u'_eye': 2102, u'_hair': 1798, u'white_blue': 1705, u'_nose': 1640, u'black_shirt': 1518, u'_ear': 1471, u'black_hair': 1280, u'_hand': 922, u'black_cat': 829, u'_part': 777, u'_arm': 733, u'_mouth': 704, u'blue_shirt': 628, u'black_man': 542, u'white_line': 519, u'red_shirt': 488, u'black_woman': 437, u'_window': 435, u'white_woman': 409, u'_head': 389, u'_wall': 389, u'white_pillow': 385, u'_person': 380, u'_shadow': 355, u'white_door': 352, u'white_paper': 331, u'long_woman': 320, u'white_cat': 318, u'white_towel': 309, u'black_background': 302, u'white_man': 299, u'white_curtain': 297, u'white_part': 295, u'white_tile': 295, u'white_hair': 290, u'black_wall': 274, u'white_plate': 270, u'short_man': 267, u'_picture': 261, u'white_pink': 261, u'_word': 260, u'white_sign': 257, u'white_brown': 252, u'black_photo': 246, u'brown_woman': 237, u'brown_wooden': 234, 

In [17]:
# Show the adjective_noun frequency table for the Vine captions.
# Single-argument print(...) works on both Python 2 and Python 3.
print(Vine_AdjNoun_counts)

Counter({u'_man': 5517, u'white_sky': 4698, u'white_wall': 3730, u'_woman': 2041, u'black_man': 1691, u'white_shirt': 1587, u'black_shirt': 1584, u'_word': 1432, u'brown_hair': 1278, u'_nose': 1184, u'_hand': 1055, u'_hair': 1042, u'white_blue': 891, u'_ear': 854, u'_eye': 755, u'short_man': 755, u'_person': 741, u'_wall': 714, u'black_background': 688, u'_window': 686, u'white_man': 661, u'black_hair': 607, u'_mouth': 553, u'black_woman': 539, u'_light': 534, u'white_sign': 497, u'white_door': 476, u'_head': 454, u'red_shirt': 425, u'white_woman': 423, u'_arm': 410, u'_picture': 401, u'blue_shirt': 389, u'_part': 381, u'white_line': 347, u'_tree': 339, u'clear_sky': 338, u'black_cat': 336, u'black_wall': 334, u'black_photo': 332, u'black_bag': 307, u'red_sign': 306, u'_shadow': 296, u'white_window': 291, u'black_person': 287, u'white_paper': 277, u'white_towel': 275, u'white_table': 274, u'white_curtain': 270, u'white_letter': 265, u'white_plate': 257, u'red_man': 247, u'black_sign': 

In [None]:
# Tokenize all captions of each dataset. Despite the "_sentences" suffix,
# tokenizeList returns ONE flat list of tokens per dataset, not a list of
# tokenized sentences, so caption boundaries are lost here.
tokenized_vine_sentences = tokenizeList(vineCaptions)
tokenized_selfie_sentences = tokenizeList(selfieCaptions)

In [None]:
# Split each dataset's flat token list into nouns ('NN'), adjectives ('JJ')
# and the combined noun/adjective sequence in original token order.
Nouns_vine, Adjective_vine, Adj_noun_vine = getNounsAdj(tokenized_vine_sentences)
Nouns_selfie, Adjective_selfie, Adj_noun_selfie = getNounsAdj(tokenized_selfie_sentences)
# Single-argument print(...) works on both Python 2 and Python 3.
print(len(Nouns_vine))
print(len(Adjective_vine))

In [None]:
# Peek at the first ten adjective_noun strings of the Vine captions.
# The list is named ANP_Vine (capital V) where it is created; the lower-case
# spelling raised NameError.
print(ANP_Vine[:10])

In [None]:
# Ten highest-PMI pairs among the Vine nouns/adjectives.
# BigramCollocationFinder is in scope through `from nltk.collocations import *`.
# NOTE(review): Adj_noun_vine comes from one flat token stream with every
# non-'NN'/'JJ' token removed, so neighbouring entries may come from different
# captions or may not have been adjacent in the original text -- confirm this
# is the intended notion of "collocation".
# NOTE(review): no frequency filter is applied before ranking by PMI; confirm
# that pairs seen only once should be eligible.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(Adj_noun_vine, window_size = 2)
finder.nbest(bigram_measures.pmi, 10)

In [None]:
# Ten highest-PMI pairs among the selfie nouns/adjectives.
# BigramCollocationFinder is in scope through `from nltk.collocations import *`.
# This cell rebinds `bigram_measures` and `finder`, replacing the values set
# by the Vine collocation cell.
# NOTE(review): Adj_noun_selfie comes from one flat token stream with every
# non-'NN'/'JJ' token removed, so neighbouring entries may come from different
# captions or may not have been adjacent in the original text -- confirm this
# is the intended notion of "collocation".
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(Adj_noun_selfie, window_size = 2)
finder.nbest(bigram_measures.pmi, 10)

In [None]:
# Selfie_Adj_counts was printed without ever being assigned (NameError).
# NOTE(review): by analogy with Selfie_AdjNoun_counts it is taken to be the
# frequency table of the selfie adjectives returned by getNounsAdj -- confirm
# that this is the intended quantity.
Selfie_Adj_counts = collections.Counter(Adjective_selfie)
# Show only the most frequent entries instead of dumping the whole Counter.
print(Selfie_Adj_counts.most_common(20))