#### Load word2vec predictions

In [1]:
import word2vec_wrapped

2018-05-24 20:13:34,335 : INFO : loading projection weights from data/GoogleNews-vectors-negative300.bin
2018-05-24 20:13:37,321 : INFO : loaded (200000, 300) matrix from data/GoogleNews-vectors-negative300.bin


In [2]:
# Predictions produced by the project-local word2vec_wrapped module.
# Later cells index the result by image file name (e.g. "000000154087.jpg").
# NOTE(review): presumably a linear SVC trained on word2vec features - confirm in word2vec_wrapped.
word2vec_predictions_series = word2vec_wrapped.linear_svc_predictions()

In [3]:
# Spot-check: print the word2vec-based prediction stored for a single image file.
print(word2vec_predictions_series["000000154087.jpg"])

skis


#### Load doc2vec predictions

In [4]:
import doc2vec_wrapped

In [5]:
# Predictions produced by the project-local doc2vec_wrapped module.
# Later cells index the result by image file name.
# NOTE(review): presumably a linear SVC trained on doc2vec features - confirm in doc2vec_wrapped.
doc2vec_predictions_series = doc2vec_wrapped.linear_svc_predictions()

2018-05-24 20:13:39,129 : INFO : collecting all words and their counts
2018-05-24 20:13:39,131 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-05-24 20:13:39,149 : INFO : collected 3702 word types and 10 unique tags from a corpus of 1503 examples and 65463 words
2018-05-24 20:13:39,150 : INFO : Loading a fresh vocabulary
2018-05-24 20:13:39,155 : INFO : min_count=5 retains 1058 unique words (28% of original 3702, drops 2644)
2018-05-24 20:13:39,156 : INFO : min_count=5 leaves 61217 word corpus (93% of original 65463, drops 4246)
2018-05-24 20:13:39,160 : INFO : deleting the raw counts dictionary of 3702 items
2018-05-24 20:13:39,163 : INFO : sample=0.001 downsamples 71 most-common words
2018-05-24 20:13:39,164 : INFO : downsampling leaves estimated 39737 word corpus (64.9% of prior 61217)
2018-05-24 20:13:39,168 : INFO : estimated required memory for 1058 words and 400 dimensions: 3932600 bytes
2018-05-24 20:13:39,168 : INFO : resetting layer weight

In [6]:
# Spot-check: print the doc2vec-based prediction stored for the same image file.
print(doc2vec_predictions_series["000000154087.jpg"])

skis


In [7]:
# Second set of doc2vec-based predictions, from the module's logistic-regression entry point.
# NOTE(review): the captured log shows the vocabulary being collected again for this call,
# identical to the previous doc2vec call - consider caching inside doc2vec_wrapped.
doc2vec_predictions_lr_series = doc2vec_wrapped.logistic_regression_predictions() # logistic regression

2018-05-24 20:13:42,987 : INFO : collecting all words and their counts
2018-05-24 20:13:42,988 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-05-24 20:13:43,010 : INFO : collected 3702 word types and 10 unique tags from a corpus of 1503 examples and 65463 words
2018-05-24 20:13:43,011 : INFO : Loading a fresh vocabulary
2018-05-24 20:13:43,015 : INFO : min_count=5 retains 1058 unique words (28% of original 3702, drops 2644)
2018-05-24 20:13:43,017 : INFO : min_count=5 leaves 61217 word corpus (93% of original 65463, drops 4246)
2018-05-24 20:13:43,023 : INFO : deleting the raw counts dictionary of 3702 items
2018-05-24 20:13:43,024 : INFO : sample=0.001 downsamples 71 most-common words
2018-05-24 20:13:43,024 : INFO : downsampling leaves estimated 39737 word corpus (64.9% of prior 61217)
2018-05-24 20:13:43,029 : INFO : estimated required memory for 1058 words and 400 dimensions: 3932600 bytes
2018-05-24 20:13:43,030 : INFO : resetting layer weight

In [8]:
# Spot-check: print the logistic-regression prediction stored for the same image file.
print(doc2vec_predictions_lr_series["000000154087.jpg"])

skis


### Load data and assign predictions

In [9]:
import pandas as pd

In [10]:
# Load the COCO subset (JSON Lines: one record per line) and keep only the
# image file name and its labelled category.
df = pd.read_json("data/COCO/coco-easier.txt", lines=True)
df = df[['file_name', 'category']]
# Shuffle the rows. A fixed random_state keeps the row order - and therefore
# every preview of this frame - identical across Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df.head()

Unnamed: 0,file_name,category
0,000000171740.jpg,clock
1,000000498857.jpg,giraffe
2,000000443303.jpg,cat
3,000000167898.jpg,toilet
4,000000499313.jpg,pizza


In [11]:
# Attach the word2vec-based prediction for every image, looked up by file name.
# A missing file name still raises KeyError, exactly like the row-wise lookup.
df["word2vec"] = [
    word2vec_predictions_series[file_name] for file_name in df["file_name"]
]
df.head()

Unnamed: 0,file_name,category,word2vec
0,000000171740.jpg,clock,clock
1,000000498857.jpg,giraffe,giraffe
2,000000443303.jpg,cat,cat
3,000000167898.jpg,toilet,toilet
4,000000499313.jpg,pizza,pizza


In [12]:
# Attach the doc2vec-based prediction for every image, looked up by file name.
# A missing file name still raises KeyError, exactly like the row-wise lookup.
df["doc2vec"] = [
    doc2vec_predictions_series[file_name] for file_name in df["file_name"]
]
df.head()

Unnamed: 0,file_name,category,word2vec,doc2vec
0,000000171740.jpg,clock,clock,clock
1,000000498857.jpg,giraffe,giraffe,giraffe
2,000000443303.jpg,cat,cat,cat
3,000000167898.jpg,toilet,toilet,toilet
4,000000499313.jpg,pizza,pizza,pizza


In [13]:
# Attach the doc2vec logistic-regression prediction for every image, looked up by file name.
# A missing file name still raises KeyError, exactly like the row-wise lookup.
df["doc2vec_lr"] = [
    doc2vec_predictions_lr_series[file_name] for file_name in df["file_name"]
]
df.head()

Unnamed: 0,file_name,category,word2vec,doc2vec,doc2vec_lr
0,000000171740.jpg,clock,clock,clock,clock
1,000000498857.jpg,giraffe,giraffe,giraffe,giraffe
2,000000443303.jpg,cat,cat,cat,cat
3,000000167898.jpg,toilet,toilet,toilet,toilet
4,000000499313.jpg,pizza,pizza,pizza,pizza


In [14]:
# Keep the rows where at least one pair of classifiers disagrees.
collision_df = df.loc[
    (df.word2vec != df.doc2vec)
    | (df.word2vec != df.doc2vec_lr)
    | (df.doc2vec != df.doc2vec_lr)
]
# Shuffle with a fixed seed so the five-row preview is reproducible on re-run.
collision_df = collision_df.sample(frac=1, random_state=42).reset_index(drop=True)
collision_df.head()

Unnamed: 0,file_name,category,word2vec,doc2vec,doc2vec_lr
0,000000558073.jpg,cat,cat,cat,toilet
1,000000508917.jpg,clock,traffic light,clock,clock
2,000000169169.jpg,traffic light,traffic light,clock,clock
3,000000078565.jpg,dog,surfboard,dog,dog
4,000000535253.jpg,pizza,dog,pizza,pizza


In [15]:
def vote_prediction(row):
    """Return the majority label among the three classifier predictions.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys "word2vec", "doc2vec" and "doc2vec_lr".

    Returns
    -------
    The label predicted by at least two of the three classifiers. When all
    three disagree, the first prediction in list order ("word2vec") is
    returned, so the result is the same on every run.
    """
    predictions = [row["word2vec"], row["doc2vec"], row["doc2vec_lr"]]
    # Iterate the list rather than a set: max() returns the first maximal
    # item it sees, so a three-way tie resolves deterministically instead of
    # depending on set iteration order (which varies with hash randomization).
    return max(predictions, key=predictions.count)

In [16]:
# Combine the three per-classifier columns into one voted label per row.
df["voted"] = df.apply(vote_prediction, axis=1)
df.head()

Unnamed: 0,file_name,category,word2vec,doc2vec,doc2vec_lr,voted
0,000000171740.jpg,clock,clock,clock,clock,clock
1,000000498857.jpg,giraffe,giraffe,giraffe,giraffe,giraffe
2,000000443303.jpg,cat,cat,cat,cat,cat
3,000000167898.jpg,toilet,toilet,toilet,toilet,toilet
4,000000499313.jpg,pizza,pizza,pizza,pizza,pizza


In [17]:
# Recompute the disagreement rows from df, which by now also carries the
# "voted" column, so the vote can be inspected next to the raw predictions.
collision_df = df.loc[
    (df.word2vec != df.doc2vec)
    | (df.word2vec != df.doc2vec_lr)
    | (df.doc2vec != df.doc2vec_lr)
]
# Shuffle with a fixed seed so the five-row preview is reproducible on re-run.
collision_df = collision_df.sample(frac=1, random_state=42).reset_index(drop=True)
collision_df.head()

Unnamed: 0,file_name,category,word2vec,doc2vec,doc2vec_lr,voted
0,000000279145.jpg,cat,clock,cat,cat,cat
1,000000534270.jpg,dog,traffic light,dog,dog,dog
2,000000458255.jpg,cat,cat,dog,dog,dog
3,000000289393.jpg,giraffe,dog,giraffe,giraffe,giraffe
4,000000146825.jpg,clock,traffic light,clock,clock,clock


## Evaluation and testing

In [18]:
from sklearn.metrics import accuracy_score

In [19]:
# Fraction of rows whose voted label equals the labelled category.
score = accuracy_score(y_true=df["category"], y_pred=df["voted"])
print("Accuracy: %0.2f" % score)

Accuracy: 0.97


#### Misclassified

In [20]:
# Rows where the voted label differs from the labelled category.
k1 = df.loc[df.category != df.voted]
# Shuffle with a fixed seed so the listing order is reproducible on re-run.
k1 = k1.sample(frac=1, random_state=42).reset_index(drop=True)
k1

Unnamed: 0,file_name,category,word2vec,doc2vec,doc2vec_lr,voted
0,000000327592.jpg,clock,pizza,pizza,pizza,pizza
1,000000350148.jpg,clock,pizza,pizza,pizza,pizza
2,000000169169.jpg,traffic light,traffic light,clock,clock,clock
3,000000366884.jpg,dog,clock,cat,cat,cat
4,000000270883.jpg,clock,toilet,dog,dog,dog
5,000000213255.jpg,traffic light,dog,dog,dog,dog
6,000000076547.jpg,traffic light,clock,clock,clock,clock
7,000000371749.jpg,dog,clock,clock,clock,clock
8,000000289343.jpg,dog,traffic light,traffic light,traffic light,traffic light
9,000000491464.jpg,traffic light,tennis racket,tennis racket,tennis racket,tennis racket
