## 1.0 Predict the Countries from Capitals

In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utils import get_vectors

In [3]:
data = pd.read_csv('capitals.txt', delimiter=' ')
data.columns = ['city1', 'country1', 'city2', 'country2']
data.head(5)

Unnamed: 0,city1,country1,city2,country2
0,Athens,Greece,Bangkok,Thailand
1,Athens,Greece,Beijing,China
2,Athens,Greece,Berlin,Germany
3,Athens,Greece,Bern,Switzerland
4,Athens,Greece,Cairo,Egypt


In [4]:
word_embeddings = pickle.load(open("word_embeddings_subset.p", "rb"))
len(word_embeddings)

243

In [5]:
print("dimension: {}".format(word_embeddings['Spain']))

dimension: [ 3.14941406e-02 -1.82617188e-01  1.63085938e-01  1.24511719e-01
 -1.85546875e-02 -1.73828125e-01 -1.21093750e-01 -1.32812500e-01
 -1.25000000e-01  2.45117188e-01 -2.18750000e-01  4.29687500e-02
  1.66015625e-01 -1.14746094e-01 -7.37304688e-02  2.81982422e-02
  7.71484375e-02  1.39648438e-01  1.06445312e-01  3.98437500e-01
  3.18359375e-01 -3.34472656e-02  3.80859375e-02  2.15820312e-01
 -3.93676758e-03  4.95605469e-02 -1.81640625e-01 -1.77734375e-01
 -2.17773438e-01  2.28515625e-01  1.47460938e-01  2.80761719e-02
  4.08935547e-03 -9.32617188e-02 -1.15722656e-01 -1.01318359e-02
 -3.69140625e-01 -1.53320312e-01  6.07910156e-02  1.50146484e-02
  1.42578125e-01  4.63867188e-03  7.12890625e-02  2.69531250e-01
  1.96289062e-01  6.73828125e-02  1.64794922e-03 -7.61718750e-02
 -8.15429688e-02  2.51953125e-01 -1.38671875e-01  2.61718750e-01
  1.13525391e-02 -3.22265625e-02 -1.71875000e-01  1.67968750e-01
 -3.35937500e-01  8.10546875e-02 -1.99218750e-01  1.17187500e-01
 -2.94189453e-

### 1.2 Cosine Similarity


In [7]:
def cosine_similarity(A, B):
    dot = np.dot(A,B)
    norma = np.sqrt(np.dot(A,A))
    normb = np.sqrt(np.dot(B,B))
    cos = dot / (norma*normb) 
    return cos

In [11]:
king = word_embeddings['king']
queen = word_embeddings['queen']

cosine_similarity(king, queen)

0.6510956

### 1.3 Euclidean distance

In [9]:
def euclidean(A, B):
    d = np.linalg.norm(A-B)
    return d


In [10]:
euclidean(king, queen)

2.4796925

### 1.4 Finding the country of each capital


In [13]:
def get_country(city1, country1, city2, embeddings):
    group = set((city1, country1, city2))
    city1_emb = word_embeddings[city1]

    country1_emb = word_embeddings[country1]
    city2_emb = word_embeddings[city2]
    vec = country1_emb - city1_emb + city2_emb 
    similarity = -1
    country = ''
    for word in embeddings.keys():
        if word not in group:
            word_emb = word_embeddings[word]
            cur_similarity = cosine_similarity(vec,word_emb)

            if cur_similarity > similarity:
                similarity = cur_similarity
                country = (word, similarity)

    return country

In [17]:
# Testing your function, note to make it more robust you can return the 5 most similar words.
get_country('Athens', 'Greece', 'Tehran', word_embeddings)

('Iran', 0.8067936)

### 1.5 Model Accuracy

In [18]:
def get_accuracy(word_embeddings, data):
    num_correct = 0

    for i, row in data.iterrows():

        city1 = row['city1']
        country1 = row['country1']
        city2 =  row['city2']
        country2 = row['country2']
        predicted_country2, _ = get_country(city1,country1,city2,word_embeddings)
        if predicted_country2 == country2:
            num_correct += 1
    m = len(data)
    accuracy = num_correct/m
    return accuracy


In [19]:
accuracy = get_accuracy(word_embeddings, data)
print(f"Accuracy is {accuracy:.2f}")

Accuracy is 0.92
