-----

# Word2Vec in 6 lines of Python
    
    1  import requests
    2  import gensim
    3  url = 'http://www.gutenberg.org/cache/epub/1041/pg1041.txt'
    4  text = requests.get(url).text
    5  tokens = gensim.utils.simple_preprocess(t)
    6  model = gensim.models.Word2Vec([tokens], min_count=3, size=100)
    
-----

## Breakdown of this code

In [122]:
import requests

In [123]:
def fetch(url):
    '''input url, output text from page'''
    r = requests.get(url).text
    return r

In [128]:
# fetch Shakespeare's sonnets
text = fetch('http://www.gutenberg.org/cache/epub/1041/pg1041.txt')

In [129]:
text[1000:3001]

u"own bright eyes,\r\n  Feed'st thy light's flame with self-substantial fuel,\r\n  Making a famine where abundance lies,\r\n  Thy self thy foe, to thy sweet self too cruel:\r\n  Thou that art now the world's fresh ornament,\r\n  And only herald to the gaudy spring,\r\n  Within thine own bud buriest thy content,\r\n  And tender churl mak'st waste in niggarding:\r\n    Pity the world, or else this glutton be,\r\n    To eat the world's due, by the grave and thee.\r\n\r\n  II\r\n\r\n  When forty winters shall besiege thy brow,\r\n  And dig deep trenches in thy beauty's field,\r\n  Thy youth's proud livery so gazed on now,\r\n  Will be a tatter'd weed of small worth held:\r\n  Then being asked, where all thy beauty lies,\r\n  Where all the treasure of thy lusty days;\r\n  To say, within thine own deep sunken eyes,\r\n  Were an all-eating shame, and thriftless praise.\r\n  How much more praise deserv'd thy beauty's use,\r\n  If thou couldst answer 'This fair child of mine\r\n  Shall sum my c

In [132]:
import gensim
tokenized = gensim.utils.simple_preprocess(text)

In [136]:
len(tokenized)

20247

In [135]:
tokenized[1000:1021]

[u'in',
 u'single',
 u'life',
 u'ah',
 u'if',
 u'thou',
 u'issueless',
 u'shalt',
 u'hap',
 u'to',
 u'die',
 u'the',
 u'world',
 u'will',
 u'wail',
 u'thee',
 u'like',
 u'makeless',
 u'wife',
 u'the',
 u'world']

## Explore the corpus, find most frequently occuring tokens

In [261]:
# find the frequency of each word in list
def freq(tokens):
    '''input list tokens, output frequency of each token'''
    word_freq = [tokens.count(p) for p in tokens]
    return zip(tokens, word_freq)

In [262]:
freq_list = freq(tokenized)

In [263]:
len(freq_list)

20247

In [268]:
freq_list[100:121]

[(u'by', 118),
 (u'william', 4),
 (u'shakespeare', 9),
 (u'from', 97),
 (u'fairest', 5),
 (u'creatures', 2),
 (u'we', 24),
 (u'desire', 11),
 (u'increase', 4),
 (u'that', 338),
 (u'thereby', 2),
 (u'beauty', 70),
 (u'rose', 6),
 (u'might', 26),
 (u'never', 16),
 (u'die', 12),
 (u'but', 168),
 (u'as', 132),
 (u'the', 613),
 (u'riper', 2),
 (u'should', 46)]

In [269]:
# the above method is okay, but its not sorted.

# A more direct and easy route that 
# sorts by most frequently occuring in corpus,
# uses the Counter method from the collections library.
# And its much less code!

from collections import Counter
c = Counter(tokenized)
c.most_common(20)

[(u'the', 613),
 (u'and', 560),
 (u'to', 495),
 (u'of', 488),
 (u'in', 380),
 (u'my', 372),
 (u'that', 338),
 (u'thy', 281),
 (u'thou', 235),
 (u'with', 228),
 (u'for', 198),
 (u'love', 195),
 (u'is', 194),
 (u'not', 188),
 (u'you', 183),
 (u'but', 168),
 (u'me', 164),
 (u'thee', 162),
 (u'be', 160),
 (u'or', 157)]

## Build model & start testing/analyzing

In [233]:
model = gensim.models.Word2Vec([tokenized], min_count=3, size=100)

In [234]:
model.wv.similarity('man', 'woman')

0.99791601478479208

In [235]:
model.wv.similarity('woman', 'woman')

1.0000000000000002

In [236]:
model['woman']

array([ 0.09490858,  0.09576362, -0.03184532,  0.11927754, -0.0824075 ,
        0.01422762, -0.04995862, -0.0985888 , -0.01279412,  0.09472705,
        0.0349883 , -0.10929005, -0.04712148,  0.03875189, -0.01393333,
        0.12059111,  0.05010965, -0.06369404,  0.01640897, -0.02410512,
       -0.1487816 , -0.06996666,  0.07345931,  0.16995013, -0.01546913,
        0.02747393, -0.06288467,  0.11962786, -0.06722205, -0.09957583,
       -0.01633924,  0.16259933,  0.0771884 , -0.00798618,  0.02566331,
        0.05760846,  0.00768478,  0.01273172, -0.00411591, -0.08337752,
       -0.03925778,  0.05824149, -0.08774456, -0.04724528,  0.0063543 ,
       -0.0654636 ,  0.03776062,  0.01148849, -0.04262467,  0.04399591,
        0.00581395, -0.00190668, -0.03522281, -0.03243601, -0.03173142,
        0.00098932, -0.00621357, -0.03562969, -0.15038134,  0.03011033,
       -0.07072259,  0.04718942, -0.02941588, -0.03766086, -0.0211026 ,
        0.00745422, -0.10056863,  0.06429652,  0.09747244,  0.00

In [237]:
vec1 = model['woman']

In [238]:
model.wv.similar_by_vector(vec1, topn=15)

[(u'woman', 1.0),
 (u'she', 0.9991060495376587),
 (u'of', 0.9990603923797607),
 (u'might', 0.9990493059158325),
 (u'it', 0.9990442395210266),
 (u'hate', 0.999043881893158),
 (u'still', 0.9990429282188416),
 (u'upon', 0.9990413188934326),
 (u'when', 0.999036967754364),
 (u'should', 0.9990242719650269),
 (u'fair', 0.9990196824073792),
 (u'you', 0.9990196228027344),
 (u'what', 0.9990196228027344),
 (u'yet', 0.999017059803009),
 (u'her', 0.9990150332450867)]

In [239]:
v = model['men']
model.wv.similar_by_vector(v, topn=15)

[(u'men', 0.9999999403953552),
 (u'thou', 0.999750554561615),
 (u'their', 0.9997481107711792),
 (u'and', 0.999745786190033),
 (u'of', 0.9997451305389404),
 (u'with', 0.9997432827949524),
 (u'as', 0.9997431039810181),
 (u'my', 0.9997419118881226),
 (u'the', 0.9997345209121704),
 (u'his', 0.9997340440750122),
 (u'heart', 0.9997338652610779),
 (u'me', 0.9997330904006958),
 (u'to', 0.9997327327728271),
 (u'all', 0.9997305274009705),
 (u'be', 0.9997302293777466)]

In [240]:
v = model['women']
model.wv.similar_by_vector(v, topn=15)

[(u'women', 0.9999999403953552),
 (u'gift', 0.9904922246932983),
 (u'mistress', 0.9904735684394836),
 (u'smell', 0.9904466271400452),
 (u'youth', 0.9904308319091797),
 (u'fire', 0.9903542995452881),
 (u'cheeks', 0.9902323484420776),
 (u'must', 0.9902098178863525),
 (u'away', 0.9901643991470337),
 (u'could', 0.9901115894317627),
 (u'worse', 0.9901033639907837),
 (u'poor', 0.9900951385498047),
 (u'we', 0.9900863766670227),
 (u'then', 0.9900714755058289),
 (u'there', 0.9900569915771484)]

In [241]:
v = model['love']
model.wv.similar_by_vector(v)

[(u'love', 1.0000001192092896),
 (u'and', 0.9999521374702454),
 (u'my', 0.9999481439590454),
 (u'in', 0.9999479651451111),
 (u'of', 0.9999454021453857),
 (u'so', 0.9999450445175171),
 (u'thy', 0.9999448657035828),
 (u'thou', 0.9999443888664246),
 (u'to', 0.9999439120292664),
 (u'that', 0.9999434947967529)]