In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')

import numpy as np
import pandas as pd
from gensim.models import word2vec

from google.colab import drive
drive.mount('/content/drive')

import re # For regular expressions

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive


## (a) Load the dataset

In [2]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def load_data():
    """ Read tweets from the file.
        Return:
            list of lists (list_words), with words from each of the processed tweets
    """
    tweets = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CS420/Assignment 2/Corona_Tweets_large.csv', names=['text'])
    list_words = []
    ### iterate over all tweets from the dataset
    for i in tweets.index:
      ### remove non-letter.
      text = re.sub("[^a-zA-Z]"," ", tweets.loc[i, 'text'])
      ### tokenize
      words = nltk.word_tokenize(text)

      new_words = []
      ### iterate over all words of a tweet
      stop_words = set(stopwords.words('english'))
      ## TODO: remove the stop words and convert a word (w) to the lower case
      for w in words:
        if w not in stop_words:
          new_words.append(w.lower())

      list_words.append(new_words)
      # print(new_words)
    return list_words
# check a few samples of twitter corpus
twitter_corpus = load_data()
print(twitter_corpus[:3])

[['if', 'i', 'smelled', 'scent', 'hand', 'sanitizers', 'today', 'someone', 'past', 'i', 'would', 'think', 'intoxicated', 'https', 'co', 'qzvybrogb'], ['hey', 'yankees', 'yankeespr', 'mlb', 'made', 'sense', 'players', 'pay', 'respects', 'a', 'https', 'co', 'qvw', 'zgypu'], ['diane', 'wdunlap', 'realdonaldtrump', 'trump', 'never', 'claimed', 'covid', 'hoax', 'we', 'claim', 'effort', 'https', 'co', 'jkk', 'vhwhb']]


## (b) Word2Vec

In [3]:
# Creating the word2vec model and setting values for the various parameters

# Initializing the train model.
num_features = 125   # Word vector dimensionality
min_word_count = 10  # Minimum word count. You can change it also.
num_workers = 4     # Number of parallel threads, can be changed
context = 10        # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words, can be changed
# Initializing the train model
print("Training Word2Vec model....")
model = word2vec.Word2Vec(twitter_corpus,\
                          workers=num_workers,\
                          vector_size=num_features,\
                          min_count=min_word_count,\
                          window=context,
                          sample=downsampling)

# To make the model memory efficient
model.init_sims(replace=True)

Training Word2Vec model....


  model.init_sims(replace=True)


In [4]:
model.wv.most_similar("covid") #this word2vec trained model on tweets

[('coronavirus', 0.4285234808921814),
 ('contained', 0.40343719720840454),
 ('jakarta', 0.3946060538291931),
 ('plus', 0.38701462745666504),
 ('prompts', 0.38632798194885254),
 ('more', 0.3799235224723816),
 ('viru', 0.37390199303627014),
 ('successfully', 0.3696480691432953),
 ('ukcovid', 0.36930474638938904),
 ('hurricane', 0.367278128862381)]

In [5]:
model.wv.most_similar("grocery")

[('store', 0.925132691860199),
 ('shop', 0.9137942790985107),
 ('shopping', 0.8718840479850769),
 ('socially', 0.8441929221153259),
 ('stores', 0.8441144227981567),
 ('barber', 0.8361119627952576),
 ('comfortable', 0.8312827944755554),
 ('distanced', 0.8270167708396912),
 ('shoppers', 0.8251228928565979),
 ('restaurant', 0.8118851184844971)]

In [6]:
model.wv.most_similar("virus")

[('deadly', 0.6358702182769775),
 ('herd', 0.6308822631835938),
 ('contagious', 0.61180579662323),
 ('control', 0.5976990461349487),
 ('viruses', 0.5927829742431641),
 ('wuhan', 0.5793089270591736),
 ('controlled', 0.5730675458908081),
 ('underestimate', 0.5659732818603516),
 ('disease', 0.5549526214599609),
 ('known', 0.5524728894233704)]

In [7]:
model.wv.most_similar("corona")

[('coronainfoch', 0.6885880827903748),
 ('worldnews', 0.6820882558822632),
 ('genetically', 0.5453908443450928),
 ('bat', 0.5453872680664062),
 ('source', 0.5437778830528259),
 ('cpho', 0.5250560641288757),
 ('utc', 0.5163848996162415),
 ('spectatorindex', 0.5046815872192383),
 ('wuhan', 0.4883095920085907),
 ('coronavirusapp', 0.4879246652126312)]

In [8]:
model.wv.most_similar("pandemic")

[('crisis', 0.7297680377960205),
 ('pan', 0.5557994246482849),
 ('warming', 0.5437407493591309),
 ('wemerry', 0.5380669236183167),
 ('pandemi', 0.5360347032546997),
 ('disruption', 0.5346295237541199),
 ('fears', 0.5333794355392456),
 ('crises', 0.5305386781692505),
 ('threat', 0.5175558924674988),
 ('gracepoint', 0.5024921298027039)]

In [9]:
model.wv.most_similar("lockdown")

[('restrictions', 0.7503029108047485),
 ('lifted', 0.6747542023658752),
 ('imposed', 0.6722403168678284),
 ('easing', 0.6710984110832214),
 ('quarantine', 0.6448826193809509),
 ('melbourne', 0.6286118030548096),
 ('curfew', 0.6204643845558167),
 ('auckland', 0.6159437298774719),
 ('travel', 0.6059140563011169),
 ('road', 0.6056724190711975)]