# Experiment - 6 Word2Vec 

Importing Libraries

In [2]:
import pandas as pd
import re

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from gensim.models import Word2Vec

Importing Data

In [3]:
df = pd.read_csv('elon_musk_tweets.csv')

tweets = df['text']

tweets

0       @BillyM2k I find the gold toe sock – inevitabl...
1                      Sock Con, the conference for socks
2       Always something new for the magazine cover an...
3                        @ExplainThisBob This guy gets it
4       Sock tech is so advanced that you can get pret...
                              ...                        
3536    Check out my spicy 🌶️ 🔥 OnlyGANs!! https://t.c...
3537                @derek_j_morris @pmarca Major problem
3538                                        @EvaFoxU Nice
3539    @DavidSacks A Russia-China alliance is inevita...
3540                                 @PeterDiamandis Yeah
Name: text, Length: 3541, dtype: object

Preprocessing with Porter Stemmer and WordNetLemmatizer

In [4]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

tweetList = []

stopWords = set(stopwords.words('english'))

for tweet in tweets:
    # Convert the tweet to lowercase using REGEX
    for f in re.findall("([A-Z]+)", tweet):
        tweet = tweet.replace(f, f.lower())

    # Removing special characters and replacing them with a space
    tweet = re.sub("[^A-Za-z0-9]", " ", tweet, 0, re.IGNORECASE)
    
    # From a single sentence, store all the words 
    wordsInTweet = nltk.word_tokenize(tweet)
    
    # Filter out all the stop words
    wordsInTweet = [word for word in wordsInTweet if word not in stopWords]
    
    # Stem each of the tweets
    wordsInTweet = [stemmer.stem(word) for word in wordsInTweet if word not in stopWords]
    
    # Lemmatize each of the tweets
    wordsInTweet = [lemmatizer.lemmatize(word) for word in wordsInTweet]
    
    # Append them to a list
    tweetList.append(wordsInTweet)

Final Tokenized Tweets

In [11]:
tweetList

[['billym2k',
  'find',
  'gold',
  'toe',
  'sock',
  'inevit',
  'kilter',
  'amp',
  'wash',
  'littl',
  'troubl',
  'esthet',
  'amp',
  'arguabl',
  'bit',
  'corpo'],
 ['sock', 'con', 'confer', 'sock'],
 ['alway', 'someth', 'new', 'magazin', 'cover', 'articl', 'practic', 'write'],
 ['explainthisbob', 'guy', 'get'],
 ['sock',
  'tech',
  'advanc',
  'get',
  'pretti',
  'much',
  'anyth',
  'sock',
  'form',
  'day'],
 ['must', 'confess', 'penchant', 'creativ', 'sock'],
 ['slashdot', 'time'],
 ['tonyadevitti',
  'historydefin',
  'success',
  'fact',
  'due',
  'part',
  'super',
  'fun',
  'parti',
  'spoke',
  'wrote',
  'incred',
  'well'],
 ['historydefin',
  'bleak',
  'post',
  'mayb',
  'gener',
  'click',
  'happier',
  'moment',
  'histori',
  'would',
  'nice'],
 ['mishaboar', 'boringcompani', 'support', 'doge', 'wherev', 'possibl'],
 ['moon', 'brought', 'u', 'togeth', '69', 'mar', 'futur'],
 ['without', 'common', 'goal', 'human', 'fight'],
 ['ppathol', 'exactli'],
 ['a

### Implementing Word2Vec Model by Gensim

In [7]:
model = Word2Vec(sentences=tweetList, min_count=1, vector_size=100, workers=4, window=4,sg=0)

In [8]:
similarWords = model.wv.most_similar('wonder', topn=5)  # get other similar words
similarWords

[('data', 0.5848771929740906),
 ('lol', 0.5823049545288086),
 ('ad', 0.5750864148139954),
 ('real', 0.5685648322105408),
 ('time', 0.5676756501197815)]

References:- Gensim Word2Vec Implementation by Medium