In [None]:
import re
import numpy as np
import pandas as pd
from time import time
from collections import defaultdict
import spacy

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
import multiprocessing

import matplotlib.pyplot as plt
%matplotlib inline
 
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Load the tweet dataset from CSV and display its (rows, columns) shape.
df = pd.read_csv('../input/trump-tweets/trumptweets.csv')
df.shape

In [None]:
# Preview the first rows to see which columns the CSV provides.
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Load spaCy's small English pipeline by its full package name: the 'en'
# shortcut link only exists in spaCy 2.x and fails to load on spaCy 3.x.
# NER and the dependency parser are disabled because the cleaning step only
# reads each token's lemma and stop-word flag.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    """Return the space-joined lemmas of the non-stop-word tokens in ``doc``.

    ``doc`` is any iterable of tokens exposing ``lemma_`` and ``is_stop``.
    Documents that keep two or fewer tokens after stop-word removal are
    rejected and ``None`` is returned instead of a string.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Too short to give a useful training context.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
# Replace every run of characters other than letters and apostrophes with a
# single space, then lowercase. str(row) stringifies any non-string cell first.
# Built as a list rather than a generator: a generator is exhausted after one
# pass, so re-running the cell that consumes it would silently see no tweets.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['content']]

In [None]:
# Run the spaCy pipeline over the pre-cleaned tweets in batches and reduce each
# document to its lemma string (or None when too short).
# The former n_threads=-1 argument is omitted: it is a deprecated no-op in later
# spaCy 2.x releases and is not accepted by spaCy 3.x.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

In [None]:
# One row per cleaned tweet; entries that cleaning() rejected (None) and exact
# repeats are dropped in a single chain.
df_clean = pd.DataFrame({'clean': txt}).dropna().drop_duplicates()
df_clean.shape

In [None]:
# Tokenise every cleaned tweet on whitespace.
sent = list(map(str.split, df_clean['clean']))

# Collect bigram statistics over the corpus, ignoring words and pairs counted
# fewer than 30 times, then wrap them in a Phraser for applying to sentences.
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)

# Corpus with detected bigrams applied; used for counting and training below.
sentences = bigram[sent]

In [None]:
# Count how often each token occurs across the bigram-transformed corpus.
# The loop variables are deliberately not named `sent`: that name holds the
# tokenised corpus list, and reusing it here would leave it bound to the last
# sentence once this cell has run.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
print(len(word_freq))

In [None]:
# The five most frequent tokens, most common first.
sorted(word_freq, key=word_freq.get, reverse=True)[:5]

In [None]:
# CPU count of this machine; used below to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()
print(cores)

In [None]:
# Untrained Word2Vec model: 100-dimensional vectors, context window of 2,
# tokens seen fewer than 20 times ignored, 20 negative samples per positive,
# learning rate decaying from 0.03 to 0.0007.
# One core is left free for the rest of the system, but the worker count is
# clamped to at least 1: on a single-core machine `cores - 1` would be 0 and
# gensim would be left with no training threads.
w2v_model = Word2Vec(min_count=20, window=2, vector_size=100, sample=6e-5,
                     alpha=0.03, min_alpha=0.0007, negative=20,
                     workers=max(1, cores - 1))

In [None]:
# Build the vocabulary from the corpus, then train for 30 epochs.
w2v_model.build_vocab(sentences, progress_per=10000)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
# Scale all word vectors to unit length in place. This is the non-deprecated
# gensim 4 equivalent of init_sims(replace=True); it is destructive, so the
# model should not be trained further after this point.
w2v_model.wv.unit_normalize_all()

In [None]:
# Tokens whose vectors are most similar to 'biden'.
w2v_model.wv.most_similar(positive=["biden"])

In [None]:
# Similarity score between 'sleepy_joe' (presumably a bigram merged by the
# phrase model above) and 'biden'.
w2v_model.wv.similarity('sleepy_joe','biden')

In [None]:
# Ask the model which of the three names fits least with the other two.
w2v_model.wv.doesnt_match(['trump', 'biden', 'obama'])

In [None]:
# Analogy-style query: top 3 tokens close to 'biden' and 'hunter' while being
# pushed away from 'trump'.
w2v_model.wv.most_similar(positive=['biden','hunter'], negative=['trump'], topn=3)

In [None]:
def tsnescatterplot(model, word, list_names):
    """Plot a 2-D t-SNE projection of a word, its neighbours and extra words.

    The query word is drawn in red, the words returned by
    ``model.wv.most_similar([word])`` in blue, and the words in ``list_names``
    in green. Vectors are first reduced with PCA (at most 10 components) and
    then embedded in two dimensions with t-SNE (perplexity at most 15).

    Parameters
    ----------
    model
        Trained model exposing its word vectors as ``model.wv``.
    word : str
        Query word to centre the plot on.
    list_names : iterable of str
        Additional words to include in the plot.

    Returns
    -------
    None
        The scatter plot is drawn on a newly created 9x9 inch figure.

    Notes
    -----
    Words missing from the model vocabulary are not handled here; the lookup
    error raised by ``model.wv`` propagates to the caller.
    """
    neighbours = [name for name, _score in model.wv.most_similar([word])]
    extra_words = list(list_names)

    word_labels = [word] + neighbours + extra_words
    color_list = (['red']
                  + ['blue'] * len(neighbours)
                  + ['green'] * len(extra_words))

    # A single lookup with a list of keys returns one row per word, so the
    # embedding width is taken from the model instead of being hard-coded and
    # the matrix is not re-allocated for every appended word.
    arrays = model.wv[word_labels]
    n_samples, n_features = arrays.shape

    # PCA cannot keep more components than there are samples or features, and
    # t-SNE needs perplexity < n_samples; cap both so short word lists or
    # narrow embeddings still plot. The usual case keeps 10 components and
    # perplexity 15, exactly as before.
    reduc = PCA(n_components=min(10, n_samples, n_features)).fit_transform(arrays)
    # Kept for compatibility: switches numpy to non-scientific float printing.
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0,
             perplexity=min(15, n_samples - 1)).fit_transform(reduc)

    points = pd.DataFrame({'x': Y[:, 0],
                           'y': Y[:, 1],
                           'words': word_labels,
                           'color': color_list})

    fig, ax = plt.subplots()
    fig.set_size_inches(9, 9)

    sns.regplot(data=points,
                x="x",
                y="y",
                fit_reg=False,
                marker="o",
                scatter_kws={'s': 40, 'facecolors': points['color']},
                ax=ax)

    # Label every point with its word, coloured like its marker.
    for point in points.itertuples(index=False):
        ax.text(point.x,
                point.y,
                '  ' + point.words.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=point.color,
                weight='normal')

    # Pad the limits so labels near the edges stay inside the axes.
    ax.set_xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    ax.set_ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)
    ax.set_title('t-SNE visualization for {}'.format(word.title()))

In [None]:
# 'biden' with its nearest neighbours, plotted against a fixed set of country names.
tsnescatterplot(
    model=w2v_model,
    word='biden',
    list_names=['china', 'japan', 'uk', 'france', 'germany', 'russia', 'usa'],
)

In [None]:
# 'trump' with its nearest neighbours, plotted against a fixed set of country names.
tsnescatterplot(
    model=w2v_model,
    word='trump',
    list_names=['china', 'japan', 'uk', 'france', 'germany', 'russia', 'usa'],
)

In [None]:
# 'obama' with its nearest neighbours, plotted against a fixed set of country names.
tsnescatterplot(
    model=w2v_model,
    word='obama',
    list_names=['china', 'japan', 'uk', 'france', 'germany', 'russia', 'usa'],
)