# Tweet Similarity

In this notebook, I will enter a query and print out the most similar tweets from the 3 presidential candidates 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re

import gensim
from gensim import corpora
from gensim import similarities
from gensim import models

from nltk.corpus import stopwords
from nltk.stem.porter import *
stemmer = PorterStemmer()



### Read the clean data

In [2]:
trump = pd.read_csv("data/DonaldTrumpClean.csv")
obama = pd.read_csv("data/BarackObamaClean.csv")
clinton = pd.read_csv("data/HillaryClintonClean.csv")

### Further clean the text

In [3]:
### Clean the text further to remove anything that is not useful in calculating similarity
def clean(docs):
    #Convert all to lower case
    docs_lower = [[w.lower() for w in doc] for doc in docs]
    
    #Remove all non-word characters
    docs_regex= [[w for w in doc if re.search('^[a-z]+$',w)] for doc in docs_lower]
    
    #Remove all stopwords
    stop_words = set(stopwords.words('english'))
    docs_stop = [[w for w in doc if w not in stop_words] for doc in docs_regex]
    
    #Stem all the worsd
    docs_stem = [[stemmer.stem(w) for w in doc] for doc in docs_stop]
    return docs_stem

### Map each word to an ID

In [4]:
#We need to map each unique word in the vocabulary of this corpus to an ID or index first. 
#These mappings from words to IDs are represented by a class called Dictionary in Gensim.

text_trump = trump['text'].map(lambda x: x.split())
docs_trump = clean(text_trump)
dictionaryTrump = corpora.Dictionary(docs_trump)
dictionaryTrump2 = corpora.Dictionary(text_trump)

text_obama = obama['text'].map(lambda x: x.split())
docs_obama = clean(text_obama)
dictionaryObama = corpora.Dictionary(docs_obama)

text_clinton = clinton['text'].map(lambda x: x.split())
docs_clinton = clean(text_clinton)
dictionaryClinton = corpora.Dictionary(docs_clinton)

In [5]:
print(dictionaryClinton)

Dictionary(3375 unique tokens: ['let', 'bright', 'everyon', 'go', 'know']...)


### Obtain the data where each token is mapped to an ID

In [6]:
#We can use dictionary.token2id to obtain a dict object which contains all the mappings.
token_to_id_trump = dictionaryTrump.token2id
token_to_id_obama = dictionaryObama.token2id
token_to_id_clinton = dictionaryClinton.token2id
print(type(token_to_id_obama))

<class 'dict'>


In [7]:
print(dictionaryTrump)

Dictionary(5803 unique tokens: ['realli', 'badli', 'drop', 'establish', 'let']...)


### Convert documents to vectors

In [10]:
#Next, we use the function doc2bow to convert our documents to vectors . Here bow stands for bag of words
vec_trump = [dictionaryTrump.doc2bow(doc) for doc in docs_trump]
vec_obama = [dictionaryObama.doc2bow(doc) for doc in docs_obama]
vec_clinton = [dictionaryClinton.doc2bow(doc) for doc in docs_clinton]

### Compute similarities

In [11]:
index_trump = similarities.SparseMatrixSimilarity(vec_trump, len(dictionaryTrump.items()))
index_obama = similarities.SparseMatrixSimilarity(vec_obama, len(dictionaryObama.items()))
index_clinton = similarities.SparseMatrixSimilarity(vec_clinton, len(dictionaryClinton.items()))

### Convert to TFIDF model

In [12]:
tfidf_trump = models.TfidfModel(vec_trump)
vecs_with_tfidf_trump = []
for doc in vec_trump:
    vecs_with_tfidf_trump.append(tfidf_trump[doc])

In [13]:
tfidf_obama = models.TfidfModel(vec_obama)
vecs_with_tfidf_obama = []
for doc in vec_obama:
    vecs_with_tfidf_obama.append(tfidf_obama[doc])

In [14]:
tfidf_clinton = models.TfidfModel(vec_clinton)
vecs_with_tfidf_clinton = []
for doc in vec_clinton:
    vecs_with_tfidf_clinton.append(tfidf_clinton[doc])

### Retrieve the most similar tweet

In [35]:
#Enter query here
query = 'Mexico'
query_stem = [stemmer.stem(query)]

In [36]:
def findSimilarTweet(dictionary, tfidf_model, index, df):
    query_vec = dictionary.doc2bow(query_stem)
    query_vec_tfidf = tfidf_model[query_vec]
    q_similarity = index[query_vec_tfidf]
    q_sim_sorted = sorted(enumerate(q_similarity), key = lambda item: -item[1])
    similar_tweet = df['text'][q_sim_sorted[0][0]]
    return similar_tweet.strip()

In [37]:
similar_trump = findSimilarTweet(dictionaryTrump, tfidf_trump, index_trump, trump)
similar_obama = findSimilarTweet(dictionaryObama, tfidf_obama, index_obama, obama)
similar_clinton = findSimilarTweet(dictionaryClinton, tfidf_clinton, index_clinton, clinton)

In [38]:
print("Query: ", query)
print("\nTrump's similar tweet: ", similar_trump)
print("\nObama's similar tweet: ", similar_obama)
print("\nClinton's similar tweet: ", similar_clinton)

Query:  Mexico

Trump's similar tweet:  Mexico will pay for the wall!

Obama's similar tweet:  Denying climate change is dangerous. Join  supporters in standing up for bold action now http//ofa.bo/2dZNTRx #ActOnClimate

Clinton's similar tweet:  A wall that Mexico will pay for a bad idea from an even worse negotiator.
