In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pymongo import MongoClient
from bson.objectid import ObjectId
import pymongo
from scipy import spatial
import re
import json
import itertools
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models, similarities, matutils
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import os.path as path
from sklearn import cross_validation
from textblob import TextBlob, Word, WordList

%matplotlib inline

In [None]:
### Function to add any number of word vectors together
def matrix_sum(lst):
    """Return the element-wise sum of the Word2Vec vectors of the tokens in ``lst``.

    Parameters
    ----------
    lst : sequence
        Tokens to look up in the module-level ``w2v`` model.

    Returns
    -------
    numpy.ndarray
        The summed vector, or an all-zero integer vector of length 300
        when ``lst`` is empty (300 matches the dimensionality the model is
        trained with in this notebook).
    """
    if len(lst) > 0:
        # Stack the vectors and let NumPy add them along axis 0 instead of
        # summing each of the dimensions with a Python-level loop.
        return np.sum([w2v[i] for i in lst], axis=0, dtype=np.float64)
    # `np.int` was only an alias of the builtin `int` and no longer exists in
    # current NumPy releases; the builtin yields the same dtype.
    return np.zeros((300,), dtype=int)


### A series of operations to clean up the Twitter text
def preprocess(lst):
    """Clean one tweet and return its remaining lower-cased tokens.

    The tweet is stripped of URLs, tokenized with ``TweetTokenizer``,
    lower-cased, and filtered to drop English stop words (plus 'rt'),
    stand-alone punctuation tokens and every token containing '@'.

    Parameters
    ----------
    lst : str
        Raw tweet text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Sets give constant-time membership tests for the filters below.
    stoplist = set(stopwords.words('english'))
    stoplist.add('rt')
    punctuation = {'.', ',', '"', ':', ';', '""', '!', '?'}
    tknzr = TweetTokenizer()

    # NOTE(review): '.*' runs to the end of the line, so any text that follows
    # a URL on the same line is discarded together with the URL.
    lst = re.sub(r'https?:\/\/.*[\r\n]*', '', lst)

    # Re-join and re-split so tokens containing whitespace are broken up.
    tokens = ' '.join(tknzr.tokenize(lst)).lower().split()

    # Filter in a single pass. Removing items from a list while iterating over
    # it skips the element after each removal, which let the second of two
    # consecutive @mentions slip through.
    return [
        word for word in tokens
        if word not in stoplist
        and word not in punctuation
        and '@' not in word
    ]


### Measures how similar the emoji vectors of one tweet are to each other
def emoji_similarity(lst):
    """Return the mean of the pairwise cosine-similarity matrix of the emoji in ``lst``.

    Each emoji is looked up in the module-level ``w2v`` model. The mean is
    taken over the whole matrix, which includes every vector paired with
    itself. An empty ``lst`` yields 0.
    """
    if len(lst) == 0:
        return 0

    emoji_vectors = [w2v[emoji] for emoji in lst]
    pairwise = cosine_similarity(emoji_vectors)
    return np.mean(pairwise)

In [None]:
### Creates the Mongo client for the local server (connect=False defers the
### connection) and grabs the emojitweets collection of the emojidb database
client = MongoClient(host='localhost', port=27017, connect=False)
emojidb = client['emojidb']
emojitweets = emojidb['emojitweets']

In [None]:
### Queries every document in the collection, projecting the text field,
### and loads the results into a dataframe
cursor = emojitweets.find(filter={}, projection={'text': 1})
df_tweets = pd.DataFrame(list(cursor))

In [None]:
### Builds the analytical dataframe, one derived column at a time.
### NOTE(review): `emoji_list` is not defined in the cells shown here; it has
### to exist in the kernel before this cell runs.

### Makes sure every tweet is a string
df_tweets['text'] = df_tweets['text'].astype(str)

### Cleaned tokens of each tweet (see preprocess)
df_tweets['words'] = df_tweets['text'].map(preprocess)

### Tokens of each tweet that are emoji
df_tweets['tweet_emoji'] = df_tweets['words'].map(
    lambda words: [token for token in words if token in emoji_list]
)

### Tokens of each tweet that are not emoji
df_tweets['tweet_text'] = df_tweets['words'].map(
    lambda words: [token for token in words if token not in emoji_list]
)

### Total number of emoji in each tweet
df_tweets['emoji_used'] = df_tweets['tweet_emoji'].map(len)

### Number of distinct emoji in each tweet
df_tweets['unique_emoji_used'] = df_tweets['tweet_emoji'].map(
    lambda emoji: len(set(emoji))
)

### Trains Word2Vec model on corpus

In [None]:
### Collects the cleaned token list of every tweet as the Word2Vec training corpus
### (tokens are cleaned by preprocess; no lemmatization is applied)
vec_tweet_list = df_tweets['words'].tolist()

In [None]:
### Trains Word2Vec model on tweets with 300 dimensions
### (the 300 here must match the zero-vector length used in matrix_sum)
### NOTE(review): `size=` is the keyword used by gensim releases before 4.0;
### newer releases call it `vector_size` -- confirm the installed version.
w2v = models.Word2Vec(vec_tweet_list, size=300, window=10, min_count=1)

In [None]:
### Saves the trained model under the relative path w2v_01.model
w2v.save('w2v_01.model')

### Uses output of Word2Vec model to measure emoji complexity

In [None]:
### Calculates aggregate text vector (sum of the vectors of the non-emoji tokens)
df_tweets['text_sum'] = df_tweets['tweet_text'].map(matrix_sum)

### Calculates aggregate emoji vector (sum of the vectors of the emoji tokens)
df_tweets['emoji_sum'] = df_tweets['tweet_emoji'].map(matrix_sum)

### Calculates similarity of emoji used (0 for tweets without any emoji)
df_tweets['emoji_similarity'] = df_tweets['tweet_emoji'].map(emoji_similarity)

In [None]:
"""
Because calculating cosine similarities on 300-dimension vectors 1 million times is resource-intensive,
I split out the vectors into lists to make it a little more efficient
"""
### Pairs each tweet's emoji vector with its text vector, outside the dataframe
vec_emoji_list = df_tweets['emoji_sum'].tolist()
vec_text_list = df_tweets['text_sum'].tolist()
vec_cosine_list = list(zip(vec_emoji_list, vec_text_list))

In [None]:
### Cosine similarity (1 - cosine distance) between the emoji vector and the
### text vector of every tweet, kept in a list so it can be added back to
### the dataframe
cosine_sim = [
    1 - spatial.distance.cosine(emoji_vec, text_vec)
    for emoji_vec, text_vec in vec_cosine_list
]

In [None]:
### Turns the cosine similarity list into an array
m = np.asarray(cosine_sim)

### Lines the similarities up with the dataframe rows
raw_similarity = pd.Series(m[df_tweets.index], index=df_tweets.index)

### Replaces NaN's with 0, then rescales from the -1..1 range to 0..1
df_tweets["emoji_text_similarity"] = (raw_similarity.fillna(0) + 1) / 2

#### Implementation of the simple dumb model of emoji complexity

In [None]:
### Complexity = share of distinct emoji among the emoji used,
### minus the emoji-to-emoji similarity, minus the emoji-to-text similarity
### NOTE(review): rows where emoji_used is 0 divide 0 by 0 here and presumably
### end up NaN -- confirm whether such rows can occur and how they should score.
df_tweets['complexity'] = (df_tweets['unique_emoji_used'] / df_tweets['emoji_used']) - df_tweets['emoji_similarity'] - df_tweets['emoji_text_similarity']

In [None]:
### Normalizes complexity on a scale of 0 to 1 using the max and min complexity values.
### The bounds are read from the column instead of being pasted in from an
### earlier run, so the scaling stays correct whenever the Word2Vec model
### (and with it every similarity score) is retrained. Re-running the cell is
### harmless: an already normalized column has min 0 and max 1.
complexity_min = df_tweets['complexity'].min()
complexity_max = df_tweets['complexity'].max()
complexity_range = complexity_max - complexity_min

if complexity_range > 0:
    df_tweets['complexity'] = (df_tweets['complexity'] - complexity_min) / complexity_range
else:
    ### All values are identical (or missing): avoid dividing by zero and
    ### map every present value to 0.
    df_tweets['complexity'] = df_tweets['complexity'] - complexity_min

### Builds the network analysis

In [None]:
### Starts the emoji cross-usage network: keeps only the tweets that use
### more than two distinct emoji
uses_several_emoji = df_tweets['unique_emoji_used'] > 2
df_network = df_tweets.loc[uses_several_emoji]
df_network.head()

In [None]:
### One space-separated string of emoji per tweet, skipping tweets whose
### string comes out empty, as the starting point for the network pairings
joined_emoji = (' '.join(emoji) for emoji in df_network['tweet_emoji'])
tweet_emoji_list = [joined for joined in joined_emoji if joined]

In [None]:
"""
Here we cycle through each emoji, see if that emoji is in each tweet, and if it is, append it to a list of emoji used
with the search emoji. This is to build a "network" of emoji that are used with one another.
"""

### Maps every emoji to the set of emoji it shares a tweet with, in one pass
### over the tweets. Tweets are split into tokens first so that an emoji only
### matches as a whole token, never as a substring of a longer sequence.
emoji_co_usage = {}

for tweet_emoji in tweet_emoji_list:
    tweet_tokens = set(tweet_emoji.split())
    for token in tweet_tokens:
        emoji_co_usage.setdefault(token, set()).update(tweet_tokens)

### Emits one node per emoji, in emoji_list order. The link set contains the
### emoji itself, so "more than one link" means at least one partner emoji.
### No keep_emoji filter is applied here: that name is only created by the
### top-100 cell further down, so using it made this cell fail on a fresh kernel.
### NOTE(review): `emoji_list` is not defined in the cells shown here.
emoji_network_json = []

for e in emoji_list:
    emoji_node_links = emoji_co_usage.get(e, ())
    if len(emoji_node_links) > 1:
        emoji_network_json.append({
            "name": e,
            'size': 2000,
            ### sorted for a reproducible order across runs
            'imports': sorted(emoji_node_links),
        })

In [None]:
### Saves the full JSON of emoji couplings
### 'string_escape' is a Python 2-only codec and is rejected by Python 3's open();
### json.dump already escapes non-ASCII characters by default, so utf-8 is safe.
### NOTE(review): this used to dump `emoji_network`, which is only defined in a
### later cell; the full network built above is `emoji_network_json` -- confirm
### the full network (not the top-100 subset) is what should be written here.
with open('emoji_network.json', 'w', encoding='utf-8') as fp:
    json.dump(emoji_network_json, fp)

In [None]:
### Keeps the 100 emoji used with the most other emoji for a cleaner visualization
### (nothing is written to disk in this cell).
### The sort key is the number of linked emoji; len() of the node dict itself
### is the same for every node and left the list unsorted.
emoji_network_sorted = sorted(emoji_network_json, key=lambda k: len(k['imports']), reverse=True)
top_nodes = emoji_network_sorted[:100]
keep_emoji = [i['name'] for i in top_nodes]
keep_lookup = set(keep_emoji)

### Restricts each node's links to the kept emoji. New dicts and lists are
### built instead of calling remove() inside the loop: removing while iterating
### skips elements, and editing in place would also alter emoji_network_json.
emoji_network = [
    dict(node, imports=[n for n in node['imports'] if n in keep_lookup])
    for node in top_nodes
]