from gensim.models import Word2Vec
# 'word2vec.bin' is loaded with binary=True in the numbered cell that follows;
# without the flag the loader parses the file as the text format instead.
model = Word2Vec.load_word2vec_format('word2vec.bin', binary=True)


In [1]:
from gensim.models import Word2Vec 
# Word vectors used by every later cell (tweet vectors, similarity lookups).
# The file is read in the binary word2vec format.
WORD2VEC_PATH = 'word2vec.bin'
model = Word2Vec.load_word2vec_format(WORD2VEC_PATH, binary=True)


In [2]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib
import matplotlib.pyplot as plt
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances
from nltk.util import ngrams as nltk_ngrams
from collections import Counter
import networkx as nx

# Extra tokens to drop on top of the NLTK stopword lists: retweet marker,
# HTML-entity leftovers, bare punctuation and contraction fragments.
# NOTE(review): the tokenizer below is built with preserve_case=False, so the
# upper-case 'QQQQQQQQQ' entry presumably never matches a token -- confirm.
custom_stopwords = [
    'rt', '&amp', '#', '', '&amp;', '-', 'amp', '.', 'QQQQQQQQQ',
    'll', 're', 've',
]

# Stopword lists shipped with NLTK (requires the 'stopwords' corpus).
stopwords_english = stopwords.words('english')
stopwords_spanish = stopwords.words('spanish')

# Shared tweet tokenizer used by clean_text().
tknzr = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)


In [145]:
# Raw export; later cells read its 'Body' column.
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere until this points at a local copy of the file.
TWEETS_CSV_PATH = '/home/dima/Downloads/jan.csv'
data = pd.read_csv(TWEETS_CSV_PATH)

In [153]:
def is_word_blacklisted(word):
    """Return True when `word` should be dropped from a cleaned tweet.

    A word is dropped when it starts with 'http', '//', '@' or '#', or when
    it appears in `custom_stopwords`, `stopwords_english` or
    `stopwords_spanish`.
    """
    # Prefix checks come first so URL/mention/hashtag tokens are rejected
    # without consulting the stopword lists.
    if word.startswith(('http', '//', '@', '#')):
        return True

    vocabularies = (custom_stopwords, stopwords_english, stopwords_spanish)
    return any(word in vocabulary for vocabulary in vocabularies)

def clean_text(text, remove_non_chars=True):
    """Normalise a raw tweet into a space-separated string of kept tokens.

    Steps, in order: regex substitutions on the raw text, tokenisation with
    the shared `tknzr`, optional replacement of non-word character runs
    inside each token by a single space, and finally removal of every token
    rejected by `is_word_blacklisted`.

    Args:
        text: raw tweet text.
        remove_non_chars: when True (default), each run of non-word
            characters inside a token is replaced by one space before the
            blacklist check.

    Returns:
        The surviving tokens joined with single spaces.
    """
    # Applied strictly in this order; the two URL patterns are kept separate
    # on purpose so the result matches a sequential https-then-http pass.
    substitutions = (
        (r'@\S+', ''),         # mentions
        (r'#\S+', ''),         # hashtags
        (r',', '.'),           # commas become periods
        (r"'", ' '),           # apostrophes become spaces
        (r'https://\S+', ''),  # secure links
        (r'http://\S+', ''),   # plain links
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    tokens = tknzr.tokenize(cleaned)
    if remove_non_chars:
        tokens = [re.sub(r'\W+', ' ', token) for token in tokens]

    kept = []
    for token in tokens:
        if not is_word_blacklisted(token):
            kept.append(token)
    return ' '.join(kept)

def get_tweet_vector(text):
    """Build a 300-element vector for a cleaned tweet from its word vectors.

    The vectors of all whitespace-separated tokens found in `model` are
    summed, and the sum is divided by the total number of tokens -- tokens
    missing from `model` still count towards the divisor.

    Args:
        text: cleaned tweet text.

    Returns:
        A numpy array of length 300; all zeros when `text` has no tokens.
    """
    tokens = text.split()
    total = np.zeros(300)

    # Nothing to average: return the zero vector as-is.
    if not tokens:
        return total

    for token in tokens:
        if token in model:
            total += model[token]

    total /= len(tokens)
    return total

def find_all_paths(G):
    """Return one path per root node of the directed graph `G`.

    A root is a node whose in-degree is 0. For every root, the shortest
    paths to all nodes reachable from it are computed and the one visiting
    the most distinct nodes is kept.

    Errors are handled on a best-effort basis: the exception is logged and
    whatever paths were collected before the failure are returned.

    Args:
        G: a networkx directed graph.

    Returns:
        A list of paths, each path being a list of nodes.
    """
    # Local import: the notebook's import cell does not provide `logging`,
    # so the error handler below previously failed with a NameError.
    import logging

    result = []
    try:
        # dict(...) accepts both the plain dict returned by NetworkX 1.x and
        # the degree view returned by NetworkX 2.x.
        in_degrees = dict(G.in_degree())
        start_nodes = [node for node, degree in in_degrees.items() if degree == 0]

        for start in start_nodes:
            paths = nx.shortest_path(G, start)
            longest = max(paths, key=lambda target: len(set(paths[target])))
            result.append(paths[longest])
    except Exception as e:
        # str(e) instead of e.message, which does not exist on Python 3.
        logging.info("Error, " + str(e))

    return result

def get_path_tuple(p, l):
    """Turn a path of overlapping bigram nodes into a phrase summary.

    Each node of `p` is a two-word string; consecutive nodes overlap by one
    word, so every adjacent pair spells a trigram. The trigram counts are
    looked up in `l`.

    Args:
        p: list of bigram strings forming a path, e.g. ['a b', 'b c'].
        l: iterable of (trigram, count) pairs.

    Returns:
        A tuple (phrase, min_cnt, min_term): the phrase spelled by the path,
        the smallest count among the path's trigrams found in `l`, and the
        trigram holding that count (the first one on ties). When no trigram
        is found the last two values are 9999999 and ''.
    """
    counts_by_trigram = dict(l)
    rarest_count = 9999999
    rarest_trigram = ''

    for left, right in zip(p, p[1:]):
        # 'a b' + 'b c' -> ['a', 'b', 'b', 'c']; index 2 is the shared word.
        tokens = (left + ' ' + right).split()
        trigram = ' '.join((tokens[0], tokens[1], tokens[3]))

        if trigram in counts_by_trigram:
            occurrences = counts_by_trigram[trigram]
            if occurrences < rarest_count:
                rarest_count = occurrences
                rarest_trigram = trigram

    # First word of every node, then the second word of the final node.
    words = [node.split()[0] for node in p]
    words.append(p[-1].split()[1])
    return ' '.join(words), rarest_count, rarest_trigram

def build_phrases_from_aggs(aggs):
    """Stitch frequent trigrams into longer phrases.

    Every trigram 'w1 w2 w3' becomes an edge 'w1 w2' -> 'w2 w3' in a directed
    graph, weighted by its count. Paths through that graph are obtained from
    `find_all_paths` and summarised by `get_path_tuple`; the least frequent
    trigram of a path serves as its alias, and only the first phrase seen
    for each alias is kept.

    Args:
        aggs: iterable of (trigram, count) pairs.

    Returns:
        A list of dicts with keys 'phrase', 'count' and 'alias', sorted by
        'count' in descending order. Entries with equal counts keep the
        order in which their paths were produced.
    """
    l = sorted(aggs, key=lambda x: x[1], reverse=True)

    G = nx.DiGraph()
    for trigram, count in l:
        terms = trigram.split()
        head = terms[0] + ' ' + terms[1]
        tail = terms[1] + ' ' + terms[2]
        G.add_edge(head, tail, weight=count)

    paths = find_all_paths(G)
    phrases = [get_path_tuple(p, l) for p in paths]

    # Single pass keeping the first phrase per alias. This replaces a
    # list(set()) -> list -> dict round trip that relied on the Python 2-only
    # dict.iteritems() and left the order of ties to hash ordering.
    seen_aliases = set()
    result = []
    for phrase, count, alias in phrases:
        if alias in seen_aliases:
            continue
        seen_aliases.add(alias)
        result.append({'phrase': phrase, 'count': count, 'alias': alias})

    return sorted(result, key=lambda x: x['count'], reverse=True)


def get_grams(text):
    """Return the trigrams of `text` as space-joined strings.

    `text` is split on whitespace and every run of three consecutive tokens
    produced by `nltk_ngrams` is joined back with single spaces.
    """
    tokens = text.split()
    trigrams = []
    for gram in nltk_ngrams(tokens, 3):
        trigrams.append(' '.join(gram))
    return trigrams

def get_cluster_phrases(cluster_tweets):
    """Return up to two phrases built from a cluster's most common trigrams.

    Trigrams are counted across all tweets in `cluster_tweets`; the 10 most
    common ones are handed to `build_phrases_from_aggs`, and the first two
    entries of its result are returned.
    """
    trigram_counts = Counter(
        gram
        for tweet in cluster_tweets
        for gram in get_grams(tweet)
    )
    top_trigrams = trigram_counts.most_common(10)
    phrases = build_phrases_from_aggs(top_trigrams)
    return phrases[:2]


def get_cluster_top_words(cluster_tweets):
    """Return the 10 most frequent words across a cluster's tweets.

    Args:
        cluster_tweets: list of cleaned tweet strings.

    Returns:
        A list of (word, count) pairs as produced by Counter.most_common(10).
    """
    # Join first, then split, so all tweets are counted as one token stream.
    tokens = ' '.join(cluster_tweets).split()
    frequencies = Counter()
    frequencies.update(tokens)
    return frequencies.most_common(10)

In [154]:
# Drop tweets that mention @OfficialJimRohn and retweets ('RT @' prefix).
mentions_jim_rohn = data['Body'].str.contains('@OfficialJimRohn')
is_retweet = data['Body'].str.startswith('RT @')

# .copy() so the column assignment below works on an independent frame
# rather than on a filtered view of the original one.
data = data[~mentions_jim_rohn & ~is_retweet].copy()
data['Body_clean'] = data['Body'].apply(clean_text)

# drop=True: later cells index rows by position 0..n-1; without it the old
# index is kept as an extra 'index' column and re-running the cell keeps
# adding columns until reset_index() fails.
data = data.reset_index(drop=True)
len(data)


7119

In [155]:
# One vector per cleaned tweet, in row order (the index was reset above, so
# iterating the column visits rows 0..n-1 exactly like positional lookup).
vectors = [get_tweet_vector(text) for text in data['Body_clean']]
len(vectors)

7119

In [156]:
num_of_clusters = 100

# random_state pins the centroid initialisation so a re-run reproduces the
# same clusters.
# NOTE(review): n_jobs was removed from KMeans in scikit-learn 1.0 -- drop the
# argument when running against a recent release.
y = KMeans(n_clusters=num_of_clusters, n_jobs=3, random_state=0).fit_predict(vectors)

# Group raw tweets, cleaned tweets and vectors by predicted cluster label.
clusters = {}
for i, label in enumerate(y):
    if label not in clusters:
        clusters[label] = {'tweets': [], 'vectors': [], 'tweets_clean': []}
    clusters[label]['tweets'].append(data['Body'][i])
    clusters[label]['tweets_clean'].append(data['Body_clean'][i])
    clusters[label]['vectors'].append(vectors[i])

# Per-cluster spread: the full pairwise distance matrix counts every pair
# twice, hence the factor 2; the result is then divided by the cluster size.
distance_sum = 0
distance_count = 0
for cluster in clusters.values():
    member_count = len(cluster['vectors'])
    cluster['distance'] = np.sum(pairwise_distances(cluster['vectors'])) / (2 * member_count)

    # Clusters with zero spread are left out of the average.
    if cluster['distance'] > 0:
        distance_count += 1
        distance_sum += cluster['distance']

# Guard against the case where no cluster has a positive spread.
avg_distance = distance_sum / distance_count if distance_count else 0.0
print('avg distance: ' + str(avg_distance))


avg distance: 52.4542159983


In [157]:
# Keep only clusters holding more than 10 tweets; smaller ones are not reported.
# .values() works on both Python 2 and 3 (the original used .iteritems()).
clusters_array = [cluster for cluster in clusters.values() if len(cluster['tweets']) > 10]

In [158]:
# Order the kept clusters by ascending 'distance' (smallest spread first).
C = list(clusters_array)
C.sort(key=lambda cluster: cluster['distance'])

In [159]:
# Text report per cluster: size, spread, top words, stitched phrases and up
# to 10 example tweets. Every print takes a single argument, so the output is
# the same under the Python 2 print statement and the Python 3 function.
print('')
for i, topic in enumerate(C):
    print('Topic (' + str(i) + ') Number of Tweets: ' + str(len(topic['tweets'])) + ', D: ' + str(topic['distance']))
    print('-------------------------------------------------------------------------------------------')
    words = get_cluster_top_words(topic['tweets_clean'])
    print('Top Words:')
    print(words)
    phrases = get_cluster_phrases(topic['tweets_clean'])
    print('\nPhrases:')
    for phrase in phrases:
        print(phrase['phrase'] + ' (' + str(phrase['count']) + ')')
    print('\nExamples:')
    for t in topic['tweets'][:10]:
        print(t)
    print('')


Topic (0) Number of Tweets: 12, D: 0.0
-------------------------------------------------------------------------------------------
Top Words:
[(u'thank', 12)]

Phrases:

Examples:
Thank you @CarolSankar 😘 https://t.co/Amf1rFhno6
@mcannon79 @JoC_AWC Thank you...😊
Thank you @iamthatgirl ♥ http://twitter.com/DolansBrasil/status/744293648246935552/photo/1
@Ravager619 @TheMarySue thank you
@AAUWPolicy @AAUW Thank you!
.@GCGodfrey @MickTKipper @USabadini @BlueShareForum @smarkus @GeorgeO @AbdelQaader @Sentifi_UK @GlobalFXTrader1 Thank you for ❤/RT🙏😃#TRADING
@iamthatgirl Thank YOU! 🙈💖
Thank you for this. https://t.co/SOi4EQVQF2
@Pennwomen thank you!
@SalesforceAmy @Applango @GirlsWhoCode @BlackGirlsCode thank you!! 👍🏻🙋🏻

Topic (1) Number of Tweets: 11, D: 0.0
-------------------------------------------------------------------------------------------
Top Words:
[(u'staggering', 11), (u'keynote', 11), (u'stats', 11), (u'founder', 11)]

Phrases:
staggering stats keynote founder (11)

Examples:


In [136]:
# Spot check of the loaded vectors: list the words the model scores as most
# similar to 'phone' (the recorded output shows (word, score) pairs).
model.most_similar('phone')

[(u'telephone', 0.822401762008667),
 (u'cell_phone', 0.7831963896751404),
 (u'cellphone', 0.7629483938217163),
 (u'Phone', 0.7060798406600952),
 (u'phones', 0.6894921064376831),
 (u'landline', 0.6263924837112427),
 (u'voicemail', 0.6252244710922241),
 (u'caller_id', 0.6023746728897095),
 (u'RingCentral_cloud_computing', 0.5935890078544617),
 (u'telephones', 0.5929964780807495)]