In [52]:
%matplotlib inline

import logging
import re
from collections import Counter

import matplotlib
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams as nltk_ngrams
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances

# Hand-picked noise tokens (retweet marker, HTML-entity fragments, punctuation, and
# short fragments such as 'll'/'re'/'ve' -- presumably what is left of contractions
# once clean_text turns apostrophes into spaces; confirm).
# NOTE(review): in the code shown here the only checks against this list and the two
# NLTK lists below are commented out, so none of the three is currently consulted.
custom_stopwords = ['rt', '&amp', '#', '', '&amp;', '-', 'amp', '.', 'QQQQQQQQQ', 'll','re','ve']
# Stop-word lists loaded from the NLTK stopwords corpus.
stopwords_english = stopwords.words('english')
stopwords_spanish = stopwords.words('spanish')
# Shared tokenizer used by clean_text; configured with preserve_case=False,
# strip_handles=True and reduce_len=True.
tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
def is_word_blacklisted(word):
    """Tell whether a token should be dropped because of how it starts.

    A token is rejected when it begins with 'http', '//', '@' or '#'.
    The stop-word lists (custom, English, Spanish) are not consulted here.

    :param word: a single token (string).
    :return: True if the token starts with one of the rejected prefixes.
    """
    rejected_prefixes = ('http', '//', '@', '#')
    return word.startswith(rejected_prefixes)

def clean_text(text, remove_non_chars=True):
    """Normalise one tweet body into a space-separated string of tokens.

    Steps, in order:
      1. regex substitutions on the raw text: drop '@...' and '#...' runs,
         turn commas into full stops and apostrophes into spaces, drop
         'https://...' and 'http://...' runs;
      2. tokenize with the module-level ``tknzr``;
      3. optionally replace every run of non-word characters inside each
         token with a single space;
      4. drop tokens rejected by ``is_word_blacklisted`` and join the rest
         with single spaces.

    :param text: raw tweet text.
    :param remove_non_chars: when True (default) apply step 3.
    :return: the cleaned text. Tokens reduced to a lone space in step 3 are
             still joined in, so the result can contain runs of spaces.
    """
    # (pattern, replacement) pairs; the order matters and is applied as listed.
    substitutions = (
        (r'@\S+', ''),
        (r'#\S+', ''),
        (r',', '.'),
        (r"'", ' '),
        (r'https://\S+', ''),
        (r'http://\S+', ''),
    )

    scrubbed = text
    for pattern, replacement in substitutions:
        scrubbed = re.sub(pattern, replacement, scrubbed)

    tokens = tknzr.tokenize(scrubbed)
    if remove_non_chars:
        tokens = [re.sub(r'\W+', ' ', token) for token in tokens]

    kept = []
    for token in tokens:
        if not is_word_blacklisted(token):
            kept.append(token)
    return ' '.join(kept)

def find_all_paths(G):
    """Collect, for every source node of ``G``, its longest shortest-path.

    A source node is one with in-degree 0. For each source, the shortest
    paths to every reachable node are computed with ``nx.shortest_path`` and
    the one visiting the most distinct nodes is kept.

    This is best-effort: any exception raised while inspecting the graph is
    logged at INFO level and whatever was collected up to that point is
    returned.

    :param G: a directed graph exposing ``in_degree()``.
    :return: list of paths, each path being a list of nodes.
    """
    result = []
    try:
        # dict(...) accepts both a plain mapping and an iterable of
        # (node, degree) pairs, so this does not depend on which of the two
        # in_degree() returns; .items() works on Python 2 and 3 alike.
        in_degrees = dict(G.in_degree())
        start_nodes = [node for node, degree in in_degrees.items() if degree == 0]

        for start in start_nodes:
            paths = nx.shortest_path(G, start)
            longest = max(paths, key=lambda target: len(set(paths[target])))
            result.append(paths[longest])
    except Exception as e:
        # Let logging format the exception: `e.message` does not exist on
        # Python 3 and string concatenation can fail on non-ASCII messages.
        logging.info("Error, %s", e)

    return result

def get_path_tuple(p, l):
    """Turn a path of overlapping two-word nodes into a phrase summary.

    Each node of ``p`` is a string of two words and consecutive nodes share
    one word ('a b' -> 'b c'), so every edge corresponds to the three-word
    term 'a b c'. The phrase is the first word of every node followed by the
    second word of the last node.

    :param p: non-empty sequence of two-word node strings.
    :param l: iterable of (term, count) pairs.
    :return: tuple ``(phrase, min_cnt, min_term)`` where ``min_term`` is the
             first edge term along the path with the smallest count in ``l``
             and ``min_cnt`` is that count. If no edge term is present in
             ``l`` the result is ``(phrase, 9999999, '')``.
    """
    counts = dict(l)
    # Value reported when no edge term is found in `l`.
    no_match_count = 9999999

    # None means "nothing seen yet"; comparing against a fixed large number
    # would mis-report the minimum once every count is at or above it.
    min_cnt = None
    min_term = ''
    words = [p[0].split()[0]]

    for prev, node in zip(p, p[1:]):
        terms = (prev + ' ' + node).split()
        # terms is [w1, w2, w2, w3]; skip the duplicated middle word.
        term = terms[0] + ' ' + terms[1] + ' ' + terms[3]

        if term in counts:
            cnt = counts[term]
            if min_cnt is None or cnt < min_cnt:
                min_cnt = cnt
                min_term = term

        words.append(node.split()[0])

    words.append(p[-1].split()[1])

    if min_cnt is None:
        min_cnt = no_match_count
    return ' '.join(words), min_cnt, min_term

def build_phrases_from_aggs(aggs):
    """Stitch counted three-word terms into longer phrases.

    Every term 'w1 w2 w3' becomes a directed edge 'w1 w2' -> 'w2 w3' weighted
    by its count. Paths through that graph (see ``find_all_paths``) are turned
    into phrases by ``get_path_tuple``; phrases sharing the same weakest term
    (the "alias") are collapsed, keeping the first one found.

    :param aggs: iterable of (term, count) pairs, each term made of three
                 whitespace-separated words.
    :return: list of dicts with keys 'phrase', 'count' and 'alias', sorted by
             'count' in descending order.
    """
    # Most frequent terms first, so they are inserted into the graph first.
    l = sorted(aggs, key=lambda x: x[1], reverse=True)

    G = nx.DiGraph()
    for trigram, count in l:
        terms = trigram.split()
        head = terms[0] + ' ' + terms[1]
        tail = terms[1] + ' ' + terms[2]
        G.add_edge(head, tail, weight=count)

    phrases = [get_path_tuple(p, l) for p in find_all_paths(G)]

    # Single pass: remember the first (phrase, count) seen for each alias,
    # in first-seen order so equal counts come out deterministically.
    first_by_alias = {}
    alias_order = []
    for phrase, count, alias in phrases:
        if alias not in first_by_alias:
            first_by_alias[alias] = (phrase, count)
            alias_order.append(alias)

    result = [
        {'phrase': first_by_alias[alias][0],
         'count': first_by_alias[alias][1],
         'alias': alias}
        for alias in alias_order
    ]

    return sorted(result, key=lambda x: x['count'], reverse=True)


def get_grams(text):
    """Return every run of three consecutive whitespace-separated words.

    :param text: input string.
    :return: list of strings, each made of three words joined by a space.
    """
    tokens = text.split()
    trigrams = []
    for gram in nltk_ngrams(tokens, 3):
        trigrams.append(' '.join(gram))
    return trigrams

def get_cluster_phrases(cluster_tweets, max_phrases=10):
    """Extract the leading phrases from a collection of cleaned tweets.

    Three-word terms are counted across all tweets; the ``sqrt(N)`` most
    common ones (N = number of tweets) are handed to
    ``build_phrases_from_aggs`` and the first ``max_phrases`` results are
    returned.

    :param cluster_tweets: sized iterable of tweet strings.
    :param max_phrases: maximum number of phrases to return (default 10, the
                        previously hard-coded cutoff); ``None`` returns all.
    :return: list of phrase dicts as produced by ``build_phrases_from_aggs``.
    """
    cnt = Counter()
    for t in cluster_tweets:
        cnt.update(get_grams(t))

    # Keep only the sqrt(N) most frequent terms as graph input.
    top_n = int(np.sqrt(len(cluster_tweets)))
    return build_phrases_from_aggs(cnt.most_common(top_n))[:max_phrases]

In [53]:
# Location of the input CSV. NOTE(review): absolute, machine-specific path --
# the notebook will not run elsewhere without editing it.
JAN_CSV_PATH = '/home/dima/Downloads/jan.csv'
data = pd.read_csv(JAN_CSV_PATH)

In [54]:

# Rows to discard: bodies containing the literal marker text below, and
# bodies that start with 'RT @'.
# regex=False makes the marker a plain substring match rather than a pattern.
has_marker = data['Body'].str.contains('@OfficialJimRohn #Leadership #Success', regex=False)
is_retweet = data['Body'].str.startswith('RT @')

# .copy() yields an independent frame, so adding a column below does not
# write into a slice of the loaded frame (SettingWithCopyWarning).
data = data[~has_marker & ~is_retweet].copy()
data['Body_clean'] = data['Body'].apply(clean_text)

In [55]:
# Extract the leading phrases from all cleaned bodies and list them with
# their counts.
phrases = get_cluster_phrases(data['Body_clean'].values)
for p in phrases:
    # Parenthesised single-argument form prints the same text on Python 2
    # and Python 3 (the bare `print x` statement is Python-2-only).
    print(p['phrase'] + '( ' + str(p['count']) + ')')

instagram photo by higher perspective jun( 165)
i just entered the competition to a bunch of goodies want to scupper my chances( 100)
you don t( 65)
like a boss girls( 65)
www kaylaitsines com app( 61)
when you know what you( 56)
want it bad enough you( 56)
you ll find a way to get it( 52)
we get paid for( 52)
only for you( 50)


In [42]:
# NOTE(review): `new_data` is not defined in any cell shown here, and this
# cell's execution count (42) is lower than that of the cells above (52-55),
# so it presumably ran against earlier kernel state. TODO: confirm which
# frame this was meant to measure (possibly `data`) or remove the cell.
len(new_data)

8289