According to a study by Pear Analytics [16], about 40% of all the tweets are pointless “babbles” like “have to get something from the minimart downstairs”

In [190]:
from langdetect import detect
import pickle
from os import path
import re
from nltk.tokenize import TweetTokenizer
import nltk
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import CMUTweetTagger
from sklearn.metrics.pairwise import pairwise_distances
import scipy.cluster.hierarchy as sch
import fastcluster
from collections import Counter

In [2]:
DIR_DATA = path.join('data', 'twitter data')
DIR_GEO = path.join('data', 'geofiles')

In [3]:
# Loading the saved file is as easy as running these lines of code
with open(path.join(DIR_DATA, 'clean_data.pkl'), 'rb') as in_file:
    df = pickle.load(in_file)

In [4]:
df.sort_values(by='createdAt', ascending=1, inplace = True)

# Preprocessing

In [6]:
# Here we normalize the text, the code is taken from 
#https://github.com/heerme/twitter-topics/blob/master/twitter-topics-from-json-text-stream.py
def normalize_text(text):
    if type(text) is not str:
        print(text)
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(pic\.twitter\.com/[^\s]+))','', text)
    text = re.sub('@[^\s]+','', text)
    text = re.sub('#([^\s]+)', '', text)
    text = re.sub('[:;>?<=*+()/,\-#!$%\{˜|\}\[^_\\@\]1234567890’‘]',' ', text)
    text = re.sub('[\d]','', text)
    text = text.replace(".", '')
    text = text.replace("'", ' ')
    text = text.replace("\"", ' ')
    #text = text.replace("-", " ")
    #normalize some utf8 encoding
    text = text.replace("\x9d",' ').replace("\x8c",' ')
    text = text.replace("\xa0",' ')
    text = text.replace("\x9d\x92", ' ').replace("\x9a\xaa\xf0\x9f\x94\xb5", ' ').replace("\xf0\x9f\x91\x8d\x87\xba\xf0\x9f\x87\xb8", ' ').replace("\x9f",' ').replace("\x91\x8d",' ')
    text = text.replace("\xf0\x9f\x87\xba\xf0\x9f\x87\xb8",' ').replace("\xf0",' ').replace('\xf0x9f','').replace("\x9f\x91\x8d",' ').replace("\x87\xba\x87\xb8",' ')	
    text = text.replace("\xe2\x80\x94",' ').replace("\x9d\xa4",' ').replace("\x96\x91",' ').replace("\xe1\x91\xac\xc9\x8c\xce\x90\xc8\xbb\xef\xbb\x89\xd4\xbc\xef\xbb\x89\xc5\xa0\xc5\xa0\xc2\xb8",' ')
    text = text.replace("\xe2\x80\x99s", " ").replace("\xe2\x80\x98", ' ').replace("\xe2\x80\x99", ' ').replace("\xe2\x80\x9c", " ").replace("\xe2\x80\x9d", " ")
    text = text.replace("\xe2\x82\xac", " ").replace("\xc2\xa3", " ").replace("\xc2\xa0", " ").replace("\xc2\xab", " ").replace("\xf0\x9f\x94\xb4", " ").replace("\xf0\x9f\x87\xba\xf0\x9f\x87\xb8\xf0\x9f", "")
    return text

In [7]:
df.dropna(subset = ['text'],inplace=True)

In [8]:
df['processed_text'] = df['text'].apply(lambda x: normalize_text(x))
df.reset_index(inplace = True, drop = True)

1,index,id,userId,createdAt,longitude,latitude,text,day,month,year,daily_tweets,processed_text
0,3629,10510244063,17197547,2010-03-15 09:31:16,8.50810,47.3807,JavaScript tools talk at local.ch Javascript h...,15,3,2010,11,JavaScript tools talk at localch Javascript ha...
1,3630,10511082338,17197547,2010-03-15 10:06:41,8.54438,47.3673,Mission in progress at Javascript special hack...,15,3,2010,11,Mission in progress at Javascript special hack...
2,3631,10511125150,17197547,2010-03-15 10:08:29,8.54441,47.3676,Progressive Enhancement demonstrated with pean...,15,3,2010,11,Progressive Enhancement demonstrated with pean...
3,3632,10512734279,17197547,2010-03-15 11:10:47,8.54551,47.3683,OO and inheritance in JS explained by @ivanjov...,15,3,2010,11,OO and inheritance in JS explained by slide...
4,3633,10513231687,17197547,2010-03-15 11:28:17,8.54529,47.3675,Those flying M&Ms from @lejoe 's talk http://l...,15,3,2010,11,Those flying M&Ms from s talk earlier
5,3634,10515858205,17197547,2010-03-15 12:50:12,8.54528,47.3675,Functional Javascript after lunch with @michae...,15,3,2010,11,Functional Javascript after lunch with and
6,3635,10516759544,17197547,2010-03-15 13:15:11,8.54528,47.3675,Module pattern thanks to @triggerer http://lii...,15,3,2010,11,Module pattern thanks to
7,3636,10517040964,17197547,2010-03-15 13:22:41,8.54528,47.3675,Module pattern mission in progress http://yfro...,15,3,2010,11,Module pattern mission in progress
8,3637,10518509361,17197547,2010-03-15 14:00:16,8.54537,47.3676,jQuery pluton abuse following http://liip.to/j...,15,3,2010,11,jQuery pluton abuse following
9,3638,10519588824,17197547,2010-03-15 14:26:52,8.54527,47.3675,And finally @michaelk_ch minified version of h...,15,3,2010,11,And finally minified version of his nodejs ta...


In [207]:
dfTest = df.iloc[:20000]
dfTest

1,id,userId,createdAt,longitude,latitude,text,day,month,year,daily_tweets,processed_text
0,10510244063,17197547,2010-03-15 09:31:16,8.50810,47.3807,JavaScript tools talk at local.ch Javascript h...,15,3,2010,11,JavaScript tools talk at localch Javascript ha...
1,10511082338,17197547,2010-03-15 10:06:41,8.54438,47.3673,Mission in progress at Javascript special hack...,15,3,2010,11,Mission in progress at Javascript special hack...
2,10511125150,17197547,2010-03-15 10:08:29,8.54441,47.3676,Progressive Enhancement demonstrated with pean...,15,3,2010,11,Progressive Enhancement demonstrated with pean...
3,10512734279,17197547,2010-03-15 11:10:47,8.54551,47.3683,OO and inheritance in JS explained by @ivanjov...,15,3,2010,11,OO and inheritance in JS explained by slide...
4,10513231687,17197547,2010-03-15 11:28:17,8.54529,47.3675,Those flying M&Ms from @lejoe 's talk http://l...,15,3,2010,11,Those flying M&Ms from s talk earlier
5,10515858205,17197547,2010-03-15 12:50:12,8.54528,47.3675,Functional Javascript after lunch with @michae...,15,3,2010,11,Functional Javascript after lunch with and
6,10516759544,17197547,2010-03-15 13:15:11,8.54528,47.3675,Module pattern thanks to @triggerer http://lii...,15,3,2010,11,Module pattern thanks to
7,10517040964,17197547,2010-03-15 13:22:41,8.54528,47.3675,Module pattern mission in progress http://yfro...,15,3,2010,11,Module pattern mission in progress
8,10518509361,17197547,2010-03-15 14:00:16,8.54537,47.3676,jQuery pluton abuse following http://liip.to/j...,15,3,2010,11,jQuery pluton abuse following
9,10519588824,17197547,2010-03-15 14:26:52,8.54527,47.3675,And finally @michaelk_ch minified version of h...,15,3,2010,11,And finally minified version of his nodejs ta...


In [11]:
#  filter the blank cells
filter_text = (df["processed_text"] != "") & (df["processed_text"] != " ") & (df["processed_text"] != "  ") \
    & (df["processed_text"] != "   ") 
df = df[filter_text]
df.reset_index(inplace=True,drop = True)

In [None]:
tknzr = TweetTokenizer()
tknzr.tokenize(df.iloc[0].prcessed_text)

In [12]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(nltk.corpus.stopwords.words('french'))
stop_words.extend(nltk.corpus.stopwords.words('italian'))
stop_words.extend(nltk.corpus.stopwords.words('german'))

In [29]:
def nltk_tokenize(text):
    tokens = []
    pos_tokens = []
    entities = []
    features = []
    try:
        tokens = text.split()
        for word in tokens:
            if word.lower() not in stop_words and len(word) > 1:
                features.append(word)
    except: 
        pass
    return [tokens, pos_tokens, entities, features]

In [35]:
def custom_tokenize_text(text):
    REGEX = re.compile(r",\s*")
    tokens = []
    for tok in REGEX.split(text):
        #if "@" not in tok and "#" not in tok:
        if "@" not in tok:
            #tokens.append(stem(tok.strip().lower()))
            tokens.append(tok.strip().lower())
    return tokens

In [44]:
tweet_unixtime_old = -1
#fout.write("time window size in mins: " + str(time_window_mins))
tid_to_raw_tweet = {}
window_corpus = []
tid_to_urls_window_corpus = {}
tids_window_corpus = []
dfVocTimeWindows = {}
t = 0
ntweets = 0
for index, row in df.loc[:10000].iterrows():
    text = row.processed_text
    [tokens, pos_tokens, entities, features] = nltk_tokenize(text)
    tweet_bag = ""
    try:
#         for user in set(users):
#             tweet_bag += "@" + user.decode('utf-8').lower() + ","
#         for tag in set(hashtags):
#             if tag.decode('utf-8').lower() not in stop_words: 
#                 tweet_bag += "#" + tag.decode('utf-8').lower() + ","
        for feature in features:
            tweet_bag += feature + ","
            
        
    except:
        #print "tweet_bag error!", tweet_bag, len(tweet_bag.split(","))
        pass

#print tweet_bag.decode('utf-8')
    if  len(features) > 3 and len(tweet_bag.split(",")) > 4 and not str(features).upper() == str(features):
        tweet_bag = tweet_bag[:-1]
        window_corpus.append(tweet_bag)
        tids_window_corpus.append(row.id)
#         tid_to_urls_window_corpus[row.id] = media_urls
        tid_to_raw_tweet[row.id] = text


In [117]:
# The reason for min_df is that the cluster need to gather enough tweet to be considered a topic
vectorizer = CountVectorizer(tokenizer=custom_tokenize_text, binary=True,\
                            min_df=max(int(len(window_corpus)*0.0025), 10), ngram_range=(2,3))
X = vectorizer.fit_transform(window_corpus)
map_index_after_cleaning = {}
Xclean = np.zeros((1, X.shape[1]))
for i in range(0, X.shape[0]):
    #keep sample with size at least 5
    if X[i].sum() > 4:
        Xclean = np.vstack([Xclean, X[i].toarray()])
        map_index_after_cleaning[Xclean.shape[0] - 2] = i

In [170]:
Xclean = Xclean[1:,]
X = Xclean
Xdense = np.matrix(X).astype('float')
X_scaled = preprocessing.scale(Xdense) # doing some preprocessing to make the 
                                        #data suitable for machin learning algorithms
X_normalized = preprocessing.normalize(X_scaled, norm='l2')
#vocX = [str.encode(w) for w in vectorizer.get_feature_names()]
vocX = vectorizer.get_feature_names()
boost_entity = {}
pos_tokens = CMUTweetTagger.runtagger_parse([term.upper() for term in vocX],\
                                           run_tagger_cmd="java -XX:ParallelGCThreads=2 -Xmx500m -jar data/ark-tweet-nlp-0.3.2.jar")
for l in pos_tokens:
    term =''
    for gr in range(0, len(l)):
        term += l[gr][0].lower() + " "
    if "^" in str(l):
        boost_entity[term.strip()] = 2.5
    else: 
        boost_entity[term.strip()] = 1.0

b'BECAME MAYOR\nHARRY POTTER\nHEUTE RICHTIG\nHEUTE RICHTIG ARSCH\nLOOKS LIKE\nMAL SCHAUEN\nOKAY TWITTER\nOKAY TWITTER SCHEINT\nRICHTIG ARSCH\nSCHEINT HEUTE\nSCHEINT HEUTE RICHTIG\nSOCIAL MEDIA\nTWITTER SCHEINT\nTWITTER SCHEINT HEUTE' (b'BECAME\tV\t0.9798\nMAYOR\t^\t0.6688\n\nHARRY\t^\t0.9733\nPOTTER\t^\t0.9940\n\nHEUTE\t^\t0.3783\nRICHTIG\t^\t0.6048\n\nHEUTE\t^\t0.3783\nRICHTIG\t^\t0.7218\nARSCH\t^\t0.9132\n\nLOOKS\tV\t0.9878\nLIKE\tP\t0.8516\n\nMAL\t^\t0.3223\nSCHAUEN\t^\t0.8690\n\nOKAY\t!\t0.9264\nTWITTER\t^\t0.9936\n\nOKAY\t!\t0.9264\nTWITTER\t^\t0.9952\nSCHEINT\t^\t0.5115\n\nRICHTIG\t!\t0.3198\nARSCH\t!\t0.5046\n\nSCHEINT\t^\t0.2012\nHEUTE\t^\t0.9023\n\nSCHEINT\t^\t0.2012\nHEUTE\t^\t0.9315\nRICHTIG\t^\t0.7092\n\nSOCIAL\tA\t0.7770\nMEDIA\tN\t0.9625\n\nTWITTER\t^\t0.9771\nSCHEINT\t^\t0.4234\n\nTWITTER\t^\t0.9771\nSCHEINT\t^\t0.6369\nHEUTE\t^\t0.9343\n\n', b'Listening on stdin for input.  (-h for help)\nDetected text input format\nTokenized and tagged 14 tweets (32 tokens) in 0.5 seco

In [176]:
dfX = X.sum(axis=0)
dfVoc = {}
wdfVoc = {}
boosted_wdfVoc = {}
keys = vocX
vals = dfX
t = 5
for k,v in zip(keys, vals):
    dfVoc[k] = v
for k in dfVoc: 
    try:
        dfVocTimeWindows[k] += dfVoc[k]
        avgdfVoc = (dfVocTimeWindows[k] - dfVoc[k])/(t - 1)
    except:
        dfVocTimeWindows[k] = dfVoc[k]
        avgdfVoc = 0
    
    wdfVoc[k] = (dfVoc[k] + 1) / (np.log(avgdfVoc + 1) + 1)
    try:
        boosted_wdfVoc[k] = wdfVoc[k] * boost_entity[k]
    except: 
        boosted_wdfVoc[k] = wdfVoc[k]
        

In [196]:
distMatrix = pairwise_distances(X_normalized, metric='cosine')
L = fastcluster.linkage(distMatrix, method='average')
dt = 0.5
indL = sch.fcluster(L, dt*distMatrix.max(), 'distance')
freqTwCl = Counter(indL)
npindL = np.array(indL)
freq_th = max(10, int(X.shape[0]*0.0025))
cluster_score = {}
for clfreq in freqTwCl.most_common(50):
    cl = clfreq[0]
    freq = clfreq[1]
    cluster_score[cl] = 0
    if freq >= freq_th:
        clidx = (npindL == cl).nonzero()[0].tolist()
        cluster_centroid = X[clidx].sum(axis=0)
        try:
            cluster_tweet = vectorizer.inverse_transform(cluster_centroid)
            for term in np.nditer(cluster_tweet):
                try:
                    cluster_score[cl] = max(cluster_score[cl], boosted_wdfVoc[str(term).strip()])
                except: pass
        except: pass
        cluster_score[cl] /= freq
    else: break
sorted_clusters = sorted( ((v,k) for k,v in cluster_score.items()), reverse=True)

In [198]:
ntopics = 20
headline_corpus = []
orig_headline_corpus = []
headline_to_cluster = {}
headline_to_tid = {}
cluster_to_tids = {}
for score,cl in sorted_clusters[:ntopics]:
    clidx = (npindL == cl).nonzero()[0].tolist()
    first_idx = map_index_after_cleaning[clidx[0]]
    keywords = window_corpus[first_idx]
    orig_headline_corpus.append(keywords)
    headline = ''
    for k in keywords.split(","):
        if not '@' in k and not '#' in k:
            headline += k + ","
    headline_corpus.append(headline[:-1])
    headline_to_cluster[headline[:-1]] = cl
    headline_to_tid[headline[:-1]] = tids_window_corpus[first_idx]

    tids = []
    for i in clidx:
        idx = map_index_after_cleaning[i]
        tids.append(tids_window_corpus[idx])
    cluster_to_tids[cl] = tids

In [205]:
indHL = sch.fcluster(HL, dtH*distH.max(), 'distance')

ValueError: Linkage must be computed on at least two observations.

In [200]:
headline_vectorizer = CountVectorizer(tokenizer=custom_tokenize_text, binary=True, min_df=1, ngram_range=(1,1))
H = headline_vectorizer.fit_transform(headline_corpus)
print("H.shape:", H.shape)
vocH = headline_vectorizer.get_feature_names()

Hdense = np.matrix(H.todense()).astype('float')
distH = pairwise_distances(Hdense, metric='cosine')
#				print "fastcluster, avg, euclid"
HL = fastcluster.linkage(distH, method='average')
dtH = 1.0
indHL = sch.fcluster(HL, dtH*distH.max(), 'distance')
freqHCl = Counter(indHL)
print("hclust cut threshold:", dtH)
print("n_clusters:", len(freqHCl))
print(freqHCl)

H.shape: (1, 6)


ValueError: Linkage must be computed on at least two observations.

In [151]:
print(CMUTweetTagger.runtagger_parse(['example tweet 1', 'example tweet 2'],\
                                     run_tagger_cmd="jar cmvf META-INF/MANIFEST.MF data/ark-tweet-nlp-0.3.2-sources.jar"))

b'example tweet 1\nexample tweet 2' (b'', b'java.io.FileNotFoundException: META-INF/MANIFEST.MF (No such file or directory)\n\tat java.io.FileInputStream.open0(Native Method)\n\tat java.io.FileInputStream.open(FileInputStream.java:195)\n\tat java.io.FileInputStream.<init>(FileInputStream.java:138)\n\tat java.io.FileInputStream.<init>(FileInputStream.java:93)\n\tat sun.tools.jar.Main.run(Main.java:175)\n\tat sun.tools.jar.Main.main(Main.java:1288)\n')
[[]]


In [None]:
for line in file_timeordered_tweets:
        
    [tweet_unixtime, tweet_gmttime, tweet_id, text, hashtags, users, urls, media_urls, nfollowers, nfriends] = eval(line)
    if spam_tweet(text):
        continue

    if tweet_unixtime_old == -1:
        tweet_unixtime_old = tweet_unixtime

    if (tweet_unixtime - tweet_unixtime_old) < time_window_mins * 60:
        ntweets += 1

        features = process_json_tweet(text, fout, debug)
        tweet_bag = ""
        try:
            for user in set(users):
                tweet_bag += "@" + user.decode('utf-8').lower() + ","
            for tag in set(hashtags):
                if tag.decode('utf-8').lower() not in stop_words: 
                    tweet_bag += "#" + tag.decode('utf-8').lower() + ","
            for feature in features:
                tweet_bag += feature.decode('utf-8') + ","
        except:
            pass


        if len(users) < 3 and len(hashtags) < 3 and len(features) > 3 and len(tweet_bag.split(",")) > 4 and not str(features).upper() == str(features):
            tweet_bag = tweet_bag[:-1]

            window_corpus.append(tweet_bag)
            tids_window_corpus.append(tweet_id)
            tid_to_urls_window_corpus[tweet_id] = media_urls
            tid_to_raw_tweet[tweet_id] = text

    else:
            dtime = datetime.fromtimestamp(tweet_unixtime_old).strftime("%d-%m-%Y %H:%M")
            print "\nWindow Starts GMT Time:", dtime, "\n"
            tweet_unixtime_old = tweet_unixtime	

            t += 1

            vectorizer = CountVectorizer(tokenizer=custom_tokenize_text, binary=True, min_df=max(int(len(window_corpus)*0.0025), 10), ngram_range=(2,3))

            X = vectorizer.fit_transform(window_corpus)
            map_index_after_cleaning = {}
            Xclean = np.zeros((1, X.shape[1]))
            for i in range(0, X.shape[0]):
                if X[i].sum() > 4:
                    Xclean = np.vstack([Xclean, X[i].toarray()])
                    map_index_after_cleaning[Xclean.shape[0] - 2] = i

            Xclean = Xclean[1:,]
            print "total tweets in window:", ntweets
            print "X.shape:", X.shape
            print "Xclean.shape:", Xclean.shape

            X = Xclean
            Xdense = np.matrix(X).astype('float')
            X_scaled = preprocessing.scale(Xdense)
            X_normalized = preprocessing.normalize(X_scaled, norm='l2')

            vocX = vectorizer.get_feature_names()

            boost_entity = {}
            pos_tokens = CMUTweetTagger.runtagger_parse([term.upper() for term in vocX])

            for l in pos_tokens:
                term =''
                for gr in range(0, len(l)):
                    term += l[gr][0].lower() + " "
                if "^" in str(l):
                    boost_entity[term.strip()] = 2.5
                else: 	 		
                    boost_entity[term.strip()] = 1.0

            dfX = X.sum(axis=0)
            dfVoc = {}
            wdfVoc = {}
            boosted_wdfVoc = {}	
            keys = vocX
            vals = dfX
            for k,v in zip(keys, vals):
                dfVoc[k] = v
            for k in dfVoc: 
                try:
                    dfVocTimeWindows[k] += dfVoc[k]
                    avgdfVoc = (dfVocTimeWindows[k] - dfVoc[k])/(t - 1)
                    #avgdfVoc = (dfVocTimeWindows[k] - dfVoc[k])
                except:
                    dfVocTimeWindows[k] = dfVoc[k]
                    avgdfVoc = 0

                wdfVoc[k] = (dfVoc[k] + 1) / (np.log(avgdfVoc + 1) + 1)
                try:
                    boosted_wdfVoc[k] = wdfVoc[k] * boost_entity[k]
                except: 
                    boosted_wdfVoc[k] = wdfVoc[k]

            print "sorted wdfVoc*boost_entity:"
            print sorted( ((v,k) for k,v in boosted_wdfVoc.iteritems()), reverse=True)

            distMatrix = pairwise_distances(X_normalized, metric='cosine')


            print "fastcluster, average, cosine"
            L = fastcluster.linkage(distMatrix, method='average')


            dt = 0.5
            print "hclust cut threshold:", dt

            indL = sch.fcluster(L, dt*distMatrix.max(), 'distance')

            freqTwCl = Counter(indL)
            print "n_clusters:", len(freqTwCl)
            print(freqTwCl)

            npindL = np.array(indL)

            freq_th = max(10, int(X.shape[0]*0.0025))
            cluster_score = {}
            for clfreq in freqTwCl.most_common(50):
                cl = clfreq[0]
                freq = clfreq[1]
                cluster_score[cl] = 0
                if freq >= freq_th:

                    clidx = (npindL == cl).nonzero()[0].tolist()
                    cluster_centroid = X[clidx].sum(axis=0)

                    try:

                        cluster_tweet = vectorizer.inverse_transform(cluster_centroid)

                        for term in np.nditer(cluster_tweet):
                            try:
                                cluster_score[cl] = max(cluster_score[cl], boosted_wdfVoc[str(term).strip()])
                            except: pass 			
                    except: pass
                    cluster_score[cl] /= freq
                else: break

            sorted_clusters = sorted( ((v,k) for k,v in cluster_score.iteritems()), reverse=True)
            print "sorted cluster_score:"
            print sorted_clusters

            ntopics = 20
            headline_corpus = []
            orig_headline_corpus = []
            headline_to_cluster = {}
            headline_to_tid = {}
            cluster_to_tids = {}
            for score,cl in sorted_clusters[:ntopics]:

                clidx = (npindL == cl).nonzero()[0].tolist()

                first_idx = map_index_after_cleaning[clidx[0]]
                keywords = window_corpus[first_idx]
                orig_headline_corpus.append(keywords)
                headline = ''
                for k in keywords.split(","):
                    if not '@' in k and not '#' in k:
                        headline += k + ","
                headline_corpus.append(headline[:-1])
                headline_to_cluster[headline[:-1]] = cl
                headline_to_tid[headline[:-1]] = tids_window_corpus[first_idx]

                tids = []
                for i in clidx:
                    idx = map_index_after_cleaning[i]
                    tids.append(tids_window_corpus[idx])

            headline_vectorizer = CountVectorizer(tokenizer=custom_tokenize_text, binary=True, min_df=1, ngram_range=(1,1))
            H = headline_vectorizer.fit_transform(headline_corpus)
            print "H.shape:", H.shape
            vocH = headline_vectorizer.get_feature_names()


            Hdense = np.matrix(H.todense()).astype('float')
            distH = pairwise_distances(Hdense, metric='cosine')
            HL = fastcluster.linkage(distH, method='average')
            dtH = 1.0
            indHL = sch.fcluster(HL, dtH*distH.max(), 'distance')
            freqHCl = Counter(indHL)
            print "hclust cut threshold:", dtH
            print "n_clusters:", len(freqHCl)
            print(freqHCl)

            npindHL = np.array(indHL)
            hcluster_score = {}
            for hclfreq in freqHCl.most_common(ntopics):
                hcl = hclfreq[0]
                hfreq = hclfreq[1]
                hcluster_score[hcl] = 0
                hclidx = (npindHL == hcl).nonzero()[0].tolist()
                for i in hclidx:
                    hcluster_score[hcl] = max(hcluster_score[hcl], cluster_score[headline_to_cluster[headline_corpus[i]]])
            sorted_hclusters = sorted( ((v,k) for k,v in hcluster_score.iteritems()), reverse=True)
            print "sorted hcluster_score:"
            print sorted_hclusters

            for hscore, hcl in sorted_hclusters[:10]:
#					print "\n(cluster, freq):", hcl, freqHCl[hcl]
                hclidx = (npindHL == hcl).nonzero()[0].tolist()
                clean_headline = ''
                raw_headline = ''
                keywords = ''
                tids_set = set()
                tids_list = []
                urls_list = []
                selected_raw_tweets_set = set()
                tids_cluster = []
                for i in hclidx:
                    clean_headline += headline_corpus[i].replace(",", " ") + "//"
                    keywords += orig_headline_corpus[i].lower() + ","
                    tid = headline_to_tid[headline_corpus[i]]
                    tids_set.add(tid)
                    raw_tweet = tid_to_raw_tweet[tid].encode('utf8', 'replace').replace("\n", ' ').replace("\t", ' ')
                    raw_tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(pic\.twitter\.com/[^\s]+))','', raw_tweet)
                    selected_raw_tweets_set.add(raw_tweet.decode('utf8', 'ignore').strip())
                    tids_list.append(tid)
                    if tid_to_urls_window_corpus[tid]:
                        urls_list.append(tid_to_urls_window_corpus[tid])
                    for id in cluster_to_tids[headline_to_cluster[headline_corpus[i]]]:
                        tids_cluster.append(id)

                raw_headline = tid_to_raw_tweet[headline_to_tid[headline_corpus[hclidx[0]]]]
                raw_headline = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(pic\.twitter\.com/[^\s]+))','', raw_headline)
                raw_headline = raw_headline.encode('utf8', 'replace').replace("\n", ' ').replace("\t", ' ')
                keywords_list = str(sorted(list(set(keywords[:-1].split(",")))))[1:-1].encode('utf8', 'replace').replace('u\'','').replace('\'','')					

                for tid in tids_cluster:
                    if len(urls_list) < 1 and tid_to_urls_window_corpus[tid] and tid not in tids_set:
                            raw_tweet = tid_to_raw_tweet[tid].encode('utf8', 'replace').replace("\n", ' ').replace("\t", ' ')
                            raw_tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(pic\.twitter\.com/[^\s]+))','', raw_tweet)
                            raw_tweet = raw_tweet.decode('utf8', 'ignore')
                            #fout.write("\ncluster tweet: " + raw_tweet)
                            if raw_tweet.strip() not in selected_raw_tweets_set:
                                tids_list.append(tid)
                                urls_list.append(tid_to_urls_window_corpus[tid])
                                selected_raw_tweets_set.add(raw_tweet.strip())

                try:	
                    print "\n", clean_headline.decode('utf8', 'ignore')#, "\t", keywords_list

                except: pass					

                urls_set = set()
                for url_list in urls_list:
                    for url in url_list:
                        urls_set.add(url)

                fout.write("\n" + str(dtime) + "\t" + raw_headline.decode('utf8', 'ignore') + "\t" + keywords_list.decode('utf8', 'ignore') + "\t" + str(tids_list)[1:-1] + "\t" + str(list(urls_set))[1:-1][2:-1].replace('\'','').replace('uhttp','http'))

            #sys.exit()
            window_corpus = []
            tids_window_corpus = []
            tid_to_urls_window_corpus = {}
            tid_to_raw_tweet = {}
            ntweets = 0
            if t == 4:
                dfVocTimeWindows = {}
                t = 0