According to a study by Pear Analytics [16], about 40% of all the tweets are pointless “babbles” like “have to get something from the minimart downstairs”

In [1]:
from langdetect import detect
import pickle
from os import path
import re
from nltk.tokenize import TweetTokenizer
import nltk
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import CMUTweetTagger
from sklearn.metrics.pairwise import pairwise_distances
import scipy.cluster.hierarchy as sch
import fastcluster
from collections import Counter

In [2]:
DIR_DATA = path.join('data', 'twitter data')
DIR_GEO = path.join('data', 'geofiles')

In [3]:
# Loading the saved file is as easy as running these lines of code
with open(path.join(DIR_DATA, 'clean_data.pkl'), 'rb') as in_file:
    df = pickle.load(in_file)

The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.


In [4]:
df.sort_values(by='createdAt', ascending=1, inplace = True)

# Preprocessing

In [6]:
# Here we normalize the text, the code is taken from 
#https://github.com/heerme/twitter-topics/blob/master/twitter-topics-from-json-text-stream.py
def normalize_text(text):
    if type(text) is not str:
        print(text)
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(pic\.twitter\.com/[^\s]+))','', text)
    text = re.sub('@[^\s]+','', text)
    text = re.sub('#([^\s]+)', '', text)
    text = re.sub('[:;>?<=*+()/,\-#!$%\{˜|\}\[^_\\@\]1234567890’‘]',' ', text)
    text = re.sub('[\d]','', text)
    text = text.replace(".", '')
    text = text.replace("'", ' ')
    text = text.replace("\"", ' ')
    #text = text.replace("-", " ")
    #normalize some utf8 encoding
    text = text.replace("\x9d",' ').replace("\x8c",' ')
    text = text.replace("\xa0",' ')
    text = text.replace("\x9d\x92", ' ').replace("\x9a\xaa\xf0\x9f\x94\xb5", ' ').replace("\xf0\x9f\x91\x8d\x87\xba\xf0\x9f\x87\xb8", ' ').replace("\x9f",' ').replace("\x91\x8d",' ')
    text = text.replace("\xf0\x9f\x87\xba\xf0\x9f\x87\xb8",' ').replace("\xf0",' ').replace('\xf0x9f','').replace("\x9f\x91\x8d",' ').replace("\x87\xba\x87\xb8",' ')	
    text = text.replace("\xe2\x80\x94",' ').replace("\x9d\xa4",' ').replace("\x96\x91",' ').replace("\xe1\x91\xac\xc9\x8c\xce\x90\xc8\xbb\xef\xbb\x89\xd4\xbc\xef\xbb\x89\xc5\xa0\xc5\xa0\xc2\xb8",' ')
    text = text.replace("\xe2\x80\x99s", " ").replace("\xe2\x80\x98", ' ').replace("\xe2\x80\x99", ' ').replace("\xe2\x80\x9c", " ").replace("\xe2\x80\x9d", " ")
    text = text.replace("\xe2\x82\xac", " ").replace("\xc2\xa3", " ").replace("\xc2\xa0", " ").replace("\xc2\xab", " ").replace("\xf0\x9f\x94\xb4", " ").replace("\xf0\x9f\x87\xba\xf0\x9f\x87\xb8\xf0\x9f", "")
    return text

In [38]:
# Find the hashtags and users
df['Hashtags'] = df['text'].apply(lambda x:{tag.strip("#") for tag in x.split() if tag.startswith("#")})
df['users'] = df['text'].apply(lambda x:{tag.strip("@") for tag in x.split() if tag.startswith("@")})

In [7]:
df.dropna(subset = ['text'],inplace=True)

In [8]:
df['processed_text'] = df['text'].apply(lambda x: normalize_text(x))
df.reset_index(inplace = True, drop = True)

In [26]:
df = df.iloc[:20000]

In [10]:
#  filter the blank cells
filter_text = (df["processed_text"] != "") & (df["processed_text"] != " ") & (df["processed_text"] != "  ") \
    & (df["processed_text"] != "   ") 
df = df[filter_text]
df.reset_index(inplace=True,drop = True)

In [11]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(nltk.corpus.stopwords.words('french'))
stop_words.extend(nltk.corpus.stopwords.words('italian'))
stop_words.extend(nltk.corpus.stopwords.words('german'))

In [12]:
tknzr = TweetTokenizer()

In [13]:
def nltk_tokenize(text):
    tokens = []
    pos_tokens = []
    entities = []
    features = []
    try:
        tokens = tknzr.tokenize(text)
        #tokens = text.split()
        for word in tokens:
            if word.lower() not in stop_words and len(word) > 1:
                features.append(word)
    except: 
        pass
    return [tokens, pos_tokens, entities, features]

In [14]:
def custom_tokenize_text(text):
    REGEX = re.compile(r",\s*")
    tokens = []
    for tok in REGEX.split(text):
        #if "@" not in tok and "#" not in tok:
        if "@" not in tok:
            tokens.append(tok.strip().lower())
    return tokens

In [89]:
tweet_old_time = -1
window_analysis_time = 20  # This is the size of window that we analyze text inside it

tid_to_raw_tweet = {}
window_corpus = []
tid_to_urls_window_corpus = {}
tids_window_corpus = []
dfVocTimeWindows = {}
t = 0
ntweets = 0
for index, row in df.iterrows():
    tweet_current_time = row['createdAt']
    text = row['processed_text']
    users = row['users']
    hashtags = row['Hashtags']
    if tweet_old_time == -1:
        tweet_old_time = tweet_current_time
    if (tweet_current_time - tweet_old_time).days < window_analysis_time: # Inside the window we still gather the data
                                                                          # For analysis
        ntweets += 1
        [tokens, pos_tokens, entities, features] = nltk_tokenize(text)
        tweet_bag = ""
        try:
            for user in set(users):
                tweet_bag += "@" + user.decode('utf-8').lower() + ","
            for tag in set(hashtags):
                if tag.decode('utf-8').lower() not in stop_words: 
                    tweet_bag += "#" + tag.decode('utf-8').lower() + ","
            for feature in features:
                tweet_bag += feature + ","
        except:
            pass

        if len(users) < 3 and len(hashtags) < 3 and len(features) > 3 and len(tweet_bag.split(",")) > 4 \
                        and not str(features).upper() == str(features):

            tweet_bag = tweet_bag[:-1]
            window_corpus.append(tweet_bag)
            tids_window_corpus.append(row.id)
            tid_to_raw_tweet[row.id] = text
    else:
        tweet_old_time = tweet_current_time
        #increase window counter used in df-idf
        t += 1
        # The reason for min_df is that the cluster need to gather enough tweet to be considered a topic
        # CountVectorizer: Convert a collection of text documents to a matrix of token counts
        vectorizer = CountVectorizer(tokenizer=custom_tokenize_text, binary=True,\
                            min_df=max(int(len(window_corpus)*0.0025), 10), ngram_range=(2,3))
        try: # If the number of tweet is not enough
             # vectorizer.fit_transform Learn the vocabulary dictionary and return term-document matrix.
            X = vectorizer.fit_transform(window_corpus)
        except:
            continue
        map_index_after_cleaning = {}
        Xclean = np.zeros((1, X.shape[1]))
        for i in range(0, X.shape[0]):
            #keep sample with size at least 5
            if X[i].sum() > 4:
                Xclean = np.vstack([Xclean, X[i].toarray()])
                map_index_after_cleaning[Xclean.shape[0] - 2] = i
        Xclean = Xclean[1:,]
        #print("total tweets in window:", ntweets)
        #print("X.shape:", X.shape)
        #print("Xclean.shape:", Xclean.shape)
        X = Xclean
        Xdense = np.matrix(X).astype('float')
        # doing some preprocessing to make the 
        #data suitable for machin learning algorithms
        X_scaled = preprocessing.scale(Xdense)
        X_normalized = preprocessing.normalize(X_scaled, norm='l2')
        vocX = vectorizer.get_feature_names() # Array mapping from feature integer indices to feature name
        boost_entity = {}
        pos_tokens = CMUTweetTagger.runtagger_parse([term.upper() for term in vocX],\
                                           run_tagger_cmd="java -XX:ParallelGCThreads=2 -Xmx500m -jar data/ark-tweet-nlp-0.3.2.jar")
        

        for l in pos_tokens:
            term =''
            for gr in range(0, len(l)):
                term += l[gr][0].lower() + " "
            if "^" in str(l):
                boost_entity[term.strip()] = 2.5
            else: 
                boost_entity[term.strip()] = 1.0
        dfX = X.sum(axis=0)
        dfVoc = {}
        wdfVoc = {}
        boosted_wdfVoc = {}
        keys = vocX
        vals = dfX
        for k,v in zip(keys, vals):
            dfVoc[k] = v
        for k in dfVoc: 
            try:
                dfVocTimeWindows[k] += dfVoc[k]
                avgdfVoc = (dfVocTimeWindows[k] - dfVoc[k])/(t - 1)
            except:
                dfVocTimeWindows[k] = dfVoc[k]
                avgdfVoc = 0
            wdfVoc[k] = (dfVoc[k] + 1) / (np.log(avgdfVoc + 1) + 1)
            try:
                boosted_wdfVoc[k] = wdfVoc[k] * boost_entity[k]
            except: 
                boosted_wdfVoc[k] = wdfVoc[k]
        #print("sorted wdfVoc*boost_entity:")
        #print(sorted( ((v,k) for k,v in boosted_wdfVoc.items()), reverse=True))
        distMatrix = pairwise_distances(X_normalized, metric='cosine')
        L = fastcluster.linkage(distMatrix, method='average')
        dt = 1  # distance threshold for clustering
        indL = sch.fcluster(L, dt*distMatrix.max(), 'distance')
        freqTwCl = Counter(indL)
        print("n_clusters:", len(freqTwCl))
        print(freqTwCl)
        npindL = np.array(indL)
        freq_th = max(10, int(X.shape[0]*0.0025))
        cluster_score = {}
        for clfreq in freqTwCl.most_common(50):
            cl = clfreq[0]
            freq = clfreq[1]
            cluster_score[cl] = 0
            if freq >= freq_th:
                clidx = (npindL == cl).nonzero()[0].tolist()
                cluster_centroid = X[clidx].sum(axis=0)
                try:
                    cluster_tweet = vectorizer.inverse_transform(cluster_centroid)
                    for term in np.nditer(cluster_tweet):
                        try:
                            cluster_score[cl] = max(cluster_score[cl], boosted_wdfVoc[str(term).strip()])
                        except: pass
                except: pass
                cluster_score[cl] /= freq
            else: break
        sorted_clusters = sorted( ((v,k) for k,v in cluster_score.items()), reverse=True)
        print("sorted cluster",sorted_clusters)
        ntopics = 20
        headline_corpus = []
        orig_headline_corpus = []
        headline_to_cluster = {}
        headline_to_tid = {}
        cluster_to_tids = {}
        for score,cl in sorted_clusters[:ntopics]:
            clidx = (npindL == cl).nonzero()[0].tolist()
            first_idx = map_index_after_cleaning[clidx[0]]
            keywords = window_corpus[first_idx]
            orig_headline_corpus.append(keywords)
            headline = ''
            for k in keywords.split(","):
                if not '@' in k and not '#' in k:
                    headline += k + ","
            headline_corpus.append(headline[:-1])
            headline_to_cluster[headline[:-1]] = cl
            headline_to_tid[headline[:-1]] = tids_window_corpus[first_idx]

            tids = []
            for i in clidx:
                idx = map_index_after_cleaning[i]
                tids.append(tids_window_corpus[idx])
            cluster_to_tids[cl] = tids
        

b'HEUTE RICHTIG\nHEUTE RICHTIG ARSCH\nOKAY TWITTER\nOKAY TWITTER SCHEINT\nRICHTIG ARSCH\nSCHEINT HEUTE\nSCHEINT HEUTE RICHTIG\nTWITTER SCHEINT\nTWITTER SCHEINT HEUTE' (b'HEUTE\t^\t0.3783\nRICHTIG\t^\t0.6048\n\nHEUTE\t^\t0.3783\nRICHTIG\t^\t0.7218\nARSCH\t^\t0.9132\n\nOKAY\t!\t0.9264\nTWITTER\t^\t0.9936\n\nOKAY\t!\t0.9264\nTWITTER\t^\t0.9952\nSCHEINT\t^\t0.5115\n\nRICHTIG\t!\t0.3198\nARSCH\t!\t0.5046\n\nSCHEINT\t^\t0.2012\nHEUTE\t^\t0.9023\n\nSCHEINT\t^\t0.2012\nHEUTE\t^\t0.9315\nRICHTIG\t^\t0.7092\n\nTWITTER\t^\t0.9771\nSCHEINT\t^\t0.4234\n\nTWITTER\t^\t0.9771\nSCHEINT\t^\t0.6369\nHEUTE\t^\t0.9343\n\n', b'Listening on stdin for input.  (-h for help)\nDetected text input format\nTokenized and tagged 9 tweets (22 tokens) in 0.4 seconds: 24.5 tweets/sec, 59.8 tokens/sec\n')
n_clusters: 20
Counter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1})
sorted cluster [(0, 1)]
b'HEUTE RICHTIG\nHEUTE RICHTIG ARSCH\n

In [93]:
headline_to_tid

{'RBS,Bolligen,Bolligerstrasse,Bolligen,pic': 148373869860884480,
 'ja,mal,eher,hässlicher,Bahnhof,Bahnhof,Bern,Gare,Berne': 140348008280764416,
 'okay,Twitter,scheint,heute,richtig,Arsch': 16201777044}

In [None]:
indHL = sch.fcluster(HL, dtH*distH.max(), 'distance')

In [None]:
headline_vectorizer = CountVectorizer(tokenizer=custom_tokenize_text, binary=True, min_df=1, ngram_range=(1,1))
H = headline_vectorizer.fit_transform(headline_corpus)
print("H.shape:", H.shape)
vocH = headline_vectorizer.get_feature_names()

Hdense = np.matrix(H.todense()).astype('float')
distH = pairwise_distances(Hdense, metric='cosine')
#				print "fastcluster, avg, euclid"
HL = fastcluster.linkage(distH, method='average')
dtH = 1.0
indHL = sch.fcluster(HL, dtH*distH.max(), 'distance')
freqHCl = Counter(indHL)
print("hclust cut threshold:", dtH)
print("n_clusters:", len(freqHCl))
print(freqHCl)

In [None]:
print(CMUTweetTagger.runtagger_parse(['example tweet 1', 'example tweet 2'],\
                                     run_tagger_cmd="jar cmvf META-INF/MANIFEST.MF data/ark-tweet-nlp-0.3.2-sources.jar"))