In [1]:
import gensim.parsing
import pandas as pd
import os
import re
import string

import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, silhouette_samples

nltk.download('stopwords')

ratio_to_train = .05

data = pd.read_csv('training_data/data.csv', encoding='latin-1')
data.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']

data = data.drop(['id', 'date', 'query', 'user'], axis=1)
data = data.sample(frac=ratio_to_train)

[nltk_data] Downloading package stopwords to /home/crispy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def clean_text(text: str, tokenizer, stopwords):
    """
    Preprocess text and generate tokens
    :param text: Text to tokenize
    :type text: str
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens

In [3]:
custom_stopwords = set(stopwords.words("english"))

data["tokens"] = data["text"].map(lambda x: clean_text(x, word_tokenize, custom_stopwords))

# Remove duplicated after preprocessing
_, idx = np.unique(data["tokens"], return_index=True)
data = data.iloc[idx, :]

# Remove empty values and keep relevant columns
data = data.loc[data.tokens.map(lambda x: len(x) > 0), ["text", "tokens"]]

docs = data["text"].values
tokenized_docs = data["tokens"].values

In [4]:
model = Word2Vec(sentences=tokenized_docs, vector_size=100)

In [5]:
def vectorize(list_of_docs, model):
    """
    Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features

In [6]:
vectorized_docs = vectorize(tokenized_docs, model=model)

In [7]:
def mbkmeans_clusters(
	X,
    k,
    mb,
    print_silhouette_values,
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

In [8]:
clustering, cluster_labels = mbkmeans_clusters(
	X=vectorized_docs,
    k=50,
    mb=500,
    print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
    "text": docs,
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": cluster_labels
})



For n_clusters = 50
Silhouette coefficient: 0.06
Inertia:98873.54537178887
Silhouette values:
    Cluster 49: Size:1351 | Avg:0.76 | Min:0.21 | Max: 0.88
    Cluster 7: Size:122 | Avg:0.52 | Min:-0.05 | Max: 0.74
    Cluster 13: Size:357 | Avg:0.46 | Min:0.02 | Max: 0.62
    Cluster 34: Size:367 | Avg:0.20 | Min:-0.12 | Max: 0.46
    Cluster 22: Size:3027 | Avg:0.19 | Min:-0.21 | Max: 0.43
    Cluster 28: Size:405 | Avg:0.17 | Min:-0.18 | Max: 0.43
    Cluster 4: Size:5423 | Avg:0.11 | Min:-0.10 | Max: 0.31
    Cluster 27: Size:2258 | Avg:0.11 | Min:-0.16 | Max: 0.35
    Cluster 43: Size:626 | Avg:0.10 | Min:-0.13 | Max: 0.35
    Cluster 12: Size:729 | Avg:0.09 | Min:-0.20 | Max: 0.36
    Cluster 44: Size:1550 | Avg:0.08 | Min:-0.14 | Max: 0.31
    Cluster 38: Size:1186 | Avg:0.08 | Min:-0.16 | Max: 0.32
    Cluster 11: Size:350 | Avg:0.08 | Min:-0.14 | Max: 0.31
    Cluster 25: Size:2843 | Avg:0.07 | Min:-0.14 | Max: 0.28
    Cluster 33: Size:839 | Avg:0.07 | Min:-0.15 | Max: 0.30
   

In [9]:
print("Most representative terms per cluster (based on centroids):")
for i in range(50):
    tokens_per_cluster = ""
    most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=5)
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: nite nights slept saturday stayed 
Cluster 1: werk sack procrastination decided argh 
Cluster 2: gr8 fabulous aussie tweeple fellow 
Cluster 3: idk anyone cuz try wouldnt 
Cluster 4: pants ground difficult swing somehow 
Cluster 5: hold autograph theyll anywhere hurry 
Cluster 6: exhausted church 7am revision maths 
Cluster 7: wwwtweeterfollowcom vip using wwwtweeteraddercom add 
Cluster 8: hahahah honey huh pathetic alot 
Cluster 9: stomach shit tummy ache colds 
Cluster 10: wants ima bout suck thiss 
Cluster 11: day afternoon sunday morning sunny 
Cluster 12: album brand york site website 
Cluster 13: thanks thank followfriday welcome following 
Cluster 14: know worry either understand mean 
Cluster 15: dear aw hun dude saying 
Cluster 16: idk seriously cuz suffer everytime 
Cluster 17: gym nap leaving study homework 
Cluster 18: sooo im soooo hungry baby 
Cluster 19: dear yeah jason2008 ah aw 
Cluster 20: serious

In [10]:
test_cluster = 49
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
for d in most_representative_docs[:10]:
    print(docs[d])
    print("-------------")

@langfordperry  
-------------
@ryanapsmith lulz 
-------------
@brittany Jsmith 
-------------
@Pandaaaaaaaaa ahem.   
-------------
@nkvanhoosier what ru what huh'ing? 
-------------
@goddessFabo Fabiiiii  how ru?
-------------
@little94 Hiii 
-------------
@shadowfish Meanie 
-------------
@nickdoesthis whyyy? 
-------------
@ludens_ twitterrific? ì¶ì²í©ëë¤. 
-------------


In [71]:
from collections import Counter
from nltk import ngrams
from tqdm.notebook import tqdm

compiled_docs = []
for doc in tqdm(docs):
    compiled_docs.extend(doc.split())

stopwords_words = stopwords.words() + ["I", "wish", "will", ".", "..."]

new_docs = [word for word in tqdm(compiled_docs) if word not in stopwords_words]

ngram_counts = Counter(ngrams(new_docs, 3))

  0%|          | 0/79152 [00:00<?, ?it/s]

  0%|          | 0/1050105 [00:00<?, ?it/s]

In [78]:
ngram_counts.most_common(100)

[(('Get', '100', 'followers'), 73),
 (('100', 'followers', 'day'), 73),
 (('Once', 'add', 'train'), 73),
 (('add', 'train', 'pay'), 73),
 (('train', 'pay', 'vip'), 73),
 (('followers', 'day', 'www.tweeteradder.com'), 50),
 (('day', 'www.tweeteradder.com', 'Once'), 50),
 (('www.tweeteradder.com', 'Once', 'add'), 50),
 (('lost.', 'Please', 'find'), 34),
 (('Please', 'find', 'home.'), 34),
 (('Happy', "Mother's", 'Day'), 29),
 (("I'm", 'gonna', 'miss'), 26),
 (('followers', 'day', 'www.tweeterfollow.com'), 23),
 (('day', 'www.tweeterfollow.com', 'Once'), 23),
 (('www.tweeterfollow.com', 'Once', 'add'), 23),
 (("I'm", 'sorry', 'hear'), 23),
 (('mtv', 'movie', 'awards'), 20),
 (('limit', 'story', '140'), 19),
 (('x', 'x', 'x'), 19),
 (('ily', 'ily', 'ily'), 19),
 (('Tell', 'annoying', 'www.iamsoannoyed.com'), 18),
 (('story', '140', 'ch.'), 17),
 (('140', 'ch.', 'Tell'), 17),
 (('ch.', 'Tell', 'annoying'), 16),
 (('Just', 'woke', 'up.'), 14),
 (('back', 'work', 'tomorrow'), 13),
 (('Happy',

In [82]:
last = (0, 0, 0)
best = ngram_counts.most_common(100)
best_list = []
for i in best:
    if not any(item in last for item in i):
        best_list.append(i[0])

    last = i

labeled_docs = {}
reversed_labels = {}
for i in docs:
    for item in best_list:
        if all(f in i for f in item):
            if i in labeled_docs.keys():
                labeled_docs[i].append(item)
            else:
                labeled_docs[i] = [item]
            if item in reversed_labels.keys():
                reversed_labels[item].append(i)
            else:
                reversed_labels[item] = [i]
            break

In [86]:
for i in labeled_docs.keys():
    print(f"{i}: {labeled_docs[i]}")
reversed_labels

11:50am. Just woke up. I'm gonna be late to Narnia. : [('Just', 'woke', 'up.')]
No 16GB iPhone 3GS in my local O2 Store : [('iPhone', '3G', 'S')]
@30SECONDSTOMARS ::waiting for the competitiion:: heh, guys the last few Kytemovies have no sound.  Can't wait to hear new music! xxx: [("Can't", 'wait', 'hear')]
@3rdmusik Get 100 followers a day using www.tweeteradder.com Once you add everyone you are on the train or pay vip : [('Get', '100', 'followers')]
My 7yo is very excited about singing Mother's Day songs for me in church today.  Happy Mother's Day, Moms!: [('Happy', "Mother's", 'Day')]
Has Been In Aaaaall Day.. Playing The Sims 3 Most Of The Day Lol : [('The', 'Sims', '3')]
@abrialukrocks Get 100 followers a day using www.tweeterfollow.com Once you add everyone you are on the train or pay vip : [('Get', '100', 'followers')]
has an addiction... The Sims 3 for iPhone.  and I won't go to rehab.: [('The', 'Sims', '3')]
Ah haha happy mother's day im stuck at work till 8 : [('happy', "moth

{('Just',
  'woke',
  'up.'): ["11:50am. Just woke up. I'm gonna be late to Narnia. ", 'Mmm.... Just woke up... Garage and estate sales with pancake breakfast thrown in.... Perfect morning!!! Loves boyfriend.... ', 'Just woke up. Ate a bowl of cereal. Should be getting ready for work. Blahhhh!!! ', 'Just woke up. No bread left, need to have my daily nuttela sandwich. Sunny day today. Off to sixth form in a while. ', 'Just woke up. Day with @susie_santi shut my energy off after ', 'Just woke up. feeling much better! watching selena on regis and kelly ', 'Just woke up. Now to go get ready to go town with @xhonor and possibly @xoFlawless ', 'Just woke up. I gotta work wharf 11:30-4 ', 'Just woke up. got work from 10am-3pm ', 'Just woke up. Had a great sleep ', 'Just woke up. What a nice day outside! Getting ready to go do some shopping! ', 'Just woke up. Shower time. Everyone will be here in a little bit!! ', 'Just woke up. Soo tired ', 'Just woke up. Still in bed ', 'Just woke up...sleep