## Read and clean data

In [1]:
import pandas as pd

In [12]:
%%time 
df = pd.read_csv('/Users/stefan/Downloads/processed_tweets_chunks/top100users_processed.csv')\
    .dropna()

CPU times: user 2.59 s, sys: 260 ms, total: 2.85 s
Wall time: 2.85 s


In [13]:
f'{len(df):,}'

'198,203'

In [14]:
%time df.text_token = df.text_token.map(eval)

CPU times: user 5.02 s, sys: 15.2 ms, total: 5.03 s
Wall time: 5.03 s


---

## Embeddings

[Pretrained glove models](https://github.com/RaRe-Technologies/gensim-data#models)

[2B tweets models manual download](http://nlp.stanford.edu/data/glove.twitter.27B.zip)

In [30]:
import numpy as np
import gensim.downloader

In [8]:
%time glove = gensim.downloader.load('glove-twitter-50')  # also downloads 199MB model

CPU times: user 1min 7s, sys: 785 ms, total: 1min 8s
Wall time: 1min 9s


In [86]:
def aggregate_embeddings(tokens: [str]) -> [float]:
    return np.array([glove.get_vector(t) for t in tokens]).sum(axis=0)

In [106]:
%%time
# keep just tokens that are in the glove vocab
tweet_tokens = [
    [t for t in tokens if t in glove]
    for tokens in df.text_token.tolist()
]

CPU times: user 1 s, sys: 10.7 ms, total: 1.02 s
Wall time: 1.01 s


In [107]:
%time valid_mask = [bool(tokens) for tokens in tweet_tokens]  # will be used for putting the results back into the df

CPU times: user 22.9 ms, sys: 2.43 ms, total: 25.4 ms
Wall time: 24.6 ms


In [108]:
%time tweet_tokens = [tokens for tokens in tweet_tokens if tokens]  # keep just non-empty entries

CPU times: user 8.19 ms, sys: 1.9 ms, total: 10.1 ms
Wall time: 9.61 ms


In [109]:
%time X = np.array([aggregate_embeddings(tokens) for tokens in tweet_tokens]) # faster than map

CPU times: user 4.07 s, sys: 24.3 ms, total: 4.1 s
Wall time: 4.1 s


In [110]:
X.shape

(190981, 50)

---

In [96]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import euclidean_distances

In [92]:
kmeans = MiniBatchKMeans(n_clusters=10, batch_size=128, n_init=20)

In [93]:
%time kmeans.fit(X)

CPU times: user 1.32 s, sys: 121 ms, total: 1.44 s
Wall time: 606 ms


MiniBatchKMeans(batch_size=128, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=10,
        n_init=20, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=0)

In [97]:
%%time 
# compute a "soft membership": the distance from each vector to each centroid
pdists = euclidean_distances(X, kmeans.cluster_centers_)

CPU times: user 99.1 ms, sys: 28.3 ms, total: 127 ms
Wall time: 31.6 ms


---

In [112]:
%%time
for cluster_number, dists in enumerate(pdists.T):
    df.loc[valid_mask, f'cluster_{cluster_number}_dist'] = dists

CPU times: user 348 ms, sys: 146 ms, total: 493 ms
Wall time: 496 ms


In [115]:
pd.DataFrame(kmeans.cluster_centers_).to_csv('glove_centroids.csv', index=False)