Forked from the work on Rachael's stream: https://www.kaggle.com/rebeccaturner/forum-post-embeddings
https://www.youtube.com/watch?v=jvPpxmp_y34


In [None]:
import pandas as pd
import yake_helper_funcs as yhf
import numpy as np
from nltk.tokenize import RegexpTokenizer
from datetime import datetime
# Load the Meta Kaggle forum messages and keep a 10% sample. The sample is
# seeded so that every run of the notebook works on the same posts.
forum_posts = pd.read_csv("../input/meta-kaggle/ForumMessages.csv")
forum_posts = forum_posts.sample(frac = .1, random_state = 0)
# Missing messages become empty strings; a plain astype(str) would turn NaN
# into the literal text "nan", which would later be tokenized as a real word.
forum_posts["Message"] = forum_posts["Message"].fillna("").astype(str)
forum_posts["PostDate"] = pd.to_datetime(forum_posts["PostDate"])

In [None]:
# Lower-case each message and split it into runs of word characters, so the
# "Message" column ends up holding one list of tokens per post.
tokenizer = RegexpTokenizer(r'\w+')
forum_posts["Message"] = [
    tokenizer.tokenize(message.lower())
    for message in forum_posts["Message"].tolist()
]

In [None]:
# Number of month-long windows, counting back from today, to analyse.
months_lookback = 12

In [None]:
# Reference point for the monthly windows: the current local (naive) time.
today = datetime.now()

In [None]:
from dateutil.relativedelta import relativedelta
# Window boundaries, newest first: today, one month ago, two months ago, ...
# down to months_lookback months ago (months_lookback + 1 timestamps in all).
month_ranges = [
    pd.to_datetime(today - relativedelta(months=offset))
    for offset in range(months_lookback + 1)
]

In [None]:
# Split the posts into consecutive monthly windows.
# dict_of_dfs maps each window's end timestamp to the posts inside it, and
# month_shapes records how many posts each window holds, in the same order.
dict_of_dfs = {}
month_shapes = []
for window_end, window_start in zip(month_ranges[:-1], month_ranges[1:]):
    # Both bounds are exclusive.
    before_end = forum_posts['PostDate'] < window_end
    after_start = forum_posts['PostDate'] > window_start
    posts_in_window = forum_posts.loc[before_end & after_start]
    month_shapes.append(len(posts_in_window))
    dict_of_dfs[window_end] = posts_in_window

# Get word vectors for posts

In [None]:
from gensim.models import KeyedVectors
# Load the word vectors from a word2vec-format file (binary = False selects
# the text variant of that format).
w2v = KeyedVectors.load_word2vec_format("../input/fine-tuning-word2vec-2-0/kaggle_word2vec.model", binary = False)

In [None]:
def vectors_from_post(posts, model = None):
    """Embed each post as the mean of its tokens' word vectors.

    Parameters
    ----------
    posts : sequence of sequences of str
        One list of tokens per post.
    model : mapping-like, optional
        Word-vector lookup supporting ``token in model`` and ``model[token]``.
        Defaults to the notebook-level ``w2v`` model. If it exposes a
        ``vector_size`` attribute that is used as the embedding width,
        otherwise 300 is assumed.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(posts), vector_size)``. Row ``i`` is the mean
        vector of the in-vocabulary tokens of ``posts[i]``; posts that are
        empty or contain no known token are left as all-zero rows.
    """
    if model is None:
        model = w2v
    vector_size = getattr(model, "vector_size", 300)
    post_vectors = np.zeros(shape = (len(posts), vector_size))
    for i, post in enumerate(posts):
        # Skip unknown tokens individually: looking up the whole post at once
        # fails on the first out-of-vocabulary word and would discard the post.
        known_tokens = [token for token in post if token in model]
        if not known_tokens:
            # Nothing to average; keep the zero row.
            continue
        post_vectors[i] = np.mean([model[token] for token in known_tokens], axis = 0)
    return post_vectors

In [None]:
# Will hold one array of post vectors per monthly window.
months_vectors = []

In [None]:
# Embed the posts of every monthly window, keeping dict_of_dfs order, and
# append each resulting array to months_vectors.
months_vectors.extend(
    vectors_from_post(month_df["Message"].tolist())
    for month_df in dict_of_dfs.values()
)

In [None]:
# Stack the per-month arrays row-wise (axis 0 is the default) into a single
# matrix with one row per post.
all_vecs = np.concatenate(months_vectors)

In [None]:
# Fetch the Multicore-TSNE sources from GitHub and install the package from
# the local checkout.
!git clone https://github.com/DmitryUlyanov/Multicore-TSNE.git
!pip install ./Multicore-TSNE/

In [None]:
from MulticoreTSNE import MulticoreTSNE as TSNE

In [None]:
# Display the dimensions of the stacked post-vector matrix.
all_vecs.shape

In [None]:
from plotly import offline
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from sklearn.decomposition import TruncatedSVD
# Number of iterations for the SVD solver.
n_iters = 100

# Reduce the post vectors to 3 components (one per axis of a 3-D scatter).
# n_iter takes the n_iters constant rather than a duplicated literal, and
# random_state pins the solver so repeated runs give the same projection.
reducer = TruncatedSVD(n_components=3, n_iter = n_iters, random_state = 0)
reduced_dimensions = reducer.fit_transform(all_vecs)

    
def add_plot(reduced_dimensions, text, name, color = 'rgba(255, 255, 0, .5)', words_to_show = -1):
    """Build a 3-D scatter trace for one group of points.

    Parameters
    ----------
    reduced_dimensions : 2-D array
        Point coordinates; columns 0, 1 and 2 are used as x, y and z.
    text : sequence of str
        Hover text, one entry per row of ``reduced_dimensions``.
    name : str
        Name given to the trace.
    color : str
        Colour of the outline drawn around each marker.
    words_to_show : int or None
        Maximum number of leading points to include. ``None`` or any negative
        value (including the default ``-1``) includes every point.

    Returns
    -------
    plotly.graph_objs.Scatter3d
        The trace, ready to be added to a figure.
    """
    init_notebook_mode(connected=True)
    # Slicing with [:-1] would silently drop the last point, so the
    # "show everything" sentinel is mapped to an open-ended slice instead.
    if words_to_show is None or words_to_show < 0:
        limit = None
    else:
        limit = words_to_show
    shown_points = reduced_dimensions[:limit]
    shown_text = text[:limit]
    # Report how many points and hover labels go into the trace.
    print(len(shown_points[:, 0]))
    print(len(shown_text))
    embeds = go.Scatter3d(
        name = name,
        x=shown_points[:, 0],
        y=shown_points[:, 1],
        z=shown_points[:, 2],
        mode='markers',
        text = shown_text,
        marker=dict(
            size=12,
            line=dict(
                color=color,
                width=0.1
            ),
            opacity=1.0
        )
    )
    return embeds



Interactive visualisation of the averaged post embeddings over the last 12 months. Use it to check whether there are any major changes throughout the year... or to realize that all anyone ever does on Kaggle is say thank you.

In [None]:
# Build one 3-D scatter trace per monthly window and show them in one figure.
month_indexes = 0  # running row offset of the current month
words_to_show = 100  # cap on the number of points plotted per month
data = []
for i, (key, value) in enumerate(dict_of_dfs.items()):
    text = [" ".join(sent) for sent in value["Message"]]
    # Plot the 3-component SVD projection, not the first three raw embedding
    # columns. Rows of reduced_dimensions follow the same month-by-month order
    # as all_vecs, so the running offset selects this month's posts.
    vecs = reduced_dimensions[month_indexes:month_indexes + month_shapes[i], :]
    month_indexes += month_shapes[i]
    data.append(add_plot(vecs, text, name = str(key), words_to_show = words_to_show))
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='simple-3d-scatter')

In [None]:
from sklearn.cluster import MeanShift, DBSCAN

In [None]:
%%time
# Cluster the post vectors with DBSCAN using its default parameters;
# cluster_preds receives the result of fit_predict on all_vecs.
clusterer = DBSCAN()
cluster_preds = clusterer.fit_predict(all_vecs)

In [None]:
from collections import defaultdict

In [None]:
# Group the post texts by the cluster label assigned to each post.
month_indexes = 0  # running row offset of the current month in cluster_preds
cluster_text = defaultdict(list)
for i, (key, value) in enumerate(dict_of_dfs.items()):
    text = [" ".join(sent) for sent in value["Message"]]
    # Take this month's labels BEFORE advancing the offset; slicing after the
    # increment would pair each post with a label from the following month.
    month_clusters = cluster_preds[month_indexes:month_indexes + month_shapes[i]]
    month_indexes += month_shapes[i]
    for post_text, cluster in zip(text, month_clusters):
        cluster_text[cluster].append(post_text)

In [None]:
# For every cluster, print its label, a separator line, and up to the first
# eleven posts assigned to it.
for clust, clust_posts in cluster_text.items():
    print(clust)
    print("*" * 80)
    for clust_val in clust_posts[:11]:
        print(clust_val)