In [18]:
!pip install emoji
!pip install spacy
!pip install sentence_transformers
import pandas as pd
import numpy as np
import emoji
import regex
import nltk
import re
import sklearn
import string
import spacy
from collections import Counter
from sentence_transformers import SentenceTransformer

In [19]:
# NLTK resources used further down: the stop-word lists, WordNet for the
# lemmatizer, and the Punkt tokenizer data that word_tokenize() loads.
# Without the Punkt data, word_tokenize() raises LookupError on a fresh machine.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')      # tokenizer models looked up by older NLTK releases
nltk.download('punkt_tab')  # tokenizer tables looked up by newer NLTK releases

In [20]:
# Read the raw export and report its row count before any filtering.
df = pd.read_csv('data2021.csv')
print(df.shape[0])

# Keep one row per account: the first occurrence of each user_id.
df = df.drop_duplicates(subset='user_id', keep='first')

In [21]:
# Discard accounts whose profile description is missing (NaN).
df = df.dropna(subset=['user_description'])

In [22]:
# Display the remaining profile descriptions (inspection only; nothing is modified).
df['user_description']

In [23]:
def give_emoji_free_text(text):
    """
    Removes every whitespace-separated token that contains an emoji.

    The whole token is dropped, not only the emoji character, so a word with
    an emoji glued to it disappears as well. Because the result is rebuilt
    from ``text.split()``, runs of whitespace collapse to single spaces and
    leading/trailing whitespace is removed.

    Accepts:
        text (str): raw text, e.g. a tweet or a profile description
    Returns:
        str: the text without emoji-bearing tokens
    """
    # A token survives only if none of its characters is a key of
    # emoji.EMOJI_DATA. Testing the characters of each token directly replaces
    # the earlier two-pass scan and no longer shadows the builtin `str`.
    kept_tokens = [
        token
        for token in text.split()
        if not any(char in emoji.EMOJI_DATA for char in token)
    ]
    return ' '.join(kept_tokens)

def url_free_text(text):
    '''
    Strips URLs from a piece of text.

    Every occurrence of "http" followed by one or more non-whitespace
    characters is deleted; the surrounding whitespace is left untouched.
    '''
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

def call_emoji_free(x):
    """Thin wrapper that forwards a single text to give_emoji_free_text."""
    return give_emoji_free_text(x)

# New column: the profile description with emoji-bearing tokens removed
df['emoji_free_tweets'] = df['user_description'].apply(call_emoji_free)

# New column: the emoji-free text with URLs removed as well
df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

In [24]:
# Series.str.rstrip() returns a new Series, so the result has to be assigned
# back; otherwise the trailing whitespace stays in the column and only the
# displayed output looks stripped.
df['url_free_tweets'] = df['url_free_tweets'].str.rstrip()
df['url_free_tweets']

In [25]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

wordnet_lemmatizer = WordNetLemmatizer()

# Translation table that deletes every character of string.punctuation; built
# once instead of testing each character of each tweet against the string.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use by _combined_stopwords() so the NLTK stop-word corpora
# are read once, not once per processed row.
_STOPWORD_CACHE = {}


def _combined_stopwords():
    """Return the Dutch and English NLTK stop words as one frozenset, loaded once."""
    if 'dutch+english' not in _STOPWORD_CACHE:
        dutch = frozenset(nltk.corpus.stopwords.words('dutch'))
        english = frozenset(nltk.corpus.stopwords.words('english'))
        _STOPWORD_CACHE['dutch+english'] = dutch | english
    return _STOPWORD_CACHE['dutch+english']


def preprocessing(tweet):
    '''Preprocess a tweet into a list of lemmatized tokens.

    Steps, in order: remove the characters of string.punctuation, lowercase,
    tokenize with the Dutch tokenizer, drop Dutch and English stop words, and
    lemmatize every remaining token with the WordNet lemmatizer.

    Args:
        tweet (str): the text to process.

    Returns:
        list of str: the processed tokens; empty if nothing survives.
    '''
    tweet = tweet.translate(_PUNCTUATION_TABLE)  # remove punctuation
    tweet = tweet.lower()  # lowercase

    tokens = word_tokenize(tweet, language="dutch")  # tokenize the tweet

    # One set lookup against the union is equivalent to filtering on the Dutch
    # list and then on the English list.
    # NOTE(review): punctuation is removed before this step, so stop words
    # that contain an apostrophe can no longer match their list entry - confirm
    # whether that is intended.
    stopwords = _combined_stopwords()
    tokens = [token for token in tokens if token not in stopwords]  # remove stopwords

    # NOTE(review): WordNetLemmatizer presumably only knows English lemmas, so
    # Dutch tokens likely pass through unchanged - confirm that this is acceptable.
    return [wordnet_lemmatizer.lemmatize(token) for token in tokens]  # lemmatize the tweets
    

In [None]:
# Normalise every description; each cell of 'preprocessed' holds a list of tokens.
df['preprocessed'] = df['url_free_tweets'].apply(preprocessing)

# Drop rows whose token list came out empty. Checking the list length on this
# one column gives the same rows as comparing the stringified value with '[]',
# without casting the entire DataFrame to strings first.
df = df[df['preprocessed'].map(len) > 0]

In [None]:
# Display the token lists produced by preprocessing() (inspection only).
df['preprocessed']

In [None]:
# Load the pretrained sentence-embedding model.
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Each entry of 'preprocessed' is a token list; glue the tokens back together
# with single spaces so that every description is one string again.
sentences_list = df['preprocessed'].values
sentences = [" ".join(token_list) for token_list in sentences_list]

# One embedding per sentence, in the same order as `sentences`.
embeddings = model.encode(sentences)

In [None]:
#Print the embeddings
#for sentence, embedding in zip(sentences, embeddings):
#    print("Sentence:", sentence)
#    print("Embedding:", embedding)
#    print("")

In [None]:
!pip install yellowbrick

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer
from matplotlib import pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')

# Elbow method: fit k-means for k = 1..14 and record the inertia of every fit.
K = range(1,15)
Sum_of_squared_distances = []
for k in K:
    # A fixed random_state makes the curve identical on every re-run; without
    # it the centroid initialisation, and therefore the inertia, varies.
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(embeddings)
    Sum_of_squared_distances.append(km.inertia_)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('SSE')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples 

def mbkmeans_clusters(
    X,
    k,
    mb,
    print_silhouette_values,
    random_state=None,
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.
        random_state: Optional seed forwarded to MiniBatchKMeans. The default
            (None) keeps the previous, unseeded behaviour.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, labels):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, labels)
        print("Silhouette values:")
        silhouette_values = []
        empty_clusters = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            if cluster_silhouette_values.size == 0:
                # No sample carries this label; min()/max() on an empty array
                # would raise ValueError, so report the cluster separately.
                empty_clusters.append(i)
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters (highest average silhouette) first.
        silhouette_values.sort(key=lambda tup: tup[2], reverse=True)
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
        for i in empty_clusters:
            print(f"    Cluster {i}: Size:0 (no samples assigned)")
    return km, labels

# Cluster the sentence embeddings as they are: one row per sentence, one
# column per embedding dimension. The singular name `embedding` is not defined
# by any visible cell (NameError on a fresh kernel), and reshape(-1, 1) would
# flatten the matrix into a single column of scalars, so the resulting labels
# would no longer correspond to sentences.
embedding_rs = embeddings  # name kept for any later cell that refers to it

clustering, cluster_labels = mbkmeans_clusters(
    X=embedding_rs,
    k=50,
    mb=500,
    print_silhouette_values=True,
)

In [None]:
# Final model with 23 clusters. The fixed random_state keeps the cluster ids
# stable across re-runs, which matters because the labels are saved to disk
# further down.
km = KMeans(n_clusters=23, n_init=10, random_state=42)
km = km.fit(embeddings)

In [None]:
# `km` was already fitted on `embeddings` in the previous cell; reading
# labels_ reuses that fit, whereas fit_predict() would train the model again.
label = km.labels_

In [None]:
import json
# Persist the cluster assignments as a JSON array, in the same order as `label`.
data = label.tolist()
with open('data.json', 'w') as f:
    f.write(json.dumps(data))

In [None]:
# Show the cluster id assigned to each sentence.
print(label)

In [None]:
# Number of assigned labels.
print(len(label))

In [None]:
# Build one text blob per cluster id. The sentences of a cluster are collected
# in a list and joined with a space; plain `+=` concatenation fused the last
# word of one sentence with the first word of the next, which distorted the
# word counts computed from these blobs.
sentences_by_cluster = {}
for sentence, cluster_id in zip(sentences, label):
    sentences_by_cluster.setdefault(cluster_id, []).append(sentence)

cluster_dict = {
    cluster_id: " ".join(members)
    for cluster_id, members in sentences_by_cluster.items()
}
print(len(cluster_dict))
print(len(sentences))


In [None]:
from collections import Counter
# For every cluster, print its ten most frequent whitespace-separated tokens
# together with their counts.
for cluster_id in cluster_dict:
    token_counts = Counter(str(cluster_dict[cluster_id]).split())
    print(token_counts.most_common(10))