In [7]:
import glob
import matplotlib.pyplot as plt
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import subprocess

In [8]:
# Every regular file ending in .csv found anywhere below the Kaggle input directory.
files_csv = [
    candidate
    for candidate in glob.glob("/kaggle/input/**/*.csv", recursive=True)
    if os.path.isfile(candidate)
]
files_csv

In [9]:
def csv_file_length(fname):
    """Return the number of lines in ``fname``, counted the way ``wc -l`` does.

    The file is read in binary mode and newline bytes are counted, so a final
    line without a trailing newline is not counted (same as ``wc -l``).
    Counting in Python avoids depending on an external ``wc`` executable.

    Args:
        fname: Path of the file to inspect.

    Returns:
        int: Number of ``\\n`` bytes in the file.

    Raises:
        IOError: If the file cannot be opened or read (``OSError`` and
            ``IOError`` are the same class in Python 3).
    """
    newline_count = 0
    with open(fname, 'rb') as handle:
        # Read fixed-size chunks so a large CSV is never held in memory whole.
        for chunk in iter(lambda: handle.read(1 << 20), b''):
            newline_count += chunk.count(b'\n')
    return newline_count

# Work with the last CSV path in `files_csv` and count its lines.
fname = files_csv[-1]
# NOTE: despite its name, this holds the total line count returned by
# csv_file_length(fname), not a number of randomly selected rows.
random_rows_selection = csv_file_length(fname)
print(f'Number of random rows in "{fname}" is:', random_rows_selection)

In [10]:
# Choose which data lines to skip so that at most `sample_size` data rows remain.
# Candidate line numbers start at 1, i.e. line 0 is never skipped
# (presumably because it is the CSV header — TODO confirm).
sample_size = 50000
n_data_rows = max(random_rows_selection - 1, 0)
# Clamp at zero: for files with fewer than `sample_size` data rows the original
# expression produced a negative `size`, which np.random.choice rejects.
n_skip = max(n_data_rows - sample_size, 0)
skip_rows = np.random.choice(np.arange(1, random_rows_selection), size=n_skip, replace=False)
skip_rows = np.sort(skip_rows)
print('Rows to skip:', len(skip_rows))
# Report data rows only; subtracting from the raw line count also counted line 0.
print('Remaining rows in the random sample:', n_data_rows - len(skip_rows))

In [11]:
# Load only the first `nrows` rows of the last CSV file into `twitter`.
# NOTE(review): the `skip_rows` selection computed earlier is not passed to
# read_csv, so this is a head-of-file slice rather than a random sample —
# confirm that this is intended.
nrows = 20000
twitter = pd.read_csv(files_csv[-1], nrows=nrows)

In [12]:
del files_csv
gc.collect()

In [13]:
twitter.info(memory_usage="deep")

In [14]:
del nrows
del skip_rows
gc.collect()

In [15]:
twitter.head()

## EDA

In [16]:
twitter.info()

In [17]:
100*twitter.isnull().sum()/twitter.shape[0]

In [18]:
twitter.describe(include="object")

In [19]:
from collections import Counter
from functools import reduce
from operator import add
from nltk.tokenize import TreebankWordTokenizer
import re
import seaborn as sns

In [20]:
# Tokenize every distinct tweet and count token occurrences over the whole corpus.
tok = TreebankWordTokenizer()
arr = twitter["text"].astype(str).drop_duplicates().apply(tok.tokenize).array

# Flatten the per-tweet token lists into a single list of tokens.
test = [token for tokens in arr for token in tokens]

c = Counter(test)
# most_common() yields (token, count) pairs already ordered by descending count,
# and builds a valid (empty) frame even when no token was found.
d = pd.DataFrame(c.most_common(), columns=['word', 'occurences'])
nb_total = d['occurences'].sum()
d['freq'] = d['occurences'] / nb_total  # relative frequency, vectorized

plt.figure(figsize=(20,10))
plt.grid()
plt.xscale('log')
plt.yscale('log')
plt.xlabel('log(rank)')
plt.ylabel('log(frequency)')
plt.title("Zipf's law plot")
# Ranks start at 1: rank 0 has no position on a logarithmic axis, so the most
# frequent token could not be drawn when the range started at 0.
x = np.arange(1, d.shape[0] + 1)
plt.plot(x, d['freq'])

plt.show()

In [21]:
# Raw string literals keep regex escapes such as \d from being read as (invalid)
# Python string escapes, which triggers a warning at compile time.

# Optionally signed digit run that is not adjacent to another digit or a '.'.
pattern_ints = re.compile(r"[+-]?(?<![\d.])[0-9]+(?![\d.])")
# Optionally signed digits with an optional "digits." prefix; plain integers match too.
pattern_floats = re.compile(r"[+-]?([0-9]*[.])?[0-9]+")
# Same as pattern_floats, immediately followed by a percent sign.
pattern_percentage = re.compile(r"[+-]?([0-9]*[.])?[0-9]+%")
# Four digits starting with 1 or 2; not anchored, so it also matches inside longer digit runs.
pattern_year = re.compile(r"[12][0-9]{3}")

In [22]:
# Work on an independent copy so adding columns neither touches `twitter`
# nor triggers pandas' SettingWithCopyWarning.
df1 = twitter.drop_duplicates("text").copy()
progs = [pattern_ints, pattern_floats, pattern_percentage, pattern_year]
nb_types = ['ints', 'floats', 'percents', 'years']
for prog, nb_type in zip(progs, nb_types):
    # Count matches with findall(): re.split() also returns the text of capture
    # groups, so len(split) - 1 over-counts for the float and percentage
    # patterns, which both contain a group.
    # astype(str) keeps missing or non-string values from raising TypeError.
    df1[nb_type] = df1['text'].astype(str).apply(lambda x, prog=prog: len(prog.findall(x)))
df1[nb_types].head()

In [23]:
# One histogram panel per number type, side by side.
fig, (ax0, ax1, ax2) = plt.subplots(1,3, figsize=(20, 10))
panels = (("ints", "b", ax0), ("floats", "r", ax1), ("years", "g", ax2))
for column, colour, axis in panels:
    sns.histplot(df1[column], color=colour, ax=axis, bins=10)
fig.tight_layout(pad=5.0)

In [24]:
del tok
del df1
del progs
del  nb_types
del ax0
del ax1 
del ax2

gc.collect()

In [25]:
import string
import spacy
import warnings

In [26]:
nlp = spacy.load("en_core_web_sm")

In [27]:
def preprocess(text, stopwords=False):
    """Normalize one tweet into a space-separated string of lowercase words.

    Steps, in order: lowercase; drop URLs and HTML-like tags; drop @mentions
    and #hashtags; strip ASCII punctuation; turn newlines into spaces; drop
    typographic-apostrophe suffixes; drop words containing a digit; collapse
    whitespace; filter on the spaCy stop-word list.

    Args:
        text: The raw tweet; any object is accepted and passed through ``str``.
        stopwords: When True keep ONLY stop words; when False (default) remove them.

    Returns:
        str: The cleaned text.
    """
    text = str(text).lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Mentions and hashtags have to be removed BEFORE punctuation is stripped:
    # '@' and '#' are part of string.punctuation, so running these two
    # substitutions afterwards could never match anything.
    text = re.sub(r"@\S+", ' ', text)
    text = re.sub(r"#\S+", ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space (not ''), otherwise the last word of one
    # line is glued to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    # Typographic apostrophes survive the ASCII punctuation strip; normalize
    # them and drop the apostrophe together with the suffix that follows it.
    text = re.sub("[’]", "'", text)
    text = re.sub(r"[']\w*", "", text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r"\s{2,}", ' ', text)
    text = text.strip()

    stop_words = nlp.Defaults.stop_words
    if stopwords:
        kept = [word for word in text.split() if word in stop_words]
    else:
        kept = [word for word in text.split() if word not in stop_words]
    return " ".join(kept)

In [28]:
# Add the cleaned version of every tweet, silencing DeprecationWarnings raised meanwhile.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    twitter["cleaned_text"] = twitter["text"].apply(preprocess)
# Kept outside the `with` block: Jupyter only displays the value of the cell's
# last top-level expression, so a head() nested in the block was discarded.
twitter.head()

In [29]:
# Character length of each cleaned tweet (stop words removed), over de-duplicated rows.
cleaned_lengths = twitter.drop_duplicates()["text"].apply(preprocess).str.len()
cleaned_lengths.hist(bins=40)

## Distribution of stopwords

In [30]:
from functools import partial
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

In [31]:
# Character length of the stop-word-only version of each tweet, over de-duplicated rows.
keep_only_stopwords = partial(preprocess, stopwords=True)
stopword_lengths = twitter.drop_duplicates()["text"].apply(keep_only_stopwords).str.len()
stopword_lengths.hist(bins=40)

In [32]:
# Bag-of-words counts over the cleaned tweets.
cv = CountVectorizer(stop_words="english")
words = cv.fit_transform(twitter["cleaned_text"])
sum_words = words.sum(axis=0) # column totals: occurrences of each term over all tweets

# (term, total count) pairs, most frequent first.
words_freq = [(term, sum_words[0, column]) for term, column in cv.vocabulary_.items()]
words_freq.sort(key=lambda pair: pair[1], reverse=True)
frequency = pd.DataFrame(words_freq, columns=["word", "freq"])

# Bar chart of the 20 most frequent terms.
plt.style.use("fivethirtyeight")
color = plt.cm.ocean(np.linspace(0, 1, 20))
frequency.head(20).plot(x="word", y="freq", kind="bar", color=color, figsize=(20, 10))
plt.title("Most frequency occuring - TOP 20")
plt.show()

In [33]:
# Rendering options for the overall word cloud.
cloud_options = dict(
    width=1000,
    height=800,
    colormap='Blues',
    margin=0,
    max_words=200,
    min_word_length=4,
    max_font_size=120,
    min_font_size=15,
    background_color="white",
)
# Build the cloud from the (term, count) pairs computed for the frequency chart.
word_cloud = WordCloud(**cloud_options).generate_from_frequencies(dict(words_freq))

plt.figure(figsize=(10, 15))
plt.imshow(word_cloud, interpolation="gaussian")
plt.axis("off")
plt.show()

In [34]:
from multiprocess import (Pipe, Process, Pool, Manager)
import multiprocess
from functools import partial
from contextlib import contextmanager
import gensim
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize

In [35]:
def preprocess_tweets(df):
    """Turn ``df['cleaned_text']`` into a list of lemmatized token lists.

    Each text is tokenized with ``word_tokenize``; tokens found in the spaCy
    stop-word set or shorter than three characters are dropped, and the
    remaining tokens are lemmatized with WordNet.

    Args:
        df: Object whose ``'cleaned_text'`` entry iterates over strings.

    Returns:
        list[list[str]]: One list of lemmas per input text, in input order.
    """
    # Built once: neither the lemmatizer nor the stop-word set changes per tweet.
    lem = WordNetLemmatizer()
    stop_words = nlp.Defaults.stop_words

    corpus = []
    for tweet in df['cleaned_text']:
        kept = [w for w in word_tokenize(tweet) if w not in stop_words and len(w) > 2]
        corpus.append([lem.lemmatize(w) for w in kept])
    return corpus

corpus = preprocess_tweets(twitter)

In [36]:
!pip install pyLDAvis
!pip install gensim

In [37]:
import pyLDAvis
try:
    # pyLDAvis 3.3 renamed its gensim adapter module to `pyLDAvis.gensim_models`.
    import pyLDAvis.gensim_models
    # Expose it under the old attribute name so `pyLDAvis.gensim.prepare(...)`
    # keeps working with either version.
    pyLDAvis.gensim = pyLDAvis.gensim_models
except ImportError:
    # Older pyLDAvis releases only ship the original module name.
    import pyLDAvis.gensim

In [38]:
# Map tokens to integer ids, then encode every document as a bag of words.
dic = gensim.corpora.Dictionary(corpus)
bow_corpus = list(map(dic.doc2bow, corpus))

# Train the topic model with the settings below.
lda_options = dict(num_topics=5, id2word=dic, passes=10, workers=2)
lda_model = gensim.models.LdaMulticore(bow_corpus, **lda_options)

In [39]:
lda_display = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dic)
pyLDAvis.display(lda_display)

In [40]:
del cv
del corpus
del bow_corpus
del lda_model
del dic
del lda_display
gc.collect()

In [41]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import cluster

In [42]:
def run_KMeans(max_k, data):
    """Fit one KMeans model for every cluster count from 2 to ``max_k`` inclusive.

    Args:
        max_k: Largest number of clusters to try.
        data: Feature matrix passed unchanged to ``KMeans.fit``.

    Returns:
        dict: Maps each ``k`` to its fitted ``KMeans`` estimator. Empty when
        ``max_k`` is below 2.
    """
    import re
    import sklearn

    # The classic EM-style algorithm was called 'full' up to scikit-learn 1.0,
    # renamed 'lloyd' in 1.1, and the old name was removed in 1.3.
    version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
    uses_old_name = version is not None and tuple(map(int, version.groups())) < (1, 1)
    algorithm = 'full' if uses_old_name else 'lloyd'

    kmeans_results = dict()
    for k in range(2, max_k + 1):
        # `n_jobs` is no longer passed: it was deprecated in scikit-learn 0.23
        # and removed in 1.0, where it raises TypeError.
        kmeans = cluster.KMeans(n_clusters = k
                               , init = 'k-means++'
                               , n_init = 10
                               , tol = 0.0001
                               , random_state = 1
                               , algorithm = algorithm)

        kmeans_results[k] = kmeans.fit(data)

    return kmeans_results

In [44]:
def printAvg(avg_dict):
    """Print average scores with their cluster counts, highest score first.

    Args:
        avg_dict: Mapping of average score -> number of clusters.
    """
    for avg in sorted(avg_dict.keys(), reverse=True):
        # Built-in round() handles plain Python floats as well as NumPy scalars;
        # the `.round` method only exists on the latter.
        print("Avg: {}\tK:{}".format(round(avg, 4), avg_dict[avg]))
        
def plotSilhouette(df, n_clusters, kmeans_labels, silhouette_avg):
    """Draw the silhouette plot of one clustering and show it.

    Args:
        df: Samples that were clustered, in a form accepted by
            ``sklearn.metrics.silhouette_samples``.
        n_clusters: Number of clusters; labels ``0..n_clusters-1`` are drawn.
        kmeans_labels: Cluster label of each sample, aligned with ``df``.
        silhouette_avg: Average silhouette score, drawn as a dashed red line.
    """
    # Imported here because no visible import cell brings this name into scope.
    from sklearn.metrics import silhouette_samples

    # Boolean masking below needs an array, not a list.
    kmeans_labels = np.asarray(kmeans_labels)

    fig, ax1 = plt.subplots(1)
    fig.set_size_inches(8, 6)
    ax1.set_xlim([-0.2, 1])
    ax1.set_ylim([0, len(df) + (n_clusters + 1) * 10])

    ax1.axvline(x=silhouette_avg, color="red", linestyle="--") # The vertical line for average silhouette score of all the values
    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.title(("Silhouette analysis for K = %d" % n_clusters), fontsize=10, fontweight='bold')

    y_lower = 10
    sample_silhouette_values = silhouette_samples(df, kmeans_labels) # Compute the silhouette scores for each sample
    for i in range(n_clusters):
        ith_cluster_silhouette_values = sample_silhouette_values[kmeans_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # `plt.cm` is used because a bare `cm` name is never imported in the visible cells.
        color = plt.cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7)

        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) # Label the silhouette plots with their cluster numbers at the middle
        y_lower = y_upper + 10  # Compute the new y_lower for next plot. 10 for the 0 samples
    plt.show()
    
        
def silhouette(kmeans_dict, df, plot=False):
    """Compute the average silhouette score of every fitted model.

    Args:
        kmeans_dict: Mapping of cluster count -> fitted estimator with ``predict``.
        df: Samples to score; a DataFrame or anything ``np.asarray`` accepts.
        plot: When True, also draw the silhouette plot of each model.

    Returns:
        dict: Maps average silhouette score -> cluster count (the shape
        ``printAvg`` expects). Models with identical scores overwrite each other.
    """
    # Imported here because no visible import cell brings this name into scope.
    from sklearn.metrics import silhouette_score

    df = np.asarray(df)
    avg_dict = dict()
    for n_clusters, kmeans in kmeans_dict.items():
        kmeans_labels = kmeans.predict(df)
        silhouette_avg = silhouette_score(df, kmeans_labels) # Average Score for all Samples
        avg_dict[silhouette_avg] = n_clusters

        if plot:
            plotSilhouette(df, n_clusters, kmeans_labels, silhouette_avg)

    # Previously the scores were computed and then dropped.
    return avg_dict

In [45]:
# One lemmatizer shared by all tweets (it was rebuilt on every iteration,
# next to a PorterStemmer that was never used).
lem = WordNetLemmatizer()
# Each cleaned tweet becomes one space-joined string of lemmas; words of
# fewer than three characters are dropped.
corpus = [
    " ".join(lem.lemmatize(w) for w in text.split() if len(w) > 2)
    for text in twitter["cleaned_text"]
]

In [46]:
def get_top_features_cluster(tf_idf_array, prediction, n_feats, feature_names=None):
    """Return the highest-scoring features of every cluster.

    Args:
        tf_idf_array: 2-D array indexed as ``[sample, feature]``.
        prediction: Cluster label of each sample, aligned with the rows.
        n_feats: Number of top features to keep per cluster.
        feature_names: Optional sequence mapping column index -> feature name.
            When omitted, names are read from the module-level ``vectorizer``.

    Returns:
        list[pd.DataFrame]: One frame per distinct label (in sorted label order)
        with columns ``features`` and ``score``, best feature first.
    """
    if feature_names is None:
        # get_feature_names() was removed in scikit-learn 1.2; its replacement
        # get_feature_names_out() only exists from 1.0, so support both.
        if hasattr(vectorizer, "get_feature_names_out"):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()

    prediction = np.asarray(prediction)
    dfs = []
    for label in np.unique(prediction):
        members = np.where(prediction == label)[0]  # row indices of this cluster
        x_means = np.mean(tf_idf_array[members], axis=0)  # average score per feature
        top_columns = np.argsort(x_means)[::-1][:n_feats]  # best n_feats columns, descending
        best_features = [(feature_names[i], x_means[i]) for i in top_columns]
        dfs.append(pd.DataFrame(best_features, columns=['features', 'score']))
    return dfs

def plotWords(dfs, n_feats):
    """Show one horizontal bar chart of top features per cluster.

    Args:
        dfs: Sequence of frames with ``features`` and ``score`` columns, one per cluster.
        n_feats: Number of leading rows of each frame to plot.
    """
    for i, cluster_df in enumerate(dfs):
        # A new figure per cluster: plt.show() ends the current figure, so a
        # single figure created before the loop only sized the first chart.
        plt.figure(figsize=(8, 4))
        plt.title(("Most Common Words in Cluster {}".format(i)), fontsize=10, fontweight='bold')
        sns.barplot(x = 'score' , y = 'features', orient = 'h' , data = cluster_df[:n_feats])
        plt.show()

In [47]:
# TF-IDF encode the lemmatized corpus.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() only exists from 1.0, so support both.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Dense frame with one row per document and one column per term.
tf_idf = pd.DataFrame(data = X.toarray(), columns=feature_names)
final_df = tf_idf
print("{} rows".format(final_df.shape[0]))

In [48]:
k = 8
kmeans_results = run_KMeans(k, final_df)

In [49]:
# Pick the model stored under key `best_result` in `kmeans_results`.
best_result = 5
kmeans = kmeans_results.get(best_result)

# Assign every document to a cluster of the selected model.
final_df_array = final_df.to_numpy()
prediction = kmeans.predict(final_df)
# `n_feats` features are extracted per cluster, but only the first 13 of each are plotted.
n_feats = 20
dfs = get_top_features_cluster(final_df_array, prediction, n_feats)
plotWords(dfs, 13)

In [50]:
# Converts one row of a centroids dataframe into a dictionary usable by a WordCloud.
def centroidsDict(centroids, index):
    """Map each column label to its value in row ``index``, largest value first.

    Args:
        centroids: DataFrame with one row per centroid and one column per term.
        index: Row label of the centroid to convert.

    Returns:
        dict: ``{column label: value}`` in descending order of value.
    """
    ranked = centroids.T[index].sort_values(ascending = False)
    return {term: weight for term, weight in ranked.items()}

def generateWordClouds(centroids):
    """Show one word cloud per row of ``centroids``.

    Args:
        centroids: DataFrame with one row per cluster centroid; each row is
            converted with ``centroidsDict`` and used as word frequencies.
    """
    cloud = WordCloud(max_font_size=100, background_color = 'white')
    for cluster_id in range(len(centroids)):
        # The same WordCloud object is regenerated for each cluster.
        cloud.generate_from_frequencies(centroidsDict(centroids, cluster_id))

        plt.figure()
        plt.title('Cluster {}'.format(cluster_id))
        plt.imshow(cloud)
        plt.axis("off")
        plt.show()


In [51]:
# Label the cluster-centre matrix with the TF-IDF column names, then draw one cloud per centre.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=final_df.columns)
generateWordClouds(centroids)

In [54]:
labels = kmeans.labels_ 
twitter['label'] = labels
twitter.head()

In [55]:
twitter["label"].nunique()