# EDA

### This notebook constructs some summary statistics on the final data

In [30]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [31]:
data = pd.read_pickle('data/final_data.pkl')  # future: merged_data_sentiment.pkl

In [32]:
data = data[data.category == 'Audio'].reset_index(drop=True)
data.shape

(456, 35)

In [33]:
# Print Example
data.title[0]

'Splash: The Dancing Fountain Bluetooth Speaker'

In [34]:
# Print Example
data.description[0]

'Don’t just hear your music, see your music too!'

In [35]:
# Print Example
data.story_txt_clean[0]

'enjoying music one best feelings world make even special splash bluetooth speaker allows listen music high quality sound allows totally immerse enjoy visually looking really get party started create wow factor entertaining guests home splash bluetooth speaker perfect creating fun vibe occasion splash bluetooth speaker built led light fountain feature driven pulse reactive motor connected sound source motor spins beat music colour led lights dance around inside speaker creating spectacular fountain visual display time tune want take musical experience next level splash bluetooth speaker exactly need boasting dual soundtracks splash bluetooth speaker brings new dimension enjoying music able fill entire room crystal clear stereo surround sound unlike regular hi fi speakers splash becomes party centrepiece powerful woofer creating high quality audio visual enjoyment connect splash device bt aux enjoy high quality sound spectacular fountain show contact us even discounts learn splash today

In [162]:
def top_words_dtm(tf, tf_vectorizer, n_top_words=None):
    tf_words = tf_vectorizer.get_feature_names_out()
    tf_freqs = tf.toarray().sum(axis=0)
    df = pd.DataFrame({'word': tf_words, 'freq': tf_freqs})
    df = df.sort_values(by='freq', ascending=False).reset_index(drop=True)
    return df.iloc[:n_top_words ,:]

In [226]:
def print_top_words_lda(model, tf_vectorizer, n_top_words):
    for topic_id, topic in enumerate(model.components_):
        feature_names = tf_vectorizer.get_feature_names_out()
        print('\nTopic Nr.%d:' % int(topic_id + 1)) 
        print(''.join([feature_names[i] + ' ' + str(round(topic[i], 2)) 
              +' | ' for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [215]:
# create text corpus from text column
corpus_small = data.title_clean + data.description_clean
corpus_large = data.story_txt_clean

corpus = corpus_small

In [228]:
# LDA of the data   #  https://towardsdatascience.com/introduction-to-topic-modeling-using-scikit-learn-4c3f3290f5b9
tf_vectorizer = CountVectorizer(max_df=1.0, min_df=3, max_features=None, stop_words='english', ngram_range=(1, 1))
tf = tf_vectorizer.fit_transform(corpus)  # create dtm with word frequencies
# top_words_dtm(tf, tf_vectorizer)  # Print top words

In [225]:
lda = LatentDirichletAllocation(n_components=5, random_state=1).fit(tf)

In [227]:
print_top_words_lda(lda, tf_vectorizer, n_top_words=10)


Topic Nr.1:
wireless 82.79 | bluetooth 54.26 | true 27.93 | headphones 27.85 | sound 27.74 | audio 26.95 | portable 21.38 | stereo 16.75 | earbuds 15.91 | noise 15.78 | 

Topic Nr.2:
world 30.85 | wireless 14.19 | earbuds 12.35 | designed 11.19 | best 10.26 | custom 10.2 | hearing 9.2 | sounds 9.18 | guitar 9.0 | sound 8.89 | 

Topic Nr.3:
music 45.82 | bluetooth 28.13 | speaker 26.94 | noise 22.99 | ear 17.57 | help 16.2 | voice 14.74 | sound 13.03 | audio 10.28 | create 10.19 | 

Topic Nr.4:
sound 24.48 | audio 23.56 | stereo 20.98 | high 20.43 | earbuds 14.39 | dac 14.2 | music 12.5 | wireless 12.2 | speakers 11.66 | bluetooth 9.93 | 

Topic Nr.5:
audio 61.01 | sound 58.86 | wireless 26.27 | hi 25.2 | music 23.91 | fi 23.6 | speaker 21.33 | headphones 20.13 | waterproof 18.77 | headphone 17.19 | 


In [187]:
lda.transform(tf)  # get topics for samples

array([[0.00461803, 0.99025941, 0.00512256],
       [0.71127196, 0.10536985, 0.18335819],
       [0.00235298, 0.00222621, 0.99542081],
       ...,
       [0.67966514, 0.31847727, 0.00185758],
       [0.01810841, 0.01754193, 0.96434966],
       [0.56862391, 0.0121102 , 0.41926589]])