In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns

import time, warnings, pickle
import re, nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim import matutils, models

%matplotlib inline
plt.style.use("seaborn")
pd.options.display.max_colwidth = 120
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
text_tagged = pd.read_pickle("data/review_tokens_tagged.pkl")
text_tagged.head()

Unnamed: 0,cont_clean
873185,"[(we, PRP), (listen, VBP), (closely, RB), (case, NN), (of, IN), (murder, NN), (and, CC), (loss, NN), (going, VBG), (..."
694914,"[(love, VB), (adam, NN), (brooke, NN), (so, RB), (true, JJ), (the, DT), (generation, NN), (change, NN), (but, CC), (..."
759771,"[(holly, RB), (and, CC), (her, PRP$), (guest, NN), (share, NN), (inspiring, VBG), (and, CC), (actionable, JJ), (less..."
532244,"[(i, NN), (think, VBP), (that, IN), (anyone, NN), (who, WP), (is, VBZ), (looking, VBG), (for, IN), (a, DT), (perspec..."
427706,"[(people, NNS), (act, VBP), (like, IN), (the, DT), (disclaimer, NN), (wa, NN), (so, IN), (annoying, VBG), (it, PRP),..."


In [4]:
def tag_noun(text_noun):
    """Extracts the nouns in a string of text.
       Parameters: string of text
       Returns: string with nouns
    """
    noun = lambda pos: pos[:2] == "NN"
    all_nouns = [word for (word, pos) in text_noun if noun(pos)]
    return " ".join(all_nouns)

In [5]:
#pull out all the nouns
text_tagged["nouns"] = text_tagged["cont_clean"].apply(tag_noun)

In [6]:
text_tagged.head(3)

Unnamed: 0,cont_clean,nouns
873185,"[(we, PRP), (listen, VBP), (closely, RB), (case, NN), (of, IN), (murder, NN), (and, CC), (loss, NN), (going, VBG), (...",case murder loss crime stella portland oregon
694914,"[(love, VB), (adam, NN), (brooke, NN), (so, RB), (true, JJ), (the, DT), (generation, NN), (change, NN), (but, CC), (...",adam brooke generation change banter discussion bell bottom record player memory discussion mom girl i bell bottom t...
759771,"[(holly, RB), (and, CC), (her, PRP$), (guest, NN), (share, NN), (inspiring, VBG), (and, CC), (actionable, JJ), (less...",guest share lesson leader business hr life hr knowledge mindset team business


In [8]:
from sklearn.feature_extraction import text

more_sws = ["podcast", "podcasts", "wa", "thank", "thanks", "way",
            "im", "ive", "ha", "guy", "pod", "people", 
            "wait", "fan", "thing", "stuff", "listen", "listener",
            "listening", "youre", "lot", "dont", "love", "havent",
            "yall", "ton", "word", "talk", "hi", "wow",
            "episode", "whats"]
stop_words = text.ENGLISH_STOP_WORDS.union(more_sws)

In [9]:
tf_idfn = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=stop_words)
dtm_tfidfn = tf_idfn.fit_transform(text_tagged["nouns"])
dtm_tfidfn

<107445x19235 sparse matrix of type '<class 'numpy.float64'>'
	with 721697 stored elements in Compressed Sparse Row format>

### Topic modeling with Latent Dirichlet Allocation (LDA) using nouns and adjetives, TF-IDF:

In [10]:
def topics_lda(cv, dtm, topic=2, passes=10):
    """Performs topic modeling with latent dirichlet allocation with a document term matrix.
       Parameters: 
       (1) cv = vectorizer
       (2) dmt = document term matrix
       (3) topic = number of topic
       Returns: Top ten words associate with each topic.
    """
    #create the gensim corpus
    corpus = matutils.Sparse2Corpus(dtm.transpose())
    
    #create the vocabulary dictionary
    id2word = dict((v, k) for k, v in cv.vocabulary_.items())
    
    lda = models.LdaModel(corpus=corpus, num_topics=topic, id2word=id2word, passes=passes)
    return lda.print_topics()

In [12]:
topics_lda(tf_idfn, dtm_tfidfn, topic=2)

[(0,
  '0.028*"episode" + 0.017*"story" + 0.015*"time" + 0.012*"host" + 0.012*"work" + 0.011*"week" + 0.010*"topic" + 0.009*"fun" + 0.009*"friend" + 0.007*"content"'),
 (1,
  '0.015*"life" + 0.008*"information" + 0.008*"world" + 0.008*"interview" + 0.008*"day" + 0.008*"guest" + 0.007*"year" + 0.007*"business" + 0.006*"job" + 0.006*"question"')]

In [13]:
topics_lda(tf_idfn, dtm_tfidfn, topic=4)

[(0,
  '0.030*"episode" + 0.017*"time" + 0.016*"story" + 0.013*"life" + 0.010*"movie" + 0.009*"friend" + 0.008*"fun" + 0.008*"book" + 0.007*"woman" + 0.007*"god"'),
 (1,
  '0.017*"host" + 0.014*"topic" + 0.013*"content" + 0.013*"episode" + 0.011*"star" + 0.011*"music" + 0.010*"story" + 0.010*"interview" + 0.010*"guest" + 0.009*"time"'),
 (2,
  '0.016*"work" + 0.009*"view" + 0.008*"funny" + 0.007*"concept" + 0.007*"wow" + 0.006*"glad" + 0.005*"subscribe" + 0.005*"sport" + 0.005*"point" + 0.004*"politics"'),
 (3,
  '0.012*"information" + 0.012*"business" + 0.011*"day" + 0.011*"life" + 0.010*"episode" + 0.009*"advice" + 0.009*"week" + 0.008*"year" + 0.008*"time" + 0.007*"work"')]

In [14]:
topics_lda(tf_idfn, dtm_tfidfn, topic=6)

[(0,
  '0.016*"star" + 0.009*"artist" + 0.008*"ad" + 0.007*"boy" + 0.006*"recommendation" + 0.006*"blessing" + 0.006*"art" + 0.005*"dad" + 0.005*"men" + 0.005*"lady"'),
 (1,
  '0.030*"episode" + 0.017*"time" + 0.008*"season" + 0.007*"story" + 0.007*"world" + 0.005*"life" + 0.005*"host" + 0.005*"work" + 0.005*"star" + 0.005*"year"'),
 (2,
  '0.021*"life" + 0.017*"work" + 0.011*"episode" + 0.011*"year" + 0.011*"god" + 0.009*"family" + 0.008*"woman" + 0.008*"time" + 0.008*"story" + 0.008*"day"'),
 (3,
  '0.018*"day" + 0.010*"youtube" + 0.008*"history" + 0.008*"morning" + 0.008*"hi" + 0.007*"commute" + 0.007*"laugh" + 0.007*"video" + 0.007*"episode" + 0.006*"dog"'),
 (4,
  '0.015*"episode" + 0.014*"week" + 0.013*"fun" + 0.012*"movie" + 0.010*"time" + 0.010*"friend" + 0.010*"story" + 0.010*"music" + 0.009*"host" + 0.008*"review"'),
 (5,
  '0.018*"content" + 0.018*"topic" + 0.018*"host" + 0.018*"story" + 0.017*"guest" + 0.016*"information" + 0.015*"interview" + 0.014*"business" + 0.013*"epis

In [15]:
topics_lda(tf_idfn, dtm_tfidfn, topic=8)

[(0,
  '0.012*"year" + 0.011*"kid" + 0.009*"episode" + 0.009*"story" + 0.008*"family" + 0.008*"night" + 0.007*"ministry" + 0.007*"wish" + 0.007*"child" + 0.007*"parent"'),
 (1,
  '0.018*"movie" + 0.012*"content" + 0.011*"episode" + 0.010*"host" + 0.009*"chemistry" + 0.008*"time" + 0.007*"film" + 0.007*"topic" + 0.007*"opinion" + 0.006*"work"'),
 (2,
  '0.021*"life" + 0.013*"business" + 0.012*"episode" + 0.009*"world" + 0.009*"share" + 0.009*"woman" + 0.009*"time" + 0.009*"story" + 0.008*"guest" + 0.008*"interview"'),
 (3,
  '0.023*"time" + 0.012*"episode" + 0.012*"season" + 0.010*"history" + 0.009*"story" + 0.008*"quality" + 0.006*"ad" + 0.006*"series" + 0.006*"work" + 0.006*"content"'),
 (4,
  '0.013*"music" + 0.010*"funny" + 0.010*"hi" + 0.007*"jesus" + 0.007*"lord" + 0.006*"dog" + 0.006*"sex" + 0.006*"band" + 0.006*"breath" + 0.006*"air"'),
 (5,
  '0.016*"information" + 0.011*"episode" + 0.008*"stop" + 0.007*"host" + 0.007*"point" + 0.006*"estate" + 0.006*"time" + 0.006*"idea" + 0.0

### Sentiment Analysis: input is corpus becuase order of words matters

In [16]:
from textblob import TextBlob
import pandas as pd
import pickle

In [17]:
df_clean = pd.read_pickle("data/df_100k_clean.pkl")

In [18]:
df_clean.head(3)

Unnamed: 0,podcast_id,review_title,content,rating,created_at,category,podcast_title,cont_clean
873185,a9bdaba5449189a4587793e36ce4f704,A True Crime Haiku for you,"We listen closely,\nCases of Murder and Loss\nGoing West : True Crime\n\nStella\nPortland, Oregon",5,2019-10-10T15:19:20-07:00,society-culture,Going West: True Crime,we listen closely case of murder and loss going west true crime stella portland oregon
694914,c699968fd592a60c3ba7333d530907d7,Engaging in my 60's,"Love Adam & Brooke! So true, the generations change, but they do recycle. I love the banter and found the discussion...",5,2018-12-31T04:22:13-07:00,comedy,"Thirty, Flirty, & Dying",love adam brooke so true the generation change but they do recycle i love the banter and found the discussion of bel...
759771,dcfb90baae108f9938fd3a762b810cc3,Highly recommend!,"Holly and her guests share inspiring and actionable lessons on how to succeed as a leader in business, HR, and life....",5,2017-09-11T07:05:53-07:00,business,Nine To Thrive HR,holly and her guest share inspiring and actionable lesson on how to succeed a a leader in business hr and life highl...


In [19]:
df_clean["rating"].value_counts()

5    94549
1     5025
4     3307
3     2402
2     2162
Name: rating, dtype: int64

In [20]:
pol = lambda x: TextBlob(x).sentiment.polarity
df_clean["polarity"] = df_clean["content"].apply(pol)

df_clean.head(3)

Unnamed: 0,podcast_id,review_title,content,rating,created_at,category,podcast_title,cont_clean,polarity
873185,a9bdaba5449189a4587793e36ce4f704,A True Crime Haiku for you,"We listen closely,\nCases of Murder and Loss\nGoing West : True Crime\n\nStella\nPortland, Oregon",5,2019-10-10T15:19:20-07:00,society-culture,Going West: True Crime,we listen closely case of murder and loss going west true crime stella portland oregon,0.35
694914,c699968fd592a60c3ba7333d530907d7,Engaging in my 60's,"Love Adam & Brooke! So true, the generations change, but they do recycle. I love the banter and found the discussion...",5,2018-12-31T04:22:13-07:00,comedy,"Thirty, Flirty, & Dying",love adam brooke so true the generation change but they do recycle i love the banter and found the discussion of bel...,0.321875
759771,dcfb90baae108f9938fd3a762b810cc3,Highly recommend!,"Holly and her guests share inspiring and actionable lessons on how to succeed as a leader in business, HR, and life....",5,2017-09-11T07:05:53-07:00,business,Nine To Thrive HR,holly and her guest share inspiring and actionable lesson on how to succeed a a leader in business hr and life highl...,0.553333


In [21]:
pos = (df_clean["polarity"] > 0)
df_clean[pos].value_counts("rating")

#check review with positive polarity, but low rating
mask = (df_clean["polarity"] > 0.5) & (df_clean["rating"] == 1)
for i in df_clean[mask]["content"][:3]:
    print("Review: ", i, "\n")

Review:  There are much better podcasts out there that focus just on good paranormal stories. 

Review:  So “even someone from Alabama can teach you something “ (Teachability episode). Impressive statement during a time of division and hatred in our country. Good luck. 

Review:  Either he’s promoting his own products, or has a guest promoting theirs (and provides his own “discount” promo code for guests products)

Monotone soliloquy 



In [22]:
#rating distribution of reviews with negative polarity
neg = (df_clean["polarity"] < 0)
df_clean[neg].value_counts("rating")

rating
5    3827
1    1919
2     609
3     469
4     287
dtype: int64

### ^ Polarity doesn't seem to match well with rating scores for a good portion of the reviews...hmmm...