# timeframe: 2011-02-21 to 2018-09-09

In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
from textblob import TextBlob

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import words
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import joblib
pd.options.display.colheader_justify = 'right'
pd.options.display.column_space = 1
pd.options.display.expand_frame_repr = True
pd.options.display.max_colwidth = 120

In [2]:
df = joblib.load('data/clean/clean_df.joblib')

In [None]:
# don't use stemming with Word2Vec

In [3]:
df.shape

(45671, 10)

In [4]:
df.head(3)

Unnamed: 0,body,score,author,permalink,comment_id,created_loc,created_utc,post_title,post_score,post_id
0,thats an odd hashtag to throw at the end of this tweet,1085,RajasConCrema,/r/nba/comments/65wmkz/withers_lebron_said_he_couldnt_imagine_losing_a/dgdq1yr/,dgdq1yr,2017-04-18 00:37:33,2017-04-17 16:37:33,"[Withers] LeBron said he couldn't imagine losing a sibling like Isaiah Thomas did. ""I can only imagine how that woul...",1398,65wmkz
1,that situation is so tough already i cant imagine what it is like preparing all season to get ready for the playoffs...,84,,/r/nba/comments/65wmkz/withers_lebron_said_he_couldnt_imagine_losing_a/dgdpz3b/,dgdpz3b,2017-04-18 00:36:07,2017-04-17 16:36:07,"[Withers] LeBron said he couldn't imagine losing a sibling like Isaiah Thomas did. ""I can only imagine how that woul...",1398,65wmkz
2,i dont think its brought up enough that he never had a sibling or a father figure the fact that hes a great team pla...,209,writingandshit,/r/nba/comments/65wmkz/withers_lebron_said_he_couldnt_imagine_losing_a/dgdsthu/,dgdsthu,2017-04-18 01:28:48,2017-04-17 17:28:48,"[Withers] LeBron said he couldn't imagine losing a sibling like Isaiah Thomas did. ""I can only imagine how that woul...",1398,65wmkz


In [148]:
# TODO: Properly do entire data set and a train_test_split later on
df_small = df[:20000]

df_small_train = df_small[:int(0.7 * df_small.shape[0])]
df_small_test = df_small[int(0.7 * df_small.shape[0]):]

In [48]:
test_cv= CountVectorizer(min_df = 10, max_df = .99, #ngram_range=(1, 2),
                         strip_accents='ascii',
                     strip_accents='unicode', encoding='utf-8')

In [49]:
dtm_tf = test_cv.fit_transform(df.body)
most_common_df = pd.DataFrame(dtm_tf.toarray(), columns=test_cv.get_feature_names())

In [50]:
df.body.str.contains('lebron').value_counts()

False    32504
True     13167
Name: body, dtype: int64

In [51]:
most_common_df.sum().sort_values(ascending=False)

the              41729
to               22576
and              19010
lebron           15172
is               14893
he               14514
in               14017
of               13472
that             12294
this             10117
it                9154
for               8322
his               7730
on                7698
was               7551
you               6337
be                6107
but               5796
with              5769
like              5581
just              4966
him               4852
not               4626
have              4571
at                4498
if                4440
as                4350
all               4347
game              4196
so                4134
                 ...  
memorable           10
opens               10
busy                10
butthurt            10
arsenal             10
obama               10
neutral             10
stellar             10
nailed              10
moon                10
stink               10
mismatch            10
minority   

In [12]:
from nltk.corpus import stopwords

# TF

In [92]:
def word_ngrams(self, tokens): #, stop_words=None):
    """Turn tokens into a sequence of n-grams after stop words filtering"""
    # handle stop words
#     if stop_words is not None:
#         tokens = [w for w in tokens if w not in stop_words]

    # handle token n-grams
    min_n, max_n = self.ngram_range
    if max_n != 1:
        original_tokens = tokens
        if min_n == 1:
            # no need to do any slicing for unigrams
            # just iterate through the original tokens
            tokens = list(original_tokens)
            min_n += 1
        else:
            tokens = []

        n_original_tokens = len(original_tokens)

        # bind method outside of loop to reduce overhead
        tokens_append = tokens.append
        space_join = " ".join

        for n in xrange(min_n,
                        min(max_n + 1, n_original_tokens + 1)):
            for i in xrange(n_original_tokens - n + 1):
                tokens_append(space_join(original_tokens[i: i + n]))

    return tokens

### Next steps:
1. I want to find the X topics/clusters about these texts. Perhaps it's fitness, politics, training, performance, GOAT.
2. Figure out which topic each document best corresponds to, and place the document within that topic/cluster
3. Chart a stacked-bar or a multi-line chart for each year (or month, or something else) from 2011-2018, and show
   the PERCENTAGE (Or COUNT) of each topic for that period of time.
4. Look at the insights! What does that mean?
5. Perhaps do sentiment analysis. See if I can find a well-trained model. Or train my own based on upvote? score? (Probably not doing this)

**NOTE**: Simply do LDA first, and make this work. Get a chart. Get some insights.
**THEN:** Make it repeatable. Get it in a function somehow. Try other Dim Red and Unclassifying and Topic modeling techniques. See what works best.

In [138]:
words_corpus = set(words.words())
analyzer = CountVectorizer().build_analyzer()
stem = SnowballStemmer('english')

stops = stopwords.words('english')
stops += ['just', 'lol', 'like', 'im', 'he', 'hes', 'would', 'get', 'going', 'doesnt', 'th', 'fuck', 'think', 'even', 'dont',
    'lebron', 'james', 'game', 'didnt', 'cant', 'say', 'see', 'look', 'go', 'said', 'also']
stops = set(stops)
acceptable_words = words_corpus - stops

Current isues:
* can't use stemmer and n_grams at same time.
* still too many junk words, even with growing stop_list
* better stop word list. perhaps from reddit.
* what is max_features?
* **don't have a lemmatizer in there. How can i do lemmatize, n_grams, stop words, and stemmer all in analyer?**

### Tell Roberto/Chad I'm having a real hard time cleaning my text

42862

In [175]:
wordpunct = nltk.WordPunctTokenizer()

In [177]:
lemmatizer = nltk.WordNetLemmatizer()

In [186]:
token_pos = nltk.pos_tag(wordpunct.tokenize('filthy'))

In [188]:
token_pos

[('filthy', 'NN')]

In [192]:
token_pos[0][1]

'NN'

In [195]:
lemmatizer.lemmatize('filthy', pos='adj')

KeyError: 'adj'

In [176]:
trythis = wordpunct.tokenize(df_small_train.body)

TypeError: expected string or bytes-like object

In [211]:
from nltk.tokenize import TreebankWordTokenizer


In [214]:
n_gram_size = 2

def english_corpus(doc, stemmer=stem):
    clean_words = [stemmer.stem(w) for w in analyzer(doc) if w in acceptable_words]
#     num_index = 0
#     for w in clean_words:
#         yield w
#         try:
#             if clean_words[num_index + 1]:
#                 yield clean_words[num_index + 1]
#         except:
#             continue
#         num_index+= 1    
    return [stemmer.stem(w) for w in analyzer(doc) if w in acceptable_words]

cv = CountVectorizer(stop_words=stops, 
                     #analyzer=english_corpus, 
                     min_df = 3, max_df = .95, ngram_range=(n_gram_size, n_gram_size),
                     strip_accents='unicode', 
                     encoding='utf-8',
                     tokenizer=TreebankWordTokenizer().tokenize,
                     #max_features=100
                    )
dtm_tf = cv.fit_transform(df_small_train.body)
dtm_tf.shape


# def words_and_char_bigrams(text):
# ...     words = re.findall(r'\w{3,}', text)
# ...     for w in words:
# ...         yield w
# ...         for i in range(len(w) - 2):
# ...             yield w[i:i+2]

  sorted(inconsistent))


(14000, 3546)

In [215]:
# pd.DataFrame(dtm_tf.toarray(), columns=cv.get_feature_names()).shape

# lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
# lda_tf.fit(dtm_tf)

# # pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, cv)
# see_lda_topics(cv, 5, lda_tf, 7)

for n_topic in range(3,10):
    loop_lda_tf = LatentDirichletAllocation(n_components=n_topic, random_state=0)
    loop_lda_tf.fit(dtm_tf)
    print(':::::')
    print(f'Number of topics = {n_topic}')
    print(':::::')
    see_lda_topics(cv, n_topic, loop_lda_tf, 7)

:::::
Number of topics = 3
:::::
TOPIC 0
gon na
r nba
triple double
high school
got ta
locker room
free throw

TOPIC 1
best player
kevin love
holy shit
basketball player
anyone else
years ago
ty lue

TOPIC 2
last year
regular season
year old
ever seen
ive ever
k k
ive seen

:::::
Number of topics = 4
:::::
TOPIC 0
gon na
triple double
got ta
locker room
wan na
last night
pretty sure

TOPIC 1
best player
kevin love
next year
holy shit
anyone else
basketball player
ty lue

TOPIC 2
gon na
last year
regular season
ever seen
ive ever
year old
k k

TOPIC 3
r nba
high school
every time
free throw
ive seen
space jam
cavs fans

:::::
Number of topics = 5
:::::
TOPIC 0
gon na
triple double
ever seen
ive ever
paul george
pretty sure
jr smith

TOPIC 1
best player
next year
holy shit
anyone else
god damn
player league
really good

TOPIC 2
gon na
regular season
last year
year old
k k
averaged ppg
free throws

TOPIC 3
r nba
every time
free throw
ive seen
high school
space jam
everyone else

TOPIC 4
g

KeyboardInterrupt: 

# TF-IDF

In [71]:
cv2 = CountVectorizer(#stop_words=stops, 
                     analyzer=english_corpus, 
                     min_df = 2, max_df = .95, #ngram_range=(1, 2),
                     strip_accents='unicode', encoding='utf-8')

tfidf_vectorizer = TfidfVectorizer(**cv2.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(df_small_train.body)

lda_tfidf = LatentDirichletAllocation(n_components=10, random_state=0)
lda_tfidf.fit(dtm_tfidf)

pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [120]:
def see_lda_topics(vectorizer, n_topics, fit_lda, n_examples):
    '''
    Prints top 10 names assigned to the topics, and the topics themselves
    '''
    vocab = vectorizer.get_feature_names()
    for topic in range(n_topics):
        print(f"TOPIC {topic}")
        for j in np.argsort(-fit_lda.components_,1)[topic,:n_examples]:
            print(vocab[j])
        print()
see_lda_topics(cv, 10, lda_tf, 3)

TOPIC 0
game
cavs
got

TOPIC 1
nice
damn
think

TOPIC 2
dont
see
game

TOPIC 3
game
nba
one

TOPIC 4
player
finals
team

TOPIC 5
team
think
player

TOPIC 6
one
fucking
people

TOPIC 7
double
really
seasons

TOPIC 8
man
good
confirmed

TOPIC 9
game
games
season



In [23]:
cv.get_feature_names()

['abil',
 'abl',
 'about',
 'abov',
 'absolut',
 'absorb',
 'absurd',
 'abus',
 'accept',
 'accomplish',
 'accord',
 'achiev',
 'across',
 'act',
 'activ',
 'actor',
 'actual',
 'ad',
 'add',
 'adjust',
 'admit',
 'advanc',
 'advantag',
 'advertis',
 'advic',
 'advoc',
 'after',
 'again',
 'against',
 'age',
 'agenc',
 'agent',
 'aggress',
 'ago',
 'agre',
 'ah',
 'ahead',
 'ai',
 'aint',
 'air',
 'albeit',
 'alien',
 'all',
 'alley',
 'almost',
 'alon',
 'along',
 'alongsid',
 'alreadi',
 'also',
 'altern',
 'alway',
 'am',
 'amaz',
 'ami',
 'among',
 'amongst',
 'amount',
 'an',
 'analysi',
 'analyst',
 'and',
 'angl',
 'angri',
 'ani',
 'anim',
 'ankl',
 'annoy',
 'annual',
 'anonym',
 'anoth',
 'answer',
 'anti',
 'anybodi',
 'anyon',
 'anyth',
 'anyway',
 'anywher',
 'apart',
 'appear',
 'appl',
 'appreci',
 'appropri',
 'are',
 'arena',
 'argu',
 'argument',
 'arm',
 'around',
 'art',
 'articl',
 'as',
 'asham',
 'asid',
 'ask',
 'aspect',
 'ass',
 'assassin',
 'assist',
 'assum'

In [25]:
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(sentences[:500])
Z = pd.DataFrame(X.toarray(),columns=cv.get_feature_names())

In [26]:
Z.shape

(500, 1790)

In [27]:
cv2 = CountVectorizer(stop_words='english')
XX = cv2.fit_transform(words[:500])
Z = pd.DataFrame(XX.toarray(),columns=cv2.get_feature_names())

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words='english')
X3 = tf.fit_transform(sentences[:500])
Z = pd.DataFrame(X3.toarray(), columns = tf.get_feature_names())

In [34]:
Z.shape

(500, 1790)

In [18]:
from nltk import sent_tokenize
text = df.body.apply(sent_tokenize)
text.sample(3, random_state=42)

25798    [As much hate as this dude gets, their is no denying that he can ball, and he's breaking these records at a relative...
26644                [The last 9 finals have consisted of either LeBron James Jones or Kobe Bryant, but never at the same time.]
5307                                                                                                        [That's just unfair]
Name: body, dtype: object

In [19]:
from nltk import word_tokenize
words = df.body.apply(word_tokenize)
words.sample(3, random_state=42)    

25798    [As, much, hate, as, this, dude, gets, ,, their, is, no, denying, that, he, can, ball, ,, and, he, 's, breaking, the...
26644    [The, last, 9, finals, have, consisted, of, either, LeBron, James, Jones, or, Kobe, Bryant, ,, but, never, at, the, ...
5307                                                                                                    [That, 's, just, unfair]
Name: body, dtype: object

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(df.body[90:100])
pd.DataFrame(X.toarray(),columns=cv.get_feature_names())

Unnamed: 0,16,added,ago,akron,april,believe,better,chugging,come,considered,...,retired,retires,rose,said,smhhhh,teenager,think,wait,workaholic,years
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,0,1,...,1,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
7,1,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
df = joblib.load('data/clean/clean_df.joblib')

In [25]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

sia = SIA()

In [21]:
prob_selected = 2/16
num_trial = 27

In [26]:
# probability of num of trials up to and until 1st success
P = ((1-prob_selected)**(num_trial-1)) * prob_selected
P * 100

0.3882570868012486

In [25]:
# probability of num of failures until 1st success
P2 = ((1-prob_selected)**(num_trial)) * prob_selected
P2 * 100

0.33972495095109256