In [1]:
import pandas as pd
import numpy as np

import pickle

# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import nltk
import spacy


Loading in game text data:

In [3]:
with open('/Users/robertpagano/metis_data/project_4/text_dataframes/games_text_2016_2019.pickle', 'rb') as f:
    df = pickle.load(f)

In [4]:
df.head()

Unnamed: 0,final_state,id,name,blurb,category_name
1796,successful,269726791,Emanations Playing Cards | A Study of Real Magic,An artistic professional pack of playing cards...,Playing Cards
2575,successful,1697621882,FAT STACKS: The Pancake Stacking Strategy Card...,Make it rain pancakes in FAT STACKS. \nCreate ...,Tabletop Games
2594,successful,2046938895,Royal Heroz - Fantasy Strategy Playing Cards,"Endless playability, stunning artwork and fant...",Playing Cards
2665,successful,1642293087,Coral Islands : 2 interactive dice-stacking ga...,"In 'Coral', use dice to make shapes in 3D! In ...",Tabletop Games
2668,failed,932587626,3D Shopping with WalkTheWeb 3D Technology!,Online 3D Stores for businesses that blur 3D G...,Live Games


In [92]:
df.shape

(5902, 7)

In [93]:
X_blurb = df['blurb']

In [15]:
X_blurb.head()

1796    An artistic professional pack of playing cards...
2575    Make it rain pancakes in FAT STACKS. \nCreate ...
2594    Endless playability, stunning artwork and fant...
2665    In 'Coral', use dice to make shapes in 3D! In ...
2668    Online 3D Stores for businesses that blur 3D G...
Name: blurb, dtype: object

BACK UP. What I need to do is create functions or something to do my text preprocessing on the column ITSELF.

And then I load it into the vecotorizer

In [25]:
!python -m spacy download en

Collecting en_core_web_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K    100% |████████████████████████████████| 37.4MB 211kB/s ta 0:00:011    86% |███████████████████████████▌    | 32.1MB 4.9MB/s eta 0:00:02
[?25hInstalling collected packages: en-core-web-sm
  Running setup.py install for en-core-web-sm ... [?25ldone
[?25hSuccessfully installed en-core-web-sm-2.0.0

[93m    Linking successful[0m
    /anaconda3/lib/python3.6/site-packages/en_core_web_sm -->
    /anaconda3/lib/python3.6/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [32]:
from nltk.corpus import stopwords

In [36]:
import re
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

In [29]:
df['blurb'].head()
df.head()

Unnamed: 0,final_state,id,name,blurb,category_name
1796,successful,269726791,Emanations Playing Cards | A Study of Real Magic,An artistic professional pack of playing cards...,Playing Cards
2575,successful,1697621882,FAT STACKS: The Pancake Stacking Strategy Card...,Make it rain pancakes in FAT STACKS. \nCreate ...,Tabletop Games
2594,successful,2046938895,Royal Heroz - Fantasy Strategy Playing Cards,"Endless playability, stunning artwork and fant...",Playing Cards
2665,successful,1642293087,Coral Islands : 2 interactive dice-stacking ga...,"In 'Coral', use dice to make shapes in 3D! In ...",Tabletop Games
2668,failed,932587626,3D Shopping with WalkTheWeb 3D Technology!,Online 3D Stores for businesses that blur 3D G...,Live Games


In [30]:
df['blurb_processed'] = df['blurb'].apply(lambda x:pre_process(x))

In [33]:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [39]:
from textblob import TextBlob, Word

In [35]:
def pre_process2(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    #lemmatize with Spacy
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc])
    
    return text

In [37]:
df['blurb_processed_2'] = df['blurb'].apply(lambda x:pre_process2(x))

In [38]:
df.head()

Unnamed: 0,final_state,id,name,blurb,category_name,blurb_processed,blurb_processed_2
1796,successful,269726791,Emanations Playing Cards | A Study of Real Magic,An artistic professional pack of playing cards...,Playing Cards,an artistic professional pack of playing cards...,an artistic professional pack of playing card ...
2575,successful,1697621882,FAT STACKS: The Pancake Stacking Strategy Card...,Make it rain pancakes in FAT STACKS. \nCreate ...,Tabletop Games,make it rain pancakes in fat stacks create the...,make -PRON- rain pancake in fat stack create t...
2594,successful,2046938895,Royal Heroz - Fantasy Strategy Playing Cards,"Endless playability, stunning artwork and fant...",Playing Cards,endless playability stunning artwork and fanta...,endless playability stunning artwork and fanta...
2665,successful,1642293087,Coral Islands : 2 interactive dice-stacking ga...,"In 'Coral', use dice to make shapes in 3D! In ...",Tabletop Games,in coral use dice to make shapes in d in islan...,in coral use dice to make shape in d in island...
2668,failed,932587626,3D Shopping with WalkTheWeb 3D Technology!,Online 3D Stores for businesses that blur 3D G...,Live Games,online d stores for businesses that blur d gam...,online d store for business that blur d game w...


In [40]:
X_blurb_2 = df['blurb_processed_2']

In [16]:
tfidf = TfidfVectorizer(stop_words='english', token_pattern="\\b[a-z][a-z]+\\b")


In [58]:
tfidf2 = TfidfVectorizer(stop_words='english', token_pattern="\\b[a-z][a-z]+\\b")


In [65]:
tfidf3 = TfidfVectorizer(stop_words='english', token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))


In [59]:
bag_of_words = tfidf2.fit_transform(X_blurb)

In [61]:
feature_names = tfidf2.get_feature_names()
pd.DataFrame(bag_of_words.toarray(), columns = feature_names)

Unnamed: 0,aaa,aapo,aargh,ab,aba,abandonado,abandoned,abandonned,abbey,abc,...,zum,zur,zusammen,zweck,zwergen,zwiercies,zx,zy,zyde,zynvaded
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Below I am trying the tfidf transform again, after preprocessing text

In [43]:
bag_of_words_2 = tfidf.fit_transform(X_blurb_2)

In [45]:
bag_of_words_2.shape

(5902, 10022)

In [44]:
feature_names_2 = tfidf.get_feature_names()
pd.DataFrame(bag_of_words_2.toarray(), columns = feature_names_2)

Unnamed: 0,aaa,aapo,aargh,ab,aba,abandon,abandonado,abandonned,abbey,abc,...,zur,zusamman,zusammen,zweck,zwergen,zwiercie,zx,zy,zyde,zynvad
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [63]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
lsa = TruncatedSVD(4)
doc_topic = lsa.fit_transform(bag_of_words_2)
lsa.explained_variance_ratio_

array([0.00569487, 0.00868322, 0.00592941, 0.00449745])

In [52]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [64]:
display_topics(lsa, tfidf.get_feature_names(), 5)


Topic  0
pron, game, card, play, fun

Topic  1
card, playing, deck, design, print

Topic  2
pron, deck, playing, card, print

Topic  3
fun, game, play, card, player


Ok so this is not working well yet...

In [75]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
lsa2 = TruncatedSVD(2)
doc_topic = lsa2.fit_transform(bag_of_words)
lsa2.explained_variance_ratio_

array([0.00350624, 0.00740475])

In [76]:
display_topics(lsa2, tfidf2.get_feature_names(), 5)


Topic  0
game, card, cards, playing, deck

Topic  1
cards, playing, deck, printed, custom


Going to try again with ngrams:

In [66]:
bag_of_words_3 = tfidf3.fit_transform(X_blurb_2)

In [67]:
bag_of_words_3.shape

(5902, 56032)

In [68]:
lsa3 = TruncatedSVD(4)
doc_topic = lsa3.fit_transform(bag_of_words_3)
lsa3.explained_variance_ratio_

array([0.00225564, 0.00421463, 0.00280094, 0.00206129])

In [69]:
display_topics(lsa3, tfidf3.get_feature_names(), 5)


Topic  0
pron, game, card, card game, play

Topic  1
card, playing card, playing, deck, print

Topic  2
game, card game, player, game player, fun

Topic  3
card game, card, pron, fun, play card


In [70]:
tfidf4 = TfidfVectorizer(stop_words='english', token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2), sublinear_tf=True)


In [71]:
bag_of_words_4 = tfidf4.fit_transform(X_blurb_2)

In [72]:
bag_of_words_3.shape

(5902, 56032)

In [84]:
lsa4 = TruncatedSVD(20)
doc_topic = lsa4.fit_transform(bag_of_words_4)
lsa4.explained_variance_ratio_

array([0.00174973, 0.00374813, 0.00240706, 0.00200494, 0.00174363,
       0.00153427, 0.00147485, 0.00140673, 0.00137667, 0.00130706,
       0.00127322, 0.00126129, 0.00121868, 0.00121328, 0.0011816 ,
       0.00113248, 0.00112065, 0.00111633, 0.00109033, 0.0010588 ])

In [85]:
display_topics(lsa4, tfidf4.get_feature_names(), 20)


Topic  0
pron, game, card, card game, play, playing, player, deck, playing card, fun, game pron, new, world, make, friend, create, design, base, game player, pron pron

Topic  1
card, playing card, playing, deck, print, design, custom, deck playing, uspcc, play card, print uspcc, card deck, card inspire, magician, card print, custom deck, inspire, luxury, poker, card design

Topic  2
card game, game, card, player, game player, fun, strategy, fast, play, age, board, board game, fun card, strategic, family, minute, play card, fast paced, paced, base

Topic  3
adventure, world, rpg, game, fantasy, action, base, playing, role, role playing, set, playing game, story, adventure game, player, new, mobile, tabletop, open world, inspire

Topic  4
play, play card, deck, deck play, pron play, inspire, play pron, game play, rpg, play game, card inspire, world, unique, adventure, free, custom deck, custom, art, magician, cardistry

Topic  5
game, fun, board, mobile, board game, new, video, playing

do a plot of inertia, see where it levels off with # of components, and then run kmeans clustering on those components to come up with clusters for projects

In [87]:
type(doc_topic)

numpy.ndarray

In [102]:
lsa4_df = pd.DataFrame(data=doc_topic)

In [103]:
lsa4_df.shape

(5902, 20)

In [104]:
lsa4_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.118766,0.1825,-0.037265,0.037267,-0.044946,0.049501,0.002541,-0.00677,-0.007811,0.018801,-0.015498,-0.002218,0.032596,0.022209,0.004208,0.004203,0.001338,-0.001968,-8.8e-05,0.024586
1,0.052765,-0.03361,-0.045497,-0.031714,-0.018083,0.01346,0.014778,0.004725,-0.024136,-0.026924,0.023276,0.008448,0.065364,-0.000383,-0.012131,-0.051124,0.034466,0.001643,0.001262,0.041218
2,0.017984,0.001349,-0.002976,0.031297,0.003618,-0.023423,0.010418,0.001093,0.014256,-0.019537,0.005158,-0.012001,-0.007438,-0.001096,-0.019998,-0.001146,-0.009617,-0.009471,-0.014659,-0.006911
3,0.035802,-0.011548,-0.00999,0.004433,-0.010727,0.008696,0.012592,0.040098,-0.017851,-0.02746,-0.014112,0.024724,0.065946,-0.011778,-0.024843,-0.049866,-0.035768,-0.015192,-0.001631,-0.006819
4,0.085047,-0.05567,-0.040441,0.037124,0.009305,0.002576,-0.034718,0.003229,-0.022656,0.009985,-0.016675,-0.076651,-0.018325,0.02305,0.023408,-0.012605,0.046087,-0.004715,-0.014554,-0.056953


In [105]:
lsa4_df = lsa4_df.add_prefix('topic_')

In [106]:
lsa4_df.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,0.118766,0.1825,-0.037265,0.037267,-0.044946,0.049501,0.002541,-0.00677,-0.007811,0.018801,-0.015498,-0.002218,0.032596,0.022209,0.004208,0.004203,0.001338,-0.001968,-8.8e-05,0.024586
1,0.052765,-0.03361,-0.045497,-0.031714,-0.018083,0.01346,0.014778,0.004725,-0.024136,-0.026924,0.023276,0.008448,0.065364,-0.000383,-0.012131,-0.051124,0.034466,0.001643,0.001262,0.041218
2,0.017984,0.001349,-0.002976,0.031297,0.003618,-0.023423,0.010418,0.001093,0.014256,-0.019537,0.005158,-0.012001,-0.007438,-0.001096,-0.019998,-0.001146,-0.009617,-0.009471,-0.014659,-0.006911
3,0.035802,-0.011548,-0.00999,0.004433,-0.010727,0.008696,0.012592,0.040098,-0.017851,-0.02746,-0.014112,0.024724,0.065946,-0.011778,-0.024843,-0.049866,-0.035768,-0.015192,-0.001631,-0.006819
4,0.085047,-0.05567,-0.040441,0.037124,0.009305,0.002576,-0.034718,0.003229,-0.022656,0.009985,-0.016675,-0.076651,-0.018325,0.02305,0.023408,-0.012605,0.046087,-0.004715,-0.014554,-0.056953


In [110]:
df_topics = df.reset_index()
df_topics.head()

Unnamed: 0,index,final_state,id,name,blurb,category_name,blurb_processed,blurb_processed_2
0,1796,successful,269726791,Emanations Playing Cards | A Study of Real Magic,An artistic professional pack of playing cards...,Playing Cards,an artistic professional pack of playing cards...,an artistic professional pack of playing card ...
1,2575,successful,1697621882,FAT STACKS: The Pancake Stacking Strategy Card...,Make it rain pancakes in FAT STACKS. \nCreate ...,Tabletop Games,make it rain pancakes in fat stacks create the...,make -PRON- rain pancake in fat stack create t...
2,2594,successful,2046938895,Royal Heroz - Fantasy Strategy Playing Cards,"Endless playability, stunning artwork and fant...",Playing Cards,endless playability stunning artwork and fanta...,endless playability stunning artwork and fanta...
3,2665,successful,1642293087,Coral Islands : 2 interactive dice-stacking ga...,"In 'Coral', use dice to make shapes in 3D! In ...",Tabletop Games,in coral use dice to make shapes in d in islan...,in coral use dice to make shape in d in island...
4,2668,failed,932587626,3D Shopping with WalkTheWeb 3D Technology!,Online 3D Stores for businesses that blur 3D G...,Live Games,online d stores for businesses that blur d gam...,online d store for business that blur d game w...


(5902, 7)

In [111]:
df_topics = pd.merge(df_topics, lsa4_df, left_index=True, right_index=True)

In [112]:
df_topics.shape

(5902, 28)

In [113]:
df_topics.head()

Unnamed: 0,index,final_state,id,name,blurb,category_name,blurb_processed,blurb_processed_2,topic_0,topic_1,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,1796,successful,269726791,Emanations Playing Cards | A Study of Real Magic,An artistic professional pack of playing cards...,Playing Cards,an artistic professional pack of playing cards...,an artistic professional pack of playing card ...,0.118766,0.1825,...,-0.015498,-0.002218,0.032596,0.022209,0.004208,0.004203,0.001338,-0.001968,-8.8e-05,0.024586
1,2575,successful,1697621882,FAT STACKS: The Pancake Stacking Strategy Card...,Make it rain pancakes in FAT STACKS. \nCreate ...,Tabletop Games,make it rain pancakes in fat stacks create the...,make -PRON- rain pancake in fat stack create t...,0.052765,-0.03361,...,0.023276,0.008448,0.065364,-0.000383,-0.012131,-0.051124,0.034466,0.001643,0.001262,0.041218
2,2594,successful,2046938895,Royal Heroz - Fantasy Strategy Playing Cards,"Endless playability, stunning artwork and fant...",Playing Cards,endless playability stunning artwork and fanta...,endless playability stunning artwork and fanta...,0.017984,0.001349,...,0.005158,-0.012001,-0.007438,-0.001096,-0.019998,-0.001146,-0.009617,-0.009471,-0.014659,-0.006911
3,2665,successful,1642293087,Coral Islands : 2 interactive dice-stacking ga...,"In 'Coral', use dice to make shapes in 3D! In ...",Tabletop Games,in coral use dice to make shapes in d in islan...,in coral use dice to make shape in d in island...,0.035802,-0.011548,...,-0.014112,0.024724,0.065946,-0.011778,-0.024843,-0.049866,-0.035768,-0.015192,-0.001631,-0.006819
4,2668,failed,932587626,3D Shopping with WalkTheWeb 3D Technology!,Online 3D Stores for businesses that blur 3D G...,Live Games,online d stores for businesses that blur d gam...,online d store for business that blur d game w...,0.085047,-0.05567,...,-0.016675,-0.076651,-0.018325,0.02305,0.023408,-0.012605,0.046087,-0.004715,-0.014554,-0.056953


In [114]:
with open('/Users/robertpagano/metis_data/project_4/text_dataframes/games_and_lsatopics.pickle', 'wb') as to_write:
    pickle.dump(df_topics, to_write)

IN the interest of getting to my MVP, I'm going to move onto modeling with these 20 topics. 

MVP Steps:

 - Create models - RF and Logistic
     - Setup structure for this - i.e. cross val, metrics, etc.
     - Create models with and without LSA components (and maybe ONLY LSA components)
     - Create baseline using %rate of $ / %rate needed > 1, see how it compares to both

Future Steps:
 - add specific stop words (game, etc) and stemming
 - NMF
 - LDA
 - Look into Inertia scores for # of topics?
 - Further EDA for model (Consider adding more categories)
 - Go back to modelling. If haven't found reason for # of topics, CV on this
 - Consider adding more categories
 - Presentation Slides
 - Clustering on topics (if not LDA)
 - Recommender