In [2]:
import nltk
from textblob import TextBlob
import pandas as pd
import numpy as np
from nltk import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import NMF
import re
from scipy.spatial.distance import pdist, squareform, cdist
import matplotlib.pyplot as plt
from pattern.en import tag
from sklearn.neighbors import KDTree, NearestNeighbors
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import normalize
import pickle
%matplotlib inline

# Read and Clean Data

In [3]:
biz = pd.read_csv(
    '/Users/sarah/ds/metis/projects/kojak/USCompanyOverviewDataState.csv', encoding = 'latin-1')

In [4]:
text_id = biz.copy()

In [6]:
# Verify that test companies are present
text_id[(text_id['company_name_normalized'] == 'draftkings') | (text_id['company_name_normalized'] == 'fanduel') | (text_id['company_name_normalized'] == 'learnvest')]

Unnamed: 0,company_id,company_name,company_name_normalized,company_overview
7583,c:162946,DraftKings,draftkings,DraftKings is a daily fantasy sports website w...
24760,c:2640,FanDuel,fanduel,"FanDuel, launched in July 2009, is now the lea..."
30180,c:30814,LearnVest,learnvest,Our mission at LearnVest is to empower people ...


# Build document - term matrix using TFIDF

In [7]:
# Create custom tokenizer function that tags part of speech, takes only nouns, stems, and lemmatizes words
def noun_tokenize(text):
    '''Function to feed nltk processing functionality through sklearn tfidf vectorizer'''
    lemmatizer = nltk.WordNetLemmatizer()
    #stemmer = nltk.stem.porter.PorterStemmer()
    tokens = []
    tags=['NN', 'NNS', 'JJ']
    for item in [token[0] for token in tag(text) if token[1] in tags and token[0] not in stopwords]:
        #word = stemmer.stem_word(item.lower())
        word = lemmatizer.lemmatize(item)
        tokens.append(word)
    return tokens

In [8]:
#Add some custom stopwords based on term exploration
stopwords = nltk.corpus.stopwords.words('english')
stopwords += ['.', ',', '(', ')', "'", '"', ':', ';', '?', '!', '"', "'", '%', '~', '/', '•', '’', '-', '_', '|', '@', ':', '[', ']']
stopwords += ['company', 'new', 'provides', 'com', 'www', 'http', 'https', 'founded', 'use', 'allows', 
             'way', 'make', 'best', 'york', 'city', 'san', 'francisco', 'boston', 'austin', 'california', 'headquartered',
              'offices', 'incorporated', 'privately', 'massachusetts', 'twitter', 'facebook', 'techcrunch', 'crunchbase',
              'south', 'north', 'east', 'west' 'united', 'states', 'offer', 'canada', 'brazil', 'russia', 'africa',
              'coast', 'one', 'including', 'inc', 'aaa', 'abc', 'llc', 'seattle', 'los', 'angeles', 'sector', 'washington', 
             'illinois', 'formerly', 'corp', 'based', 'alto', 'menlo', 'park', 'chicago', 'brooklyn', 'silicon', 'valley',
              'also', '.com', '.net', '/month', '.we', './month', ':[', '.in', '.m', 'area', 'bay', 'metro', 'metropolitan',
              'dallas', 'dc', 'wide', 'southern']

# Business Data

In [8]:
#Initialize tfidf vectorizer, using custom tokenizer function, custom stopwords, use 1-3 ngrams, filtering out terms that don't
#appear in at least 30 business overviews or that occur in over 80% of overviews.
tfidf_vectorizer = TfidfVectorizer(tokenizer = noun_tokenize, stop_words = stopwords,
                                   ngram_range=(1, 3), min_df = 30, max_df = 0.8,
                                   token_pattern='(?u)\b[a-z][a-z][a-z]+\b', strip_accents='unicode')

In [9]:
tfidf = tfidf_vectorizer.fit_transform(text_id['company_overview'])



In [18]:
#Save tfidf model for later use
pickle.dump(tfidf, open("/Users/sarah/ds/metis/projects/kojak/biztfidf.p", "wb"))
pickle.dump(tfidf_vectorizer.get_feature_names(), open("/Users/sarah/ds/metis/projects/kojak/tfidf_feature_names.p", "wb"))
pickle.dump(tfidf_vectorizer.vocabulary_, open("/Users/sarah/ds/metis/projects/kojak/tfidf_vocabulary.p", "wb"))
pickle.dump(tfidf_vectorizer, open("/Users/sarah/ds/metis/projects/kojak/tfidf_vectorizer.p", "wb"))

In [15]:
#Put tfidf matrix into a dataframe to inspect.
tfidf_back = tfidf.toarray()
df_tfidf = pd.DataFrame(tfidf_back, columns=tfidf_vectorizer.get_feature_names())

In [16]:
df_tfidf.head()

Unnamed: 0,100%,1st,2012,2013,21st,21st century,24/7,360,3d,3rd,...,you?ll,you?re,youare,young,young adult,young professional,youth,zip,zip code,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
#Identify which terms have the highest occurrence and weight across all company descriptions
df_tfidf.sum()
word_weight_tfidf = df_tfidf.sum().reset_index()
word_weight_tfidf.columns = ['word', 'weight']
word_weight_tfidf = word_weight_tfidf.sort_values(by = 'weight', ascending = False)

In [20]:
word_weight_tfidf.head(20)

Unnamed: 0,word,weight
6667,service,1423.42065
775,business,1101.403591
7130,solution,1065.480936
5814,product,1064.994956
7684,technology,1053.23904
7070,software,818.760681
5087,online,813.292403
4687,mobile,801.265795
7559,system,786.433985
309,application,757.126341


# Build out topics using NMF on businesses

In [39]:
def print_top_words(model, feature_names, n_top_words):
    '''Print out the top words associated with each topic in topic model
       model: nmf model
       feature_names: term/ngram names
       n_top_words: number of words to inspect per topic
       '''
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [34]:
#Set number of topics and fit NMF model
n_topics = 200
nmf = NMF(init='nndsvd', n_components=n_topics, random_state=1).fit(tfidf)

In [35]:
#Save nmf model for later
pickle.dump(nmf, open("/Users/sarah/ds/metis/projects/kojak/biznmf.p", "wb"))

In [40]:
#Set number of words to display
n_top_words = 10

#Print topics and inspect to make sure they are coherent
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model:
Topic #0:
service product service technology service full service provider service business managed service service software service customer service
Topic #1:
social social medium social network mobile social social medium marketing social platform social game social web engagement medium marketing
Topic #2:
first first product world first first time first online available early technology first second industry first
Topic #3:
mobile mobile application mobile device mobile marketing web mobile mobile phone mobile web mobile app mobile platform mobile technology
Topic #4:
development software development application development web development design development development service product development research development strategy app development
Topic #5:
medium social medium digital medium medium marketing medium platform online medium medium technology medium business rich medium small medium
Topic #6:
garage door door garage repair door repair garage door repa

In [40]:
#Create a crosswalk between company name and index
biz_dict = text_id.reset_index().set_index('company_name')['index'].to_dict()

In [None]:
#Use nmf model results to add topic scores to companies from the document-term matrix
biz_arr = nmf.transform(tfidf)

In [41]:
#Save crosswalk and topic matrix for later use
pickle.dump(biz_dict, open("/Users/sarah/ds/metis/projects/kojak/bizname_index_crosswalk.p", "wb"))
pickle.dump(biz_arr, open("/Users/sarah/ds/metis/projects/kojak/biztopic_matrix.p", "wb"))

In [24]:
#Put topic matrix into a dataframe to inspect
topics = pd.DataFrame(biz_arr)

In [159]:
topics.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.0,0.006248,0.0,0,0,0.009518,0,0.0,0.000333,0.0,...,0,0.0,0.0,0.002822,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0,0,0.007395,0,0.01734,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.037348,0.0,0.00203,0,0,0.003824,0,0.0,0.0,0.0,...,0,0.000829,0.0,0.0,0.0,0.0,0.0,0.037574,0.0,0
3,0.016702,0.0,0.0,0,0,0.000133,0,0.0,0.0,0.000858,...,0,0.0,0.000378,0.006232,0.016191,0.026598,0.000114,0.0,0.015124,0
4,0.025034,0.0,0.0,0,0,0.0,0,0.011682,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.024209,0.02352,0.0,0
