# 4. Modeling

In [141]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [142]:
Location = r'../final_3/clean_data.csv'
df = pd.read_csv(Location,  encoding='utf-8')

In [143]:
df.head()

Unnamed: 0,article_description,article_pubtime,article_section,article_subtopic,article_tags,article_topic,author_name,content,title
0,If you're having trouble trying to be more inn...,2016-11-11T20:00:15+00:00,Planning for Growth,Growth Opportunities,"[Growth Opportunities, Innovation, Personal Pr...",Planning for Growth,Heather Willems,The Power of Non-Linear ThinkingHeather Willem...,The Power of Non-Linear Thinking
1,Huge growth may be possible for businesses of ...,2016-11-09T22:30:34+00:00,Planning for Growth,Operations,"[Business Expansion, Business Plan, Cash Flow,...",Planning for Growth,Andrew J. Sherman,3 Growth Strategies for Businesses With Limite...,3 Growth Strategies for Businesses With Limite...
2,What sets your business apart from the competi...,2016-11-11T22:30:47+00:00,Getting Customers,Marketing & Sales,"[Advertising, Brainstorming, Branding, Busines...",Getting Customers,Rieva Lesonsky,USP 101: How to Uncover Your Unique Selling Pr...,USP 101: How to Uncover Your Unique Selling Pr...
3,"Organizational changes are rarely fun or easy,...",2016-11-03T23:30:02+00:00,Building Your Team,Company Culture,"[Building Your Team, Business Expansion, Busin...",Building Your Team,David Niu,8 Strategies for Big (or Small) Organizational...,8 Strategies for Big (or Small) Organizational...
4,"When expanding your business, exporting to int...",2016-10-27T19:00:39+00:00,Planning for Growth,Growth Opportunities,"[Growth Opportunities, Importing & Exporting, ...",Planning for Growth,Greg Sandler,Lessons Learned: Four Stories From Experienced...,Lessons Learned: Four Stories From Experienced...


In [144]:
# split data into train and test sets
from sklearn.cross_validation import train_test_split
data_train, data_test = train_test_split(df, test_size=0.33, random_state=42)
len(data_train)

data_train.head()
labels = data_train['article_subtopic']
print "Labels \n", labels.head()

labels_test = data_test['article_subtopic']

Labels 
16                 Strategy
39                    Taxes
62    Social Media Strategy
72              Hiring & HR
7                  Research
Name: article_subtopic, dtype: object


In [145]:
growth_opp = df[df['article_subtopic'] == 'Growth Opportunities']
hiring_hr = df[df['article_subtopic'] == 'Hiring & HR']
print len(hiring_hr)
hiring_labels = pd.Series(["Hiring & HR","Hiring & HR","Hiring & HR","Hiring & HR","Hiring & HR","Hiring & HR"])
len(hiring_labels)

6


6

In [170]:
# vectorize
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(max_features = 1000, 
                             ngram_range=(1, 3), 
                             stop_words='english',
                             binary=True)
X_train_counts = count_vect.fit(data_train['content'])
#X_train_counts.shape
X_train_counts = count_vect.transform(data_train['content'])
names =  count_vect.get_feature_names()
names[115:145]

[u'build',
 u'building',
 u'built',
 u'business',
 u'business access',
 u'business growth',
 u'business owner',
 u'business owners',
 u'business read',
 u'business read articles',
 u'business situation',
 u'business situation article',
 u'business situation judgment',
 u'business strategy',
 u'business strategy approach',
 u'businesses',
 u'buy',
 u'california',
 u'california area',
 u'called',
 u'calls',
 u'came',
 u'campaigns',
 u'care',
 u'case',
 u'cash',
 u'cash flow',
 u'cause',
 u'center',
 u'ceo']

In [171]:
count_vect.vocabulary_.get(u'business')

118

In [172]:
count_vect.vocabulary_.get(u'employees')

277

In [173]:
count_vect.vocabulary_.get(u'stock')

In [174]:
count_vect.vocabulary_.get(u'bread')

In [175]:
# vectorize with tf-idf 
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print X_train_tf.shape

names = count_vect.get_feature_names()
names[50:100]

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print X_train_tfidf.shape

(55, 1000)
(55, 1000)


In [176]:
# use Naive-Bayes Classifier 
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_counts, labels)

In [177]:
print labels[0:5]

16                 Strategy
39                    Taxes
62    Social Media Strategy
72              Hiring & HR
7                  Research
Name: article_subtopic, dtype: object


In [178]:
# Test the classifier 
docs_new = ['Business strategy is important.', 
            "Hiring new employees is tough because candidates are hard to find.", 
            "Candidates hire new employees", 
            "Web SEO mobile",
           "Attracting new customers through advertising",
           "Employee benefits",
           "Economic Ups and Downs",
           "Ups and Downs",
           "Customer stuff",
           "mobile",
           "digital",
           "The Power of Non-Linear Thinking",
           "3 Growth Strategies for Businesses With Limited"]
X_new_counts = count_vect.transform(docs_new)

X_new_tf = tf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, category))

'Business strategy is important.' => Customer Relations
'Hiring new employees is tough because candidates are hard to find.' => Leadership
'Candidates hire new employees' => Leadership
'Web SEO mobile' => Customer Relations
'Attracting new customers through advertising' => Customer Relations
'Employee benefits' => Leadership
'Economic Ups and Downs' => Leadership
'Ups and Downs' => Leadership
'Customer stuff' => Customer Relations
'mobile' => Customer Relations
'digital' => Customer Relations
'The Power of Non-Linear Thinking' => Leadership
'3 Growth Strategies for Businesses With Limited' => Customer Relations


In [179]:
# creates a pipeline 
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
 ])

In [180]:
text_clf = text_clf.fit(data_train['content'], labels)

In [181]:
# get prediction accuracy using the test set
docs_test = data_test['content']
#print docs_test
predicted = text_clf.predict(docs_test)
#print predicted
np.mean(predicted == labels_test)            

30    When It Comes to Accounting, Do You Need a CPA...
0     The Power of Non-Linear ThinkingHeather Willem...
22    A New Spin on Going Viral: Using Contagion The...
31    How to Help Map Out a Year-Round Cash Flow Pla...
18    Growing Your Business: 7 Questions to Ask Befo...
28    How to Help Keep Your Cash Flow FlowingMeredit...
10    One Womans Journey From Finance to Full-Time S...
53    How E-Commerce Sites and Storefronts Alike Can...
4     Lessons Learned: Four Stories From Experienced...
12    Could Franchising Help Scale Your Business? Yo...
49    How to Help Protect Your Business From Todays ...
33    3 Spooky Debt Collection Stories to Scare You ...
69    Office Etiquette: How to Handle Employee Dispu...
35    7 Strategies That May Help With Explosive Busi...
70    From Elected Official to Government Consultant...
45    6 Reasons Why Your Website Should Have a BlogS...
75    Should Your Company Hire a Chief Digital Offic...
67    9 Best Practices to Help Develop a Top-Sel

0.035714285714285712

In [182]:
print data_train['article_subtopic'].value_counts()

Leadership               11
Customer Relations        9
Company Culture           6
Marketing & Sales         5
Hiring & HR               4
Strategy                  3
Taxes                     3
Operations                3
Cash Flow                 2
Financing                 2
Research                  2
Growth Opportunities      2
Social Media Strategy     1
Digital Tools             1
Productivity              1
Name: article_subtopic, dtype: int64


My classifier is only bucketing documents into two categories, despite feeding it several categories. This might be because there are a lot of "Leadership" and "Customer Relations" articles and not as many of the other categories.

Let's try it again using categories that all have the same number of observations - to see if that was causing it.

In [230]:
categories = ['Cash Flow', 'Financing', 'Research', 'Growth Opportunities', 'Strategy',
              'Taxes', 'Operations']
train_dropped = data_train[data_train['article_subtopic'].isin(categories)]
train_dropped.head(2)

Unnamed: 0,article_description,article_pubtime,article_section,article_subtopic,article_tags,article_topic,author_name,content,title
16,Small businesses can be just as volatile as th...,2016-08-23T22:30:24+00:00,Planning for Growth,Strategy,"[Accounts Receivable/Payable, Building Your Te...",Planning for Growth,Julie Bawden Davis,Economic Ups and Downs: How to Help Prepare Yo...,Economic Ups and Downs: How to Help Prepare Yo...
39,Being as tax-savvy as possible can help you—an...,2016-09-29T17:00:01+00:00,Managing Money,Taxes,"[Cash Flow, Managing Money, Tax Deductions, Ta...",Managing Money,John Suh,4 Things Every Business Owner Should Know Abou...,4 Things Every Business Owner Should Know Abou...


In [231]:
# vectorize
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(max_features = 1000, 
                             ngram_range=(1, 3), 
                             stop_words='english',
                             binary=True)
X_train_counts = count_vect.fit(train_dropped['content'])
#X_train_counts.shape
X_train_counts = count_vect.transform(train_dropped['content'])
names =  count_vect.get_feature_names()
names[115:145]

# vectorize with tf-idf 
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print X_train_tf.shape

names = count_vect.get_feature_names()
names[50:100]

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print X_train_tfidf.shape

# use Naive-Bayes Classifier 
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_counts, train_dropped['article_subtopic'])

# Test the classifier 
docs_new = ['Business strategy is important.', 
            "Hiring new employees is tough because candidates are hard to find.", 
            "Candidates hire new employees", 
            "Web SEO mobile",
           "Attracting new customers through advertising",
           "Employee benefits",
           "Economic Ups and Downs",
           "Ups and Downs",
           "Customer stuff",
           "mobile",
           "digital",
           "The Power of Non-Linear Thinking",
           "3 Growth Strategies for Businesses With Limited"]
X_new_counts = count_vect.transform(docs_new)

X_new_tf = tf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, category))

(17, 1000)
(17, 1000)
'Business strategy is important.' => Strategy
'Hiring new employees is tough because candidates are hard to find.' => Strategy
'Candidates hire new employees' => Strategy
'Web SEO mobile' => Operations
'Attracting new customers through advertising' => Operations
'Employee benefits' => Operations
'Economic Ups and Downs' => Operations
'Ups and Downs' => Operations
'Customer stuff' => Operations
'mobile' => Operations
'digital' => Operations
'The Power of Non-Linear Thinking' => Operations
'3 Growth Strategies for Businesses With Limited' => Operations


In [232]:
test_dropped = data_test[data_test['article_subtopic'].isin(categories)]
test_dropped.head(2)

Unnamed: 0,article_description,article_pubtime,article_section,article_subtopic,article_tags,article_topic,author_name,content,title
30,"As your business grows, you may want to consid...",2016-11-08T17:30:05+00:00,Managing Money,Cash Flow,"[Accounting, Cash Flow, Managing Money]",Managing Money,Hal Shelton,"When It Comes to Accounting, Do You Need a CPA...","When It Comes to Accounting, Do You Need a CPA..."
0,If you're having trouble trying to be more inn...,2016-11-11T20:00:15+00:00,Planning for Growth,Growth Opportunities,"[Growth Opportunities, Innovation, Personal Pr...",Planning for Growth,Heather Willems,The Power of Non-Linear ThinkingHeather Willem...,The Power of Non-Linear Thinking


In [233]:
text_clf = text_clf.fit(train_dropped['content'], train_dropped['article_subtopic'])

# get prediction accuracy using the test set
docs_test = test_dropped['content']
predicted = text_clf.predict(docs_test)
#print "Predicted:", predicted
np.mean(predicted == test_dropped['article_subtopic']) 
#test_dropped['article_subtopic']

0.15384615384615385

My accuracy slightly increased.. Promising! LOL.