# Imports

In [1]:
import pandas as pd
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 280

import matplotlib.pyplot as plt
from collections import defaultdict


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
sw = stopwords.words('english')



In [2]:
import sys
sys.path.append( '../../src' )
from pandas_functions import *

In [3]:
dataFolder_path = '../../data/'

# Helper functions

In [27]:
# def get_wordnet_pos(treebank_tag):
#     '''
#     Translate nltk POS to wordnet tags
#     '''
#     if treebank_tag.startswith('J'):
#         return wordnet.ADJ
#     elif treebank_tag.startswith('V'):
#         return wordnet.VERB
#     elif treebank_tag.startswith('N'):
#         return wordnet.NOUN
#     elif treebank_tag.startswith('R'):
#         return wordnet.ADV
#     else:
#         return wordnet.NOUN


def doc_preparer(doc, stem = False, stop_words=sw):
    '''

    :param doc: a document from the satire corpus 
    :return: a document string with words which have been 
            lemmatized, 
            parsed for stopwords, 
            made lowercase,
            and stripped of punctuation and numbers.
    '''
    #Stemming seems to work better. Lemming can't identify plurals of products
    
    
#     lemmed_keywords = ['apple',
#                 'ipad', 'ipads',
#                 'iphone', 'iphones',
#                 'itunes',
#                 'google', 'googled',
#                 'android', 'droid', 'androids', 'droids',
#                 'circle', 'circles'
#                 'app', 'apps']

#     stemmed_keywords = ['appl',
#                         'ipad',
#                         'iphon',
#                         'itun',
#                         'googl',
#                         'android',
#                         'droid',
#                         'circl',
#                         'app']

    regex_token = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    doc = regex_token.tokenize(doc)
    doc = [word.lower() for word in doc]
    doc = [word for word in doc if word not in sw]
#     doc = pos_tag(doc)
#     doc = [(word[0], get_wordnet_pos(word[1])) for word in doc]
#     lemmatizer = WordNetLemmatizer()
#     doc = [lemmatizer.lemmatize(word[0], word[1]) for word in doc]
#     doc = [word for word in doc if word in lemmed_keywords]
    
    
    
    p_stemmer = nltk.stem.PorterStemmer()
    if stem:
        doc = [p_stemmer.stem(word) for word in doc if p_stemmer.stem(word)]
    return ' '.join(doc)

def cv_printScores(cv_metric):
    print('CV Results')
    print('='*32)
    print('Accuracy')
    print('-'*32)
    print(f"Training accuracy: {cv_metric['train_accuracy'].mean():.3f}")
    print(f"Training accuracy: {cv_metric['test_accuracy'].mean():.3f}")
    print('F-1 Score')
    print('-'*32)
    print(f"Training F1 score: {cv_metric['train_f1_macro'].mean():.3f}")
    print(f"Training F1 score: {cv_metric['test_f1_macro'].mean():.3f}")

In [5]:
data_df = pd.read_csv(dataFolder_path+'judge_1377884607_tweet_product_company.csv')

In [6]:
dataFrame_info(data_df)

Datframe has 8721 rows and 3 columns


Info Table:,Zeroes,Zeroes,Nulls,Nulls,Uniques,Uniques,Missing/Unknown,Missing/Unknown,Mean,Median
Details:,Count,Fraction,Count,Fraction,Count,Fraction,Count,Fraction,Unnamed: 9_level_1,Unnamed: 10_level_1
Columns:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
tweet_text,0,0.00 %,1,0.01 %,8694,99.69 %,0,0.00 %,0.0,0.0
emotion_in_tweet_is_directed_at,0,0.00 %,5552,63.66 %,10,0.11 %,0,0.00 %,0.0,0.0
is_there_an_emotion_directed_at_a_brand_or_product,0,0.00 %,0,0.00 %,4,0.05 %,0,0.00 %,0.0,0.0


Looking at the 1 null in tweet text

In [7]:
data_df[data_df.tweet_text.isna()]

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
6,,,No emotion toward brand or product


In [8]:
data_df.dropna(subset=['tweet_text'],inplace=True)

In [9]:
data_df.shape

(8720, 3)

In [10]:
dataFrame_info(data_df)

Datframe has 8720 rows and 3 columns


Info Table:,Zeroes,Zeroes,Nulls,Nulls,Uniques,Uniques,Missing/Unknown,Missing/Unknown,Mean,Median
Details:,Count,Fraction,Count,Fraction,Count,Fraction,Count,Fraction,Unnamed: 9_level_1,Unnamed: 10_level_1
Columns:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
tweet_text,0,0.00 %,0,0.00 %,8693,99.69 %,0,0.00 %,0.0,0.0
emotion_in_tweet_is_directed_at,0,0.00 %,5551,63.66 %,10,0.11 %,0,0.00 %,0.0,0.0
is_there_an_emotion_directed_at_a_brand_or_product,0,0.00 %,0,0.00 %,4,0.05 %,0,0.00 %,0.0,0.0


Lets look at the emotion quotient column

In [11]:
data_df.is_there_an_emotion_directed_at_a_brand_or_product.value_counts(normalize=True)

No emotion toward brand or product    0.591170
Positive emotion                      0.329014
Negative emotion                      0.062500
I can't tell                          0.017317
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: float64

Major class imbalance. Should consider dropping "I can't tell". 

In [12]:
# data_df[data_df.is_there_an_emotion_directed_at_a_brand_or_product == "I can't tell" ]

data_df = data_df[data_df.is_there_an_emotion_directed_at_a_brand_or_product != "I can't tell" ]

In [13]:
data_df.shape

(8569, 3)

Lets look at some of the most common words

In [14]:
word_freq = FreqDist()
for tweet in data_df['tweet_text'].map(lambda x:doc_preparer(x,stem=False)):
    for word in tweet.split():
        word_freq[word] +=1
word_freq.most_common(n=50)

[('sxsw', 9116),
 ('mention', 6851),
 ('link', 4077),
 ('rt', 2925),
 ('ipad', 2848),
 ('google', 2504),
 ('apple', 2184),
 ('quot', 1582),
 ('iphone', 1497),
 ('store', 1399),
 ('new', 1057),
 ('austin', 921),
 ('amp', 803),
 ('app', 792),
 ('circles', 639),
 ('social', 633),
 ('launch', 628),
 ('today', 566),
 ('android', 565),
 ('pop', 543),
 ('network', 447),
 ('via', 400),
 ('line', 391),
 ('get', 383),
 ('free', 378),
 ('called', 353),
 ('mobile', 342),
 ('party', 335),
 ('sxswi', 333),
 ('major', 301),
 ('one', 297),
 ('like', 275),
 ('time', 262),
 ('w', 261),
 ('check', 257),
 ('temporary', 254),
 ('opening', 242),
 ('possibly', 240),
 ('day', 231),
 ('people', 223),
 ('see', 217),
 ('downtown', 216),
 ('mayer', 212),
 ('great', 211),
 ('going', 211),
 ('maps', 211),
 ('apps', 210),
 ('go', 203),
 ('popup', 198),
 ('need', 196)]

Adding venue specific words and twitter specific words to stopwords

In [15]:
#Maybe don't add mention? and link?
sw.extend(['sxsw','rt','quot','austin','sxswi','mention','link'])

In [16]:
word_freq = FreqDist()
for tweet in data_df['tweet_text'].map(lambda x:doc_preparer(x,stem=True)):
    for word in tweet.split():
        word_freq[word] +=1
word_freq.most_common(n=50)

[('ipad', 2935),
 ('googl', 2508),
 ('appl', 2187),
 ('iphon', 1505),
 ('store', 1437),
 ('new', 1057),
 ('app', 1002),
 ('amp', 803),
 ('launch', 802),
 ('circl', 654),
 ('social', 637),
 ('today', 566),
 ('android', 565),
 ('pop', 558),
 ('get', 514),
 ('open', 498),
 ('network', 468),
 ('line', 440),
 ('go', 416),
 ('via', 400),
 ('call', 389),
 ('parti', 387),
 ('free', 378),
 ('mobil', 345),
 ('come', 326),
 ('like', 309),
 ('use', 309),
 ('major', 306),
 ('win', 305),
 ('time', 304),
 ('one', 301),
 ('check', 300),
 ('day', 280),
 ('map', 264),
 ('w', 261),
 ('possibl', 254),
 ('temporari', 254),
 ('see', 250),
 ('need', 238),
 ('look', 228),
 ('design', 225),
 ('peopl', 223),
 ('make', 219),
 ('downtown', 216),
 ('mayer', 213),
 ('great', 211),
 ('popup', 199),
 ('know', 196),
 ('marissa', 186),
 ('talk', 184)]

In [17]:
data_df['stemmed_tokens'] = data_df['tweet_text'].map(lambda x:doc_preparer(x,stem=True))

In [18]:
data_df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,stemmed_tokens
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion,wesley g iphon hr tweet rise dead need upgrad plugin station
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion,jessede know fludapp awesom ipad iphon app like appreci design also give free ts
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion,swonderlin wait ipad also sale
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion,hope year festiv crashi year iphon app
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion,sxtxstate great stuff fri marissa mayer googl tim reilli tech book confer amp matt mullenweg wordpress
...,...,...,...,...
8716,Ipad everywhere. #SXSW {link},iPad,Positive emotion,ipad everywher
8717,"Wave, buzz... RT @mention We interrupt your regularly scheduled #sxsw geek programming with big news {link} #google #circles",,No emotion toward brand or product,wave buzz interrupt regularli schedul geek program big news googl circl
8718,"Google's Zeiger, a physician never reported potential AE. Yet FDA relies on physicians. &quot;We're operating w/out data.&quot; #sxsw #health2dev",,No emotion toward brand or product,googl zeiger physician never report potenti ae yet fda reli physician oper w data health dev
8719,Some Verizon iPhone customers complained their time fell back an hour this weekend. Of course they were the New Yorkers who attended #SXSW.,,No emotion toward brand or product,verizon iphon custom complain time fell back hour weekend cours new yorker attend


# Label Encoding the sentinment column

In [19]:
le = LabelEncoder()
data_df['sentiment_target'] = le.fit_transform(data_df.is_there_an_emotion_directed_at_a_brand_or_product)
le.classes_

array(['Negative emotion', 'No emotion toward brand or product',
       'Positive emotion'], dtype=object)

In [20]:
data_df


Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,stemmed_tokens,sentiment_target
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion,wesley g iphon hr tweet rise dead need upgrad plugin station,0
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion,jessede know fludapp awesom ipad iphon app like appreci design also give free ts,2
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion,swonderlin wait ipad also sale,2
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion,hope year festiv crashi year iphon app,0
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion,sxtxstate great stuff fri marissa mayer googl tim reilli tech book confer amp matt mullenweg wordpress,2
...,...,...,...,...,...
8716,Ipad everywhere. #SXSW {link},iPad,Positive emotion,ipad everywher,2
8717,"Wave, buzz... RT @mention We interrupt your regularly scheduled #sxsw geek programming with big news {link} #google #circles",,No emotion toward brand or product,wave buzz interrupt regularli schedul geek program big news googl circl,1
8718,"Google's Zeiger, a physician never reported potential AE. Yet FDA relies on physicians. &quot;We're operating w/out data.&quot; #sxsw #health2dev",,No emotion toward brand or product,googl zeiger physician never report potenti ae yet fda reli physician oper w data health dev,1
8719,Some Verizon iPhone customers complained their time fell back an hour this weekend. Of course they were the New Yorkers who attended #SXSW.,,No emotion toward brand or product,verizon iphon custom complain time fell back hour weekend cours new yorker attend,1


# Define X,y, train-test-split

In [21]:
X = data_df['stemmed_tokens']
y = data_df['sentiment_target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)

In [22]:
len(word_freq.keys())

6866

# CountVec with MNB

In [29]:
cvec = CountVectorizer()

X_train_vec = cvec.fit_transform(X_train)
X_train_vec = pd.DataFrame.sparse.from_spmatrix(X_train_vec)
X_train_vec.columns = sorted(cvec.vocabulary_)
X_train_vec.set_index(y_train.index, inplace=True)
X_train_vec

Unnamed: 0,aapl,aaron,aarpbulletin,ab,abandon,abba,abc,aber,abil,abl,abnorm,abound,absolut,absolutley,abt,abuzz,academi,acc,accelerat,accept,access,accessori,accesssxsw,accident,accommod,...,zation,zazzl,zazzlesxsw,zazzlsxsw,ze,zeiger,zelda,zeldman,zero,zeu,zgd,zing,zinio,zip,zite,zlf,zm,zomb,zombi,zomg,zone,zoom,zuckerberg,zynga,zzz
8122,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6092,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2370,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1453,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3660,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3408,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3404,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6253,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [30]:
mnb = MultinomialNB()

cvec_mnb_1_cvResults = cross_validate(mnb,
                                      X_train_vec,
                                      y_train,
                                      scoring=('accuracy', 'f1_macro'),
                                      cv=5,
                                      verbose=1,
                                      n_jobs = -2,
                                      return_train_score=True)

cv_printScores(cvec_mnb_1_cvResults)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 15 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.818
Training accuracy: 0.644
F-1 Score
--------------------------------
Training F1 score: 0.736
Training F1 score: 0.499


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:    3.3s finished


# TiffyDiffy with MNB

In [31]:
tvec = TfidfVectorizer()

X_train_vec = tvec.fit_transform(X_train)
X_train_vec = pd.DataFrame.sparse.from_spmatrix(X_train_vec)
X_train_vec.columns = sorted(tvec.vocabulary_)
X_train_vec.set_index(y_train.index, inplace=True)
X_train_vec

Unnamed: 0,aapl,aaron,aarpbulletin,ab,abandon,abba,abc,aber,abil,abl,abnorm,abound,absolut,absolutley,abt,abuzz,academi,acc,accelerat,accept,access,accessori,accesssxsw,accident,accommod,...,zation,zazzl,zazzlesxsw,zazzlsxsw,ze,zeiger,zelda,zeldman,zero,zeu,zgd,zing,zinio,zip,zite,zlf,zm,zomb,zombi,zomg,zone,zoom,zuckerberg,zynga,zzz
8122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
mnb = MultinomialNB()

tvec_mnb_1_cvResults = cross_validate(mnb,
                                      X_train_vec,
                                      y_train,
                                      scoring=('accuracy', 'f1_macro'),
                                      cv=5,
                                      verbose=1,
                                      n_jobs = -2,
                                      return_train_score=True)

cv_printScores(tvec_mnb_1_cvResults)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 15 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.747
Training accuracy: 0.649
F-1 Score
--------------------------------
Training F1 score: 0.493
Training F1 score: 0.379


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:    3.3s finished
