# References

https://github.com/jasonwei20/eda_nlp/blob/04ab29c5b18d2d72f9fa5b304322aaf4793acea0/code/eda.py#L86

# Imports

In [1]:
import pandas as pd
pd.options.display.max_columns = 50   # max displayed columns
pd.options.display.max_colwidth = 280 # width of a column

import matplotlib.pyplot as plt
from collections import defaultdict

import random
random.seed(42)

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
sw = stopwords.words('english')

from googletrans import Translator

import xgboost
# from imblearn.over_sampling import SMOTE 




NB_n_jobs = -2

In [2]:
#nltk.download('omw-1.4')

In [3]:
import sys
sys.path.append( '../../src' )
from pandas_functions import *

In [4]:
dataFolder_path = '../../data/'

# Helper functions

In [5]:
# def get_wordnet_pos(treebank_tag):
#     '''
#     Translate nltk POS to wordnet tags
#     '''
#     if treebank_tag.startswith('J'):
#         return wordnet.ADJ
#     elif treebank_tag.startswith('V'):
#         return wordnet.VERB
#     elif treebank_tag.startswith('N'):
#         return wordnet.NOUN
#     elif treebank_tag.startswith('R'):
#         return wordnet.ADV
#     else:
#         return wordnet.NOUN


def doc_preparer(doc, stem = False, stop_words=sw, implement_transtranslate = False):
    '''

    :param doc: a document from the satire corpus 
    :return: a document string with words which have been 
            lemmatized, 
            parsed for stopwords, 
            made lowercase,
            and stripped of punctuation and numbers.
    '''
    #Stemming seems to work better. Lemming can't identify plurals of products
    
    
#     lemmed_keywords = ['apple',
#                 'ipad', 'ipads',
#                 'iphone', 'iphones',
#                 'itunes',
#                 'google', 'googled',
#                 'android', 'droid', 'androids', 'droids',
#                 'circle', 'circles'
#                 'app', 'apps']

#     stemmed_keywords = ['appl',
#                         'ipad',
#                         'iphon',
#                         'itun',
#                         'googl',
#                         'android',
#                         'droid',
#                         'circl',
#                         'app']

    regex_token = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    doc = regex_token.tokenize(doc)
    doc = [word.lower() for word in doc]
    doc = [word for word in doc if word not in sw]
#     doc = pos_tag(doc)
#     doc = [(word[0], get_wordnet_pos(word[1])) for word in doc]
#     lemmatizer = WordNetLemmatizer()
#     doc = [lemmatizer.lemmatize(word[0], word[1]) for word in doc]
#     doc = [word for word in doc if word in lemmed_keywords]

#     if implement_transtranslate:
#         doc = 
   
    p_stemmer = nltk.stem.PorterStemmer()
    if stem:
        doc = [p_stemmer.stem(word) for word in doc]
    return ' '.join(doc)


def cv_printScores(cv_metric):
    print('CV Results')
    print('='*32)
    print('Accuracy')
    print('-'*32)
    print(f"Training accuracy: {cv_metric['train_accuracy'].mean():.3f}")
    print(f"Test accuracy:     {cv_metric['test_accuracy'].mean():.3f}")
    print('F-1 Score')
    print('-'*32)
    print(f"Training F1 score: {cv_metric['train_f1_macro'].mean():.3f}")
    print(f"Test F1 score:     {cv_metric['test_f1_macro'].mean():.3f}")
    

In [6]:
# from googletrans import Translator
# translator = Translator()

# def German_translation(x):
#     print(x)    
#     german_translation = translator.translate(x, dest='de')    
#     return german_translation.text

# def English_translation(x):
#     print(x)    
#     english_translation = translator.translate(x, dest='en')    
#     return english_translation.text

In [7]:
'''

Original Author: Jason Wei
EDA: Easy Data Augmentation Techniques for Boosting Performance on Text Classification Tasks:
https://github.com/jasonwei20/eda_nlp/blob/04ab29c5b18d2d72f9fa5b304322aaf4793acea0/code/eda.py#L86

'''

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
#             print(l.name())
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])

            if (synonym in sw):
                pass
            synonyms.add(synonym) 
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)


def synonym_augmentation(sentence, numOfWordsToSyn=1, numOfExtraSentences=2):
    '''
    sentence: string to augment using synomization
    numOfWordsToSyn: number of words in the sentence to synonymize
    numOfExtraSentences: number of augmented sentences to return. 
    '''
    new_sentences = []
    sentence = sentence.lower()
    original_words = sentence.split()
    
    if len(original_words)<numOfWordsToSyn:
        numOfWordsToSyn = len(original_words)
        
    random_word_list = list(set(original_words))
    random.shuffle(random_word_list)
    num_replaced = 0
    
    # Randomly pick a word in list 
    # Pick a random synonym that is not in sw list
    # Replace that word in our list of words    
    for i in range(0,numOfExtraSentences):
        new_words = original_words
        for random_word in random_word_list:
            synonyms = get_synonyms(random_word)
            if len(synonyms) >= 1:
                synonym = random.choice(list(synonyms))

                #new_words is rebuilt with synonym replacement, while maintaining order of words
                new_words = [synonym if word == random_word else word for word in new_words]
                random_word_list.remove(random_word)
                num_replaced += 1
            if num_replaced >= numOfWordsToSyn: #only replace up to n words
                break
                
        #Form new sentence by joining new words
        s = ' '.join(new_words).lower().strip().replace('  ', ' ')
        
        # add sentences to our list of synonymized sentences
        # if not already there and not same as the original sentence
        if (s not in new_sentences) & (s!=sentence):
            new_sentences.append(s)

    return new_sentences

In [8]:
synonym_augmentation("holy shit, I can't believe this works",numOfWordsToSyn=5,numOfExtraSentences=5)

["holy place shit, i can't trust this whole shebang",
 "holy shit, ace can't believe this works"]

In [9]:
data_df = pd.read_csv(dataFolder_path+'judge_1377884607_tweet_product_company.csv')

In [10]:
dataFrame_info(data_df)

Datframe has 8721 rows and 3 columns


Info Table:,Zeroes,Zeroes,Nulls,Nulls,Uniques,Uniques,Missing/Unknown,Missing/Unknown,Mean,Median
Details:,Count,Fraction,Count,Fraction,Count,Fraction,Count,Fraction,Unnamed: 9_level_1,Unnamed: 10_level_1
Columns:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
tweet_text,0,0.00 %,1,0.01 %,8694,99.69 %,0,0.00 %,0.0,0.0
emotion_in_tweet_is_directed_at,0,0.00 %,5552,63.66 %,10,0.11 %,0,0.00 %,0.0,0.0
is_there_an_emotion_directed_at_a_brand_or_product,0,0.00 %,0,0.00 %,4,0.05 %,0,0.00 %,0.0,0.0


Looking at the 1 null in tweet text

In [11]:
data_df[data_df.tweet_text.isna()]

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
6,,,No emotion toward brand or product


In [12]:
data_df.dropna(subset=['tweet_text'],inplace=True)

In [13]:
data_df.shape

(8720, 3)

In [14]:
dataFrame_info(data_df)

Datframe has 8720 rows and 3 columns


Info Table:,Zeroes,Zeroes,Nulls,Nulls,Uniques,Uniques,Missing/Unknown,Missing/Unknown,Mean,Median
Details:,Count,Fraction,Count,Fraction,Count,Fraction,Count,Fraction,Unnamed: 9_level_1,Unnamed: 10_level_1
Columns:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
tweet_text,0,0.00 %,0,0.00 %,8693,99.69 %,0,0.00 %,0.0,0.0
emotion_in_tweet_is_directed_at,0,0.00 %,5551,63.66 %,10,0.11 %,0,0.00 %,0.0,0.0
is_there_an_emotion_directed_at_a_brand_or_product,0,0.00 %,0,0.00 %,4,0.05 %,0,0.00 %,0.0,0.0


Lets look at the emotion quotient column

In [15]:
data_df.is_there_an_emotion_directed_at_a_brand_or_product.value_counts()

No emotion toward brand or product    5155
Positive emotion                      2869
Negative emotion                       545
I can't tell                           151
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

Major class imbalance. Should consider dropping "I can't tell". 

In [16]:
# data_df[data_df.is_there_an_emotion_directed_at_a_brand_or_product == "I can't tell" ]

data_df = data_df[data_df.is_there_an_emotion_directed_at_a_brand_or_product != "I can't tell" ]

In [17]:
data_df.shape

(8569, 3)

Lets look at some of the most common words

In [18]:
word_freq = FreqDist()
for tweet in data_df['tweet_text'].map(lambda x:doc_preparer(x,stem=False)):
    for word in tweet.split():
        word_freq[word] +=1
word_freq.most_common(n=50)

[('sxsw', 9116),
 ('mention', 6851),
 ('link', 4077),
 ('rt', 2925),
 ('ipad', 2848),
 ('google', 2504),
 ('apple', 2184),
 ('quot', 1582),
 ('iphone', 1497),
 ('store', 1399),
 ('new', 1057),
 ('austin', 921),
 ('amp', 803),
 ('app', 792),
 ('circles', 639),
 ('social', 633),
 ('launch', 628),
 ('today', 566),
 ('android', 565),
 ('pop', 543),
 ('network', 447),
 ('via', 400),
 ('line', 391),
 ('get', 383),
 ('free', 378),
 ('called', 353),
 ('mobile', 342),
 ('party', 335),
 ('sxswi', 333),
 ('major', 301),
 ('one', 297),
 ('like', 275),
 ('time', 262),
 ('w', 261),
 ('check', 257),
 ('temporary', 254),
 ('opening', 242),
 ('possibly', 240),
 ('day', 231),
 ('people', 223),
 ('see', 217),
 ('downtown', 216),
 ('mayer', 212),
 ('great', 211),
 ('going', 211),
 ('maps', 211),
 ('apps', 210),
 ('go', 203),
 ('popup', 198),
 ('need', 196)]

Adding venue specific words and twitter specific words to stopwords

In [19]:
#Maybe don't add mention? and link?
sw.extend(['sxsw','rt','quot','austin','sxswi',
           'mention','link',
           'today','w'
          ])

In [20]:
word_freq = FreqDist()
for tweet in data_df['tweet_text'].map(lambda x:doc_preparer(x,stem=True)):
    for word in tweet.split():
        word_freq[word] +=1
word_freq.most_common(n=50)

[('ipad', 2935),
 ('googl', 2508),
 ('appl', 2187),
 ('iphon', 1505),
 ('store', 1437),
 ('new', 1057),
 ('app', 1002),
 ('amp', 803),
 ('launch', 802),
 ('circl', 654),
 ('social', 637),
 ('android', 565),
 ('pop', 558),
 ('get', 514),
 ('open', 498),
 ('network', 468),
 ('line', 440),
 ('go', 416),
 ('via', 400),
 ('call', 389),
 ('parti', 387),
 ('free', 378),
 ('mobil', 345),
 ('come', 326),
 ('like', 309),
 ('use', 309),
 ('major', 306),
 ('win', 305),
 ('time', 304),
 ('one', 301),
 ('check', 300),
 ('day', 280),
 ('map', 264),
 ('possibl', 254),
 ('temporari', 254),
 ('see', 250),
 ('need', 238),
 ('look', 228),
 ('design', 225),
 ('peopl', 223),
 ('make', 219),
 ('downtown', 216),
 ('mayer', 213),
 ('great', 211),
 ('popup', 199),
 ('know', 196),
 ('marissa', 186),
 ('talk', 184),
 ('think', 182),
 ('set', 181)]

In [21]:
data_df['stemmed_tokens'] = data_df['tweet_text'].map(lambda x:doc_preparer(x,stem=True))

In [22]:
data_df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,stemmed_tokens
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion,wesley g iphon hr tweet rise dead need upgrad plugin station
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion,jessede know fludapp awesom ipad iphon app like appreci design also give free ts
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion,swonderlin wait ipad also sale
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion,hope year festiv crashi year iphon app
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion,sxtxstate great stuff fri marissa mayer googl tim reilli tech book confer amp matt mullenweg wordpress
...,...,...,...,...
8716,Ipad everywhere. #SXSW {link},iPad,Positive emotion,ipad everywher
8717,"Wave, buzz... RT @mention We interrupt your regularly scheduled #sxsw geek programming with big news {link} #google #circles",,No emotion toward brand or product,wave buzz interrupt regularli schedul geek program big news googl circl
8718,"Google's Zeiger, a physician never reported potential AE. Yet FDA relies on physicians. &quot;We're operating w/out data.&quot; #sxsw #health2dev",,No emotion toward brand or product,googl zeiger physician never report potenti ae yet fda reli physician oper data health dev
8719,Some Verizon iPhone customers complained their time fell back an hour this weekend. Of course they were the New Yorkers who attended #SXSW.,,No emotion toward brand or product,verizon iphon custom complain time fell back hour weekend cours new yorker attend


# Label Encoding the sentinment column

In [23]:
le = LabelEncoder()
data_df['sentiment_target'] = le.fit_transform(data_df.is_there_an_emotion_directed_at_a_brand_or_product)
le.classes_

array(['Negative emotion', 'No emotion toward brand or product',
       'Positive emotion'], dtype=object)

In [24]:
data_df


Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,stemmed_tokens,sentiment_target
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion,wesley g iphon hr tweet rise dead need upgrad plugin station,0
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion,jessede know fludapp awesom ipad iphon app like appreci design also give free ts,2
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion,swonderlin wait ipad also sale,2
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion,hope year festiv crashi year iphon app,0
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion,sxtxstate great stuff fri marissa mayer googl tim reilli tech book confer amp matt mullenweg wordpress,2
...,...,...,...,...,...
8716,Ipad everywhere. #SXSW {link},iPad,Positive emotion,ipad everywher,2
8717,"Wave, buzz... RT @mention We interrupt your regularly scheduled #sxsw geek programming with big news {link} #google #circles",,No emotion toward brand or product,wave buzz interrupt regularli schedul geek program big news googl circl,1
8718,"Google's Zeiger, a physician never reported potential AE. Yet FDA relies on physicians. &quot;We're operating w/out data.&quot; #sxsw #health2dev",,No emotion toward brand or product,googl zeiger physician never report potenti ae yet fda reli physician oper data health dev,1
8719,Some Verizon iPhone customers complained their time fell back an hour this weekend. Of course they were the New Yorkers who attended #SXSW.,,No emotion toward brand or product,verizon iphon custom complain time fell back hour weekend cours new yorker attend,1


# Define X,y, train-test-split

In [25]:
X = data_df['stemmed_tokens']
y = data_df['sentiment_target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)

In [26]:
len(word_freq.keys())

6864

In [27]:
df = pd.merge(X_train, y_train, right_index = True,
               left_index = True)
df[df['sentiment_target'] == 0]
for each in (df[df['sentiment_target'] == 0]).stemmed_tokens:
    print(each)
    print(synonym_augmentation(each,numOfWordsToSyn=5))
    break

guess android app time
['gauge android app clock time', 'guess humanoid app time']


In [28]:
y_train.value_counts()

1    3642
2    1963
0     393
Name: sentiment_target, dtype: int64

In [29]:
X_train

7017                                     anyon know appl store still ipad like run
2470                                      realli googl go call circl hmm see later
8131               thing got smarmcak writeup googl ontologist ontologyshoutoutwut
2655       get readi session design ipad interfac lynn teo ballroom convent center
3664                                  line alreadi form temp appl store open insan
                                           ...                                    
1455        ok come build custom map googl map night organ parti itinerari product
6078                                 media circu guy walk appl popup store st ipad
752                       googl launch major new social network call circl possibl
2720    save tree go cardless sign cardless contact tool environment android iphon
928                               offici app go android iphon ipad lt thank instal
Name: stemmed_tokens, Length: 5998, dtype: object

# CountVec with MNB

In [30]:
cvec = CountVectorizer()

X_train_vec = cvec.fit_transform(X_train)
X_train_vec = pd.DataFrame.sparse.from_spmatrix(X_train_vec)
X_train_vec.columns = sorted(cvec.vocabulary_)
X_train_vec.set_index(y_train.index, inplace=True)
# X_train_vec

In [31]:
mnb = MultinomialNB()
mnb.fit(X_train_vec,y_train)
cvec_mnb_cvResults = cross_validate(mnb,
                                      X_train_vec,
                                      y_train,
                                      scoring=('accuracy', 'f1_macro'),
                                      cv=4,
                                      verbose=1,
                                      n_jobs = NB_n_jobs,
                                      return_train_score=True)

cv_printScores(cvec_mnb_cvResults)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 15 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.826
Test accuracy:     0.654
F-1 Score
--------------------------------
Training F1 score: 0.740
Test F1 score:     0.509


[Parallel(n_jobs=-2)]: Done   4 out of   4 | elapsed:    4.3s finished


In [32]:
X_test_vec = cvec.transform(X_test)
X_test_vec = pd.DataFrame.sparse.from_spmatrix(X_test_vec)
X_test_vec.columns = sorted(cvec.vocabulary_)
X_test_vec.set_index(y_test.index, inplace=True)

mnb.score(X_test_vec,y_test)

0.6549980552314275

# TiffyDiffy with MNB

In [33]:
tvec = TfidfVectorizer()

X_train_vec = tvec.fit_transform(X_train)
X_train_vec = pd.DataFrame.sparse.from_spmatrix(X_train_vec)
X_train_vec.columns = sorted(tvec.vocabulary_)
X_train_vec.set_index(y_train.index, inplace=True)
# X_train_vec

In [34]:
mnb = MultinomialNB()
mnb.fit(X_train_vec,y_train)
tvec_mnb_cvResults = cross_validate(mnb,
                                      X_train_vec,
                                      y_train,
                                      scoring=('accuracy', 'f1_macro'),
                                      cv=4,
                                      verbose=1,
                                      n_jobs = NB_n_jobs,
                                      return_train_score=True)

cv_printScores(tvec_mnb_cvResults)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 15 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.742
Test accuracy:     0.654
F-1 Score
--------------------------------
Training F1 score: 0.486
Test F1 score:     0.378


[Parallel(n_jobs=-2)]: Done   4 out of   4 | elapsed:    3.6s finished


# CountVec with RF

In [35]:
cvec = CountVectorizer()

X_train_vec = cvec.fit_transform(X_train)
X_train_vec = pd.DataFrame.sparse.from_spmatrix(X_train_vec)
X_train_vec.columns = sorted(cvec.vocabulary_)
X_train_vec.set_index(y_train.index, inplace=True)
# X_train_vec

In [36]:
rfc = RandomForestClassifier()
rfc.fit(X_train_vec,y_train)
cvec_rfc_cvResults = cross_validate(rfc,
                                      X_train_vec,
                                      y_train,
                                      scoring=('accuracy', 'f1_macro'),
                                      cv=4,
                                      verbose=1,
                                      n_jobs = NB_n_jobs,
                                      return_train_score=True)

cv_printScores(cvec_mnb_cvResults)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 15 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.826
Test accuracy:     0.654
F-1 Score
--------------------------------
Training F1 score: 0.740
Test F1 score:     0.509


[Parallel(n_jobs=-2)]: Done   4 out of   4 | elapsed:   10.0s finished


In [37]:
X_test_vec = cvec.transform(X_test)
X_test_vec = pd.DataFrame.sparse.from_spmatrix(X_test_vec)
X_test_vec.columns = sorted(cvec.vocabulary_)
X_test_vec.set_index(y_test.index, inplace=True)

rfc.score(X_test_vec,y_test)

0.6818358615324777

# TiffyDiffy with RandomForest

In [38]:
tvec = TfidfVectorizer()

X_train_vec = tvec.fit_transform(X_train)
X_train_vec = pd.DataFrame.sparse.from_spmatrix(X_train_vec)
X_train_vec.columns = sorted(tvec.vocabulary_)
X_train_vec.set_index(y_train.index, inplace=True)
# X_train_vec

In [39]:
rfc = RandomForestClassifier()
rfc.fit(X_train_vec,y_train)
tvec_rfc_cvResults = cross_validate(rfc,
                                    X_train_vec,
                                    y_train,
                                    scoring=('accuracy', 'f1_macro'),
                                    cv=4,
                                    verbose=1,
                                    n_jobs=NB_n_jobs,
                                    return_train_score=True)

cv_printScores(tvec_rfc_cvResults)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 15 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.968
Test accuracy:     0.679
F-1 Score
--------------------------------
Training F1 score: 0.964
Test F1 score:     0.507


[Parallel(n_jobs=-2)]: Done   4 out of   4 | elapsed:    9.7s finished


In [40]:
# params = {}
# params['rfc__criterion'] = ['gini','entropy']
# params['rfc__n_estimators'] = np.arange(50,250,50)
# params['rfc__max_depth'] = np.arange(150,200,10)
# # params['rfc__max_leaf_nodes']=[4000,4500,5000]

# rfc_model_pipe = Pipeline([
#     ('vec',TfidfVectorizer()),
#    ('rfc',RandomForestClassifier(random_state=42,n_jobs=NB_n_jobs))
# ])

# rfc_gs1 = GridSearchCV(estimator=rfc_model_pipe,
#                            param_grid=params,
#                            n_jobs=NB_n_jobs,
#                            scoring=[ 'accuracy','precision_macro','recall_macro','f1_macro'],
#                            refit='accuracy',
#                            return_train_score=True)

# rfc_gs1.fit(X_train,y_train)
# prettyPrintGridCVResults(rfc_gs1)

In [41]:
# rfc_gs1.best_estimator_.score(X_test,y_test)

# TiffyDiffy with GradientBoosting Classifier

In [42]:
tvec = TfidfVectorizer()

X_train_vec = tvec.fit_transform(X_train)
X_train_vec = pd.DataFrame.sparse.from_spmatrix(X_train_vec)
X_train_vec.columns = sorted(tvec.vocabulary_)
X_train_vec.set_index(y_train.index, inplace=True)
# X_train_vec

In [43]:
grad = GradientBoostingClassifier()
grad.fit(X_train_vec,y_train)
tvec_grad_cvResults = cross_validate(grad,
                                      X_train_vec,
                                      y_train,
                                      scoring=('accuracy', 'f1_macro'),
                                      verbose=1,
                                      n_jobs = NB_n_jobs,
                                      return_train_score=True)

cv_printScores(tvec_grad_cvResults)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 15 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.737
Test accuracy:     0.670
F-1 Score
--------------------------------
Training F1 score: 0.571
Test F1 score:     0.446


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:   11.3s finished
