In [1]:
import numpy as np
import pandas as pd

In [2]:
pip install -U spacy

Requirement already up-to-date: spacy in c:\users\rupal\anaconda3\lib\site-packages (2.2.4)
Note: you may need to restart the kernel to use updated packages.


In [3]:
!python -m spacy download en_core_web_md

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_md')


In [4]:
# read the data into a pandas dataframe
df=pd.read_csv('twitterMining.csv')
df.sample(frac=0.005)

Unnamed: 0.1,Unnamed: 0,user,location,Tweets,Subjectivity,Polarity,Positive,Negative,Neutral,compound,TextBlobAnalysis,VADERAnalysis,TextBlobAnalysis_Val,VADERAnalysis_Val
819,819,cyberdyne,Tehran,Backfilling my daily quotient of beer with Cor...,0.05,0.0,0.0,0.0,1.0,0.0,Neutral,Neutral,1,1
375,375,Gem_N_Eye_Radio,NO-where STL ✈ ATL ✈TUL✈ HTOWN,Wow this CORONA really got yall in an IMPOSSIB...,0.733333,-0.122222,0.275,0.0,0.725,0.5859,Negative,Positive,0,2
415,415,Freedom_24_7,United States,Try to defend sticking sticking pork barre...,0.0,0.0,0.0,0.0,1.0,0.0,Neutral,Neutral,1,1
586,586,Cfa89,/\/¯¯¯¯¯\/\ I♥CPT ZA,80s Yeah i mean corona is strictly for 100 or ...,0.59375,0.09375,0.196,0.0,0.804,0.296,Positive,Positive,2,2
757,757,DanieImcgrogan,CHY NA,Donald trump will literally joke his way into ...,0.55,0.25,0.224,0.0,0.776,0.5994,Positive,Positive,2,2


In [5]:
# setup the data
X, y = df['Tweets'], df['TextBlobAnalysis_Val']

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

Xtrain = Xtrain.copy()
Xtest = Xtest.copy()
ytrain = ytrain.copy()
ytest = ytest.copy()

In [6]:
def custom_tokenizer(doc):

    # use spacy to filter out noise
    tokens = [token.lemma_.lower() 
                        for token in doc 
                               if (
                                    len(token) >= 2 and # only preserve tokens that are greater than 2 characters long
                                    token.pos_ in ['PROPN', 'NOUN', 'ADJ', 'VERB', 'ADV'] and # only preserve selected pos
                                    #token.text in nlp.vocab and # check if token in vocab 
                                    token.is_alpha and # only preserve tokens that are fully alpha (not numeric or alpha-numeric)
                                    #not token.is_digit and # get rid of tokens that are fully numeric
                                    not token.is_punct and # get rid of tokens that are punctuations
                                    not token.is_space and # get rid of tokens that are spaces
                                    not token.is_stop and # get rid of tokens that are stop words
                                    not token.is_currency # get rid of tokens that denote currencies
                                )
                   ]

    # return cleaned-up text
    return ' '.join(tokens)

In [7]:
import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
corpus = nlp.pipe(Xtrain)
clean_corpus = [custom_tokenizer(doc) for doc in corpus]
Xtrain = pd.Series(clean_corpus)
Xtrain.head()


corpus = nlp.pipe(Xtest)
clean_corpus = [custom_tokenizer(doc) for doc in corpus]
Xtest = pd.Series(clean_corpus)
Xtest.head()

0                                         toppe corona
1    trend continue rare day march delhi well weath...
2     crush agree date play away mr oxygen problem don
3          balanced view forget check study retired dr
4                              guys corona invade hell
dtype: object

In [8]:
# setup the preprocessing->model pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        binary=False,
        use_idf=True, smooth_idf=True, # idf  - with smoothing
        norm='l2', # tfidf - l2 norm
        lowercase=True, stop_words='english', 
        #token_pattern='(?u)\\b\\w\\w+\\b', 
        min_df=1, max_df=1.0, max_features=None, 
        ngram_range=(1, 1)
    )),
    ('nb', MultinomialNB(
        fit_prior=True, # learn class prior-probabilities from data
        class_prior=None # none - go with whatever fit-prior says
    ))])



In [9]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'tfidf__sublinear_tf':['True','False'], 
   'nb__alpha': [0.00002, 0.000002, 0.002, 0.0000002,0.2, 0.1,0.001]}

gscv = GridSearchCV(clf, param_grid, cv=4, return_train_score = False)

In [10]:
gscv.fit(Xtrain, ytrain)

GridSearchCV(cv=4, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [11]:
# Uncomment below to see the results of hyperparameter tuning
print ("-"*100)
print(gscv.best_estimator_, "\n")
print ("-"*100)
print(gscv.best_score_, "\n")
print ("-"*100)
print(gscv.best_params_, "\n")
print ("-"*100)
print(gscv.cv_results_, "\n")
print ("-"*100)


----------------------------------------------------------------------------------------------------
Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf='True',
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=0.2,

In [12]:
from sklearn import metrics
ypred = gscv.best_estimator_.predict(Xtest)

from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print("confusion matrix")
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))
# TN,FP,FN,TP = metrics.confusion_matrix(y_true=ytest, y_pred=ypred).ravel()

0.53
confusion matrix
[[14 14 15]
 [ 7 62 14]
 [ 5 39 30]]
              precision    recall  f1-score   support

           0       0.54      0.33      0.41        43
           1       0.54      0.75      0.63        83
           2       0.51      0.41      0.45        74

    accuracy                           0.53       200
   macro avg       0.53      0.49      0.49       200
weighted avg       0.53      0.53      0.51       200

