In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install twython

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# Fetch every NLTK resource this notebook relies on, so a fresh kernel can run
# all cells: the VADER lexicon (sentiment scoring), the English stopword list
# (stopword filtering) and WordNet (lemmatisation).
for resource in ('vader_lexicon', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Load the vaccine tweets CSV from the Kaggle input directory and preview the first rows.
data=pd.read_csv('../input/all-covid19-vaccines-tweets/vaccination_all_tweets.csv')
data.head()

In [None]:
import re
def clean(text):
    """Strip URLs, bracketed spans, digit-bearing tokens and tag-like spans from a tweet.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example a missing ``NaN``
        cell or ``None``) is treated as an empty tweet instead of raising
        ``TypeError``.

    Returns
    -------
    str
        The text with the patterns below removed. Runs of whitespace
        (newlines included) are collapsed to a single space *before* the
        later removals, so gaps left by those removals are not re-collapsed.
    """
    if not isinstance(text, str):
        return ''
    # Raw strings keep the regex escapes (\S, \[, \w, \d) from being parsed as
    # invalid Python string escapes.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'\s+', ' ', text)                    # whitespace runs, newlines included
    text = re.sub(r'\[.*?\]', '', text)                 # [bracketed] spans
    text = re.sub(r'\w*\d\w*', '', text)                # word tokens containing a digit
    text = re.sub(r'<.*?>+', '', text)                  # <tag>-like spans
    return text

In [None]:
# Run the regex cleaner over every tweet, overwriting the text column.
data['text'] = data['text'].apply(clean)

In [None]:
sia = SIA()


def _label_from_compound(compound):
    """Map a VADER compound score to 'Positive', 'Negative' or 'Neutral' using the +/-0.05 cut-offs."""
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'


# Scoring through Series.apply keeps every score and label attached to the row
# it came from, whatever the frame's index looks like.
compound_scores = data['text'].apply(lambda tweet: sia.polarity_scores(tweet)['compound'])
data['sentiment'] = compound_scores.apply(_label_from_compound)

# Plain-list views of the scores and labels under the names `scores` / `sentiment`.
scores = compound_scores.tolist()
sentiment = data['sentiment'].tolist()

In [None]:
import string

def clean_text(text):
    """Return ``text`` coerced to ``str``, lower-cased, with every ``string.punctuation`` character removed."""
    lowered = str(text).lower()
    # Character class built from all characters in string.punctuation.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return re.sub(punctuation_class, '', lowered)
# Lower-case and de-punctuate every tweet, overwriting the text column.
data['text'] = data['text'].apply(clean_text)

data['text']

In [None]:
# Working frame for the token-level steps; starts with just the cleaned text column.
df = pd.DataFrame({'text': data['text']})
def tokenization(text):
    """Split ``text`` into word tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        The tokens in order. Empty strings, which ``re.split`` emits when the
        text starts or ends with a separator (or is empty), are dropped.
    """
    # Raw string so ``\W`` reaches the regex engine instead of being read as a string escape.
    return [token for token in re.split(r'\W+', text) if token]

df['tokenized'] = df['text'].apply(lambda x: tokenization(x.lower()))

# A set gives constant-time membership checks; the stopword collection is
# consulted once per token in remove_stopwords.
stopword = set(nltk.corpus.stopwords.words('english'))


def remove_stopwords(text):
    """Return the tokens of ``text`` (a list of str) that are not in ``stopword``, order preserved."""
    return [word for word in text if word not in stopword]


df['No_stopwords'] = df['tokenized'].apply(remove_stopwords)

ps = nltk.PorterStemmer()


def stemming1(text):
    """Porter-stem every token in ``text`` (a list of str) and return the stems as a new list."""
    return list(map(ps.stem, text))


df['stemmed_porter'] = df['No_stopwords'].apply(stemming1)

from nltk.stem.snowball import SnowballStemmer
s_stemmer = SnowballStemmer(language='english')


def stemming2(text):
    """Snowball-stem every token in ``text`` (a list of str) and return the stems as a new list."""
    return list(map(s_stemmer.stem, text))


df['stemmed_snowball'] = df['No_stopwords'].apply(stemming2)

wn = nltk.WordNetLemmatizer()


def lemmatizer(text):
    """Lemmatize every token in ``text`` (a list of str) with WordNet and return the results as a new list."""
    return list(map(wn.lemmatize, text))


df['lemmatized'] = df['No_stopwords'].apply(lemmatizer)



In [None]:
df.head(10)

In [None]:
# Number of non-null text rows per sentiment label, largest group first.
temp = (
    data.groupby('sentiment')['text']
    .count()
    .reset_index()
    .sort_values(by='text', ascending=False)
)
temp.style.background_gradient(cmap='coolwarm_r')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
n = nltk.WordNetLemmatizer()

from nltk.tokenize import RegexpTokenizer
# Tokenise on runs of ASCII letters/digits only.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
# Unigram bag-of-words model using sklearn's built-in English stop-word list.
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
# Learn the vocabulary from, and vectorise, every row of df['text'].
text_counts = cv.fit_transform(df['text'])

In [None]:
print (text_counts)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state pins the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, data['sentiment'], test_size=0.25, random_state=5)

In [None]:
print (X_train)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial Naive Bayes with default hyper-parameters, trained on the count features.
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

In [None]:
from sklearn import metrics
predicted = MNB.predict(X_test)
# sklearn metric functions take (y_true, y_pred), in that order.
accuracy_score = metrics.accuracy_score(Y_test, predicted)

In [None]:
# Report the count-feature accuracy as a percentage with two decimals.
print(f'{accuracy_score * 100:04.2f}%')

In [None]:
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
# Instantiate the other Naive Bayes variants with default hyper-parameters.
CNB = ComplementNB()
GNB = GaussianNB()
BNB = BernoulliNB()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
text_count_2 = tfidf.fit_transform(df['text'])

# Split the TF-IDF features 75/25 with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    text_count_2, data['sentiment'], test_size=0.25, random_state=5
)


def _fit_and_report(model, tag):
    """Fit ``model`` on the TF-IDF training split, print its test accuracy and return it.

    Parameters
    ----------
    model : estimator with ``fit`` / ``predict``
        Classifier to train; it is fitted in place.
    tag : str
        Short suffix used in the printed ``accuracy_score_<tag>`` label.

    Returns
    -------
    float
        Accuracy on ``x_test`` / ``y_test`` as a fraction.
    """
    model.fit(x_train, y_train)
    # sklearn metric functions take (y_true, y_pred), in that order.
    score = metrics.accuracy_score(y_test, model.predict(x_test))
    print('accuracy_score_' + tag + ' = ' + str('{:4.2f}'.format(score * 100)) + '%')
    return score


# These reuse the estimator objects created in earlier cells; fitting them here
# replaces whatever they had learned before.
accuracy_score_mnb = _fit_and_report(MNB, 'mnb')
accuracy_score_bnb = _fit_and_report(BNB, 'bnb')
accuracy_score_cnb = _fit_and_report(CNB, 'cnb')



In [None]:
print(text_count_2)

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.colors import n_colors
from plotly.subplots import make_subplots

In [None]:
def ngram_df(corpus, nrange, n=None):
    """Count n-grams across ``corpus`` and return the most frequent ones.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count over.
    nrange : tuple
        Passed to ``CountVectorizer`` as ``ngram_range``.
    n : int or None
        Number of rows to keep; ``None`` keeps every n-gram.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` (the n-gram) and ``count``, sorted by descending count.
    """
    vectorizer = CountVectorizer(stop_words='english', ngram_range=nrange).fit(corpus)
    # Column sums of the document-term matrix give one total per vocabulary entry.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(ranked[:n], columns=['text', 'count'])
unigram_df = ngram_df(df['text'], (1, 1), 20)
bigram_df = ngram_df(df['text'], (2, 2), 20)
trigram_df = ngram_df(df['text'], (3, 3), 20)

# One horizontal-bar panel per n-gram size, listed in subplot-row order.
ngram_panels = (("Unigram", unigram_df), ("Bigram", bigram_df), ("Trigram", trigram_df))

fig = make_subplots(
    rows=3, cols=1, subplot_titles=("Unigram", "Bigram", 'Trigram'),
    specs=[[{"type": "scatter"}],
           [{"type": "scatter"}],
           [{"type": "scatter"}]
          ])

for panel_row, (panel_name, panel_df) in enumerate(ngram_panels, start=1):
    # Reverse the whole frame once so the y values, x values and bar labels all
    # share one order; each label then shows the count of the bar it sits on.
    reversed_df = panel_df.iloc[::-1]
    fig.add_trace(go.Bar(
        y=reversed_df['text'],
        x=reversed_df['count'],
        marker={'color': "blue"},
        text=reversed_df['count'],
        textposition="outside",
        orientation="h",
        name=panel_name,
    ), row=panel_row, col=1)

fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_layout(title_text='Top N Grams', xaxis_title=" ", yaxis_title=" ",
                  showlegend=False, title_x=0.5, height=1200, template="plotly_dark")
fig.show()

In [None]:
# One frame per sentiment label; reset_index() moves the original row labels
# into an 'index' column and renumbers the rows.
sentiment_labels = data['sentiment']
Positive_tweet = data.loc[sentiment_labels == 'Positive'].reset_index()
Negative_tweet = data.loc[sentiment_labels == 'Negative'].reset_index()
Neutral_tweet = data.loc[sentiment_labels == 'Neutral'].reset_index()

In [None]:
Positive_tweet.head()

In [None]:
import unicodedata

In [None]:
def basic_clean(text):
    """Normalise ``text`` to lower-case ASCII and return its lemmatised, non-stopword words.

    Parameters
    ----------
    text : str
        Text to process (here: many tweets joined into one string).

    Returns
    -------
    list of str
        WordNet-lemmatised words, in order, with punctuation stripped and
        English stopwords removed.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # Set membership is constant-time; the stopword check runs once per word.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # NFKD-normalise, then drop every character that has no ASCII encoding.
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Remove everything that is neither a word character nor whitespace, then split on whitespace.
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [None]:
# Join the tweets themselves, space-separated. Going through str(list) would feed
# the list's repr (quotes, commas and backslash escapes included) into the cleaner.
words = basic_clean(' '.join(Positive_tweet['text'].astype(str)))

In [None]:
print (Positive_tweet['text'].count())

In [None]:
# 30 most common unigrams in `words`, sorted ascending before plotting as horizontal bars.
unigrams_series = pd.Series(nltk.ngrams(words, 1)).value_counts()[:30]
ax = unigrams_series.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
ax.set_title('30 Most Frequently Occurring unigrams')
ax.set_ylabel('Unigram')
ax.set_xlabel('# of Occurrences')
plt.show()



In [None]:
print (unigrams_series)


In [None]:
# Default pandas plot of the unigram counts held in unigrams_series.
unigrams_series.plot()
plt.show()

In [None]:
# 30 most common bigrams in `words`, sorted ascending before plotting as horizontal bars.
bigrams_series = pd.Series(nltk.ngrams(words, 2)).value_counts()[:30]
ax = bigrams_series.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
ax.set_title('30 Most Frequently Occurring Bigrams')
ax.set_ylabel('Bigram')
ax.set_xlabel('# of Occurrences')
plt.show()


In [None]:
# 30 most common trigrams in `words`, sorted ascending before plotting as horizontal bars.
trigrams_series = pd.Series(nltk.ngrams(words, 3)).value_counts()[:30]
ax = trigrams_series.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
ax.set_title('30 Most Frequently Occurring Trigrams')
ax.set_ylabel('trigram')
ax.set_xlabel('# of Occurrences')
plt.show()

In [None]:
# Join the tweets themselves, space-separated. Going through str(list) would feed
# the list's repr (quotes, commas and backslash escapes included) into the cleaner.
nwords = basic_clean(' '.join(Negative_tweet['text'].astype(str)))

In [None]:
# 30 most common unigrams in `nwords`, sorted ascending before plotting as horizontal bars.
unigrams_nseries = pd.Series(nltk.ngrams(nwords, 1)).value_counts()[:30]
ax = unigrams_nseries.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
ax.set_title('30 Most Frequently Occurring unigrams')
ax.set_ylabel('unigram')
ax.set_xlabel('# of Occurrences')
plt.show()

In [None]:
# 30 most common bigrams in `nwords`, sorted ascending before plotting as horizontal bars.
bigrams_nseries = pd.Series(nltk.ngrams(nwords, 2)).value_counts()[:30]
ax = bigrams_nseries.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
ax.set_title('30 Most Frequently Occurring Bigrams')
ax.set_ylabel('Bigram')
ax.set_xlabel('# of Occurrences')
plt.show()


In [None]:
# 20 most common trigrams in `nwords`, sorted ascending before plotting as horizontal bars.
trigrams_nseries = pd.Series(nltk.ngrams(nwords, 3)).value_counts()[:20]
ax = trigrams_nseries.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
ax.set_title('20 Most Frequently Occurring trigrams')
ax.set_ylabel('trigram')
ax.set_xlabel('# of Occurrences')
plt.show()

In [None]:
# Join the tweets themselves, space-separated. Going through str(list) would feed
# the list's repr (quotes, commas and backslash escapes included) into the cleaner.
neuwords = basic_clean(' '.join(Neutral_tweet['text'].astype(str)))

In [None]:
# 30 most common unigrams in `neuwords`, sorted ascending before plotting as horizontal bars.
unigrams_neuseries = pd.Series(nltk.ngrams(neuwords, 1)).value_counts()[:30]
ax = unigrams_neuseries.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
ax.set_title('30 Most Frequently Occurring unigrams')
ax.set_ylabel('unigram')
ax.set_xlabel('# of Occurrences')
plt.show()

In [None]:
# 30 most common bigrams in `neuwords`, sorted ascending before plotting as horizontal bars.
bigrams_neuseries = pd.Series(nltk.ngrams(neuwords, 2)).value_counts()[:30]
ax = bigrams_neuseries.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
ax.set_title('30 Most Frequently Occurring Bigrams')
ax.set_ylabel('Bigram')
ax.set_xlabel('# of Occurrences')
plt.show()

In [None]:
# 30 most common trigrams in `neuwords`, sorted ascending before plotting as horizontal bars.
trigrams_neuseries = pd.Series(nltk.ngrams(neuwords, 3)).value_counts()[:30]
ax = trigrams_neuseries.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
ax.set_title('30 Most Frequently Occurring trigrams')
ax.set_ylabel('trigram')
ax.set_xlabel('# of Occurrences')
plt.show()

In [None]:
import scattertext as st

In [None]:
data.head()

In [None]:
data1=pd.DataFrame()
from IPython.display import IFrame


In [None]:
data1 = data.copy()

# Collapse the sentiment labels into "Negative" versus everything else.
is_negative = data1['sentiment'] == "Negative"
data1['binary_sentiment'] = data1['sentiment'].where(is_negative, "non-negative")
data1['date'] = data1['date'].apply(str)

# Add a 'parse' column holding scattertext's whitespace parse of every tweet.
# Note that this rebinds `data` to the extended frame.
data = data1.assign(
    parse=lambda frame: frame.text.apply(st.whitespace_nlp_with_sentences)
)

# Build the corpus, keep unigrams only, then compact it with AssociationCompactor(2000).
corpus_factory = st.CorpusFromParsedDocuments(
    data, category_col='binary_sentiment', parsed_col='parse'
)
corpus = (
    corpus_factory.build()
    .get_unigram_corpus()
    .compact(st.AssociationCompactor(2000))
)

# Render the interactive explorer to an HTML string, with each tweet's date as metadata.
html = st.produce_scattertext_explorer(
    corpus,
    category='Negative',
    category_name='Negative',
    not_category_name='Neutral/Positive',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=corpus.get_df()['date'],
    transform=st.Scalers.dense_rank,
)



In [None]:
# Write through a context manager so the handle is closed (and flushed) before
# the IFrame loads the file; UTF-8 is stated explicitly instead of relying on
# the platform default encoding.
with open('./demo_compact.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html)
IFrame(src='./demo_compact.html', width=1200, height=700)

In [None]:
docs = Positive_tweet['text'].tolist()
# Note: this rebinds `cv` to a new vectoriser with different settings.
cv = CountVectorizer(
    max_df=0.85,
    stop_words='english',
    max_features=20000,
)
word_count_vector = cv.fit_transform(docs)

In [None]:
word_count_vector

In [None]:
list(cv.vocabulary_.keys())[:10]

In [None]:

from sklearn.feature_extraction.text import TfidfTransformer

# Learn IDF weights (smoothed) from the count matrix built in the previous cells.
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

# Features of positive tweets using TF-IDF:

In [None]:


def sort_coo(coo_matrix):
    """Return the matrix's ``(column, value)`` pairs ordered by value, then column, both descending."""
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=32):
    """Return ``{feature name: score}`` for the first ``topn`` entries of ``sorted_items``.

    Parameters
    ----------
    feature_names : sequence of str
        Vocabulary terms, indexable by column index.
    sorted_items : sequence of (int, float)
        ``(column index, score)`` pairs, expected in descending score order.
    topn : int, default 32
        How many leading pairs to consider.

    Returns
    -------
    dict
        Feature name -> score rounded to 3 decimals, in first-seen order. When
        a feature occurs more than once among the considered pairs, its first
        (highest, given descending input) score is kept, so the result can
        hold fewer than ``topn`` entries.
    """
    results = {}
    for idx, score in sorted_items[:topn]:
        # setdefault keeps the earliest score for a repeated feature instead of
        # letting a later, lower one overwrite it.
        results.setdefault(feature_names[idx], round(score, 3))
    return results


# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# its replacement and fall back only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# TF-IDF weights for every document in `docs`
tf_idf_vector = tfidf_transformer.transform(cv.transform(docs))

# (column, score) pairs over the whole matrix, highest score first
sorted_items = sort_coo(tf_idf_vector.tocoo())

# keep the top 32 entries
keywords = extract_topn_from_vector(feature_names, sorted_items, 32)

print("\n===Keywords===")
for k in keywords:
    print(k, keywords[k])


In [None]:
# Plain lists of the negative and neutral tweet texts.
ndocs=Negative_tweet['text'].tolist()
neudocs=Neutral_tweet['text'].tolist()

In [None]:
# Refit the shared `cv` vectoriser on the neutral tweets; this replaces its previous vocabulary.
word_count_vector_neu=cv.fit_transform(neudocs)

In [None]:

word_count_vector_neu

In [None]:
list(cv.vocabulary_.keys())[:10]

In [None]:
# Refit the shared TF-IDF transformer on the neutral-tweet counts.
tfidf_transformer.fit(word_count_vector_neu)

# Features of neutral tweets using TF-IDF:

In [None]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# its replacement and fall back only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# TF-IDF weights for every document in `neudocs`
tf_idf_vector = tfidf_transformer.transform(cv.transform(neudocs))

# (column, score) pairs over the whole matrix, highest score first
sorted_items = sort_coo(tf_idf_vector.tocoo())

# keep the top 32 entries
keywords = extract_topn_from_vector(feature_names, sorted_items, 32)

print("\n===Keywords===")
for k in keywords:
    print(k, keywords[k])


def sort_coo(coo_matrix):
    """Return the matrix's ``(column, value)`` pairs ordered by value, then column, both descending."""
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=32):
    """Return ``{feature name: score}`` for the first ``topn`` entries of ``sorted_items``.

    Parameters
    ----------
    feature_names : sequence of str
        Vocabulary terms, indexable by column index.
    sorted_items : sequence of (int, float)
        ``(column index, score)`` pairs, expected in descending score order.
    topn : int, default 32
        How many leading pairs to consider.

    Returns
    -------
    dict
        Feature name -> score rounded to 3 decimals, in first-seen order. When
        a feature occurs more than once among the considered pairs, its first
        (highest, given descending input) score is kept, so the result can
        hold fewer than ``topn`` entries.
    """
    results = {}
    for idx, score in sorted_items[:topn]:
        # setdefault keeps the earliest score for a repeated feature instead of
        # letting a later, lower one overwrite it.
        results.setdefault(feature_names[idx], round(score, 3))
    return results

In [None]:
# Refit the shared `cv` vectoriser on the negative tweets, then show ten vocabulary terms.
word_count_vector_n=cv.fit_transform(ndocs)
list(cv.vocabulary_.keys())[:10]

In [None]:
# Refit the shared TF-IDF transformer on the negative-tweet counts.
tfidf_transformer.fit(word_count_vector_n)

# Features of negative tweets using TF-IDF:

In [None]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# its replacement and fall back only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# TF-IDF weights for every document in `ndocs`
tf_idf_vector = tfidf_transformer.transform(cv.transform(ndocs))

# (column, score) pairs over the whole matrix, highest score first
sorted_items = sort_coo(tf_idf_vector.tocoo())

# keep the top 100 entries
keywords = extract_topn_from_vector(feature_names, sorted_items, 100)

print("\n===Keywords===")
for k in keywords:
    print(k, keywords[k])


def sort_coo(coo_matrix):
    """Return the matrix's ``(column, value)`` pairs ordered by value, then column, both descending."""
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=32):
    """Return ``{feature name: score}`` for the first ``topn`` entries of ``sorted_items``.

    Parameters
    ----------
    feature_names : sequence of str
        Vocabulary terms, indexable by column index.
    sorted_items : sequence of (int, float)
        ``(column index, score)`` pairs, expected in descending score order.
    topn : int, default 32
        How many leading pairs to consider.

    Returns
    -------
    dict
        Feature name -> score rounded to 3 decimals, in first-seen order. When
        a feature occurs more than once among the considered pairs, its first
        (highest, given descending input) score is kept, so the result can
        hold fewer than ``topn`` entries.
    """
    results = {}
    for idx, score in sorted_items[:topn]:
        # setdefault keeps the earliest score for a repeated feature instead of
        # letting a later, lower one overwrite it.
        results.setdefault(feature_names[idx], round(score, 3))
    return results