In [None]:
import tweepy
from textblob import TextBlob

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
import  json
from collections import Counter

In [None]:
# Load the raw Pfizer vaccine tweet export into a DataFrame.
tweets_csv_path = '../input/pfizer-vaccine-tweets/vaccination_tweets.csv'
df = pd.read_csv(tweets_csv_path)

In [None]:
# Recode boolean values as integers across the whole frame (False -> 0, True -> 1).
# Later cells split the tweets by comparing `user_verified` against 0 and 1.
# Both calls mutate `df` in place.
df.replace(False,0,inplace=True)
df.replace(True,1,inplace=True)

In [None]:
# Peek at two random rows. A fixed seed keeps the displayed rows identical
# across "Restart Kernel -> Run All" executions.
df.sample(2, random_state=42)

In [None]:
# Keep only the columns used by the analysis. The explicit .copy() makes the
# result an independent frame, so the later `df['tidy_tweets'] = ...`
# assignments do not raise SettingWithCopyWarning.
df = df[['id', 'text', 'user_verified']].copy()
df.head()

# Removing Twitter Handles (@user)

In [None]:
def remove_pattern(text, pattern):
    """Return ``text`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    text : str
        The string to clean.
    pattern : str
        A regular expression; each non-overlapping match is deleted.

    Returns
    -------
    str
        ``text`` with all matches of ``pattern`` replaced by the empty string.
    """
    # Substitute with the pattern itself rather than re-using each matched
    # string as a new regex: matched text may contain regex metacharacters,
    # and a short match that is a prefix of a longer one (e.g. '@ab' and
    # '@abc') would otherwise leave the longer one only partially removed.
    return re.sub(pattern, '', text)

In [None]:
# Strip every @handle from the raw tweet text into a new `tidy_tweets` column.
# The pattern is a raw string: '\w' inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
df['tidy_tweets'] = np.vectorize(remove_pattern)(df['text'], r'@[\w]*')

df.head(10)

# Removing Punctuation, Numbers, and Special Characters

In [None]:
# Replace everything except ASCII letters and '#' with a space.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, in which case the character class would never match and the
# tweets would be left untouched.
df['tidy_tweets'] = df['tidy_tweets'].str.replace('[^a-zA-Z#]', ' ', regex=True)
df.head(10)

# Removing Short Words 

In [None]:
def _drop_short_words(tweet):
    """Keep only the whitespace-separated words longer than three characters."""
    kept_words = [word for word in tweet.split() if len(word) > 3]
    return ' '.join(kept_words)


df['tidy_tweets'] = df['tidy_tweets'].apply(_drop_short_words)
df.head(10)

# Tokenization
*Tokenization is the process of splitting a string of text into tokens*

In [None]:
# Split each cleaned tweet on whitespace, giving one list of tokens per tweet.
tokined_tweet = df['tidy_tweets'].apply(str.split)
tokined_tweet.head(10)

# Stemming

Stemming is a rule-based process of stripping suffixes ('ing', 'es', etc.) from a word.

For example, 'play', 'player', 'played', 'plays', and 'playing' are different variations of 'play'.

In [None]:
from nltk import PorterStemmer

ps = PorterStemmer()


def _stem_tokens(tokens):
    """Apply the Porter stemmer to every token of one tweet."""
    return [ps.stem(token) for token in tokens]


# One list of stemmed tokens per tweet.
tokenized_tweet = tokined_tweet.apply(_stem_tokens)

tokenized_tweet.head(10)

*Stitching these tokens back together*

In [None]:
# Join each tweet's stemmed tokens back into a single string.
# The separator must be a space: joining on '' fuses all words of a tweet into
# one unbroken string, which destroys the word boundaries that the word clouds,
# the hashtag extraction and the CountVectorizer below depend on.
# Using .apply (instead of an index-based loop) also avoids relying on the
# Series having a default 0..n-1 index, and leaves `tokenized_tweet` untouched
# so the cell can be re-run safely.
df['tidy_tweets'] = tokenized_tweet.apply(' '.join)
df.head()

In [None]:
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import urllib
import requests

In [None]:
# Concatenate the cleaned tweets of non-verified accounts (user_verified == 0)
# into one corpus string for the word cloud.
# `combine` is never defined in this notebook; the data lives in `df`.
# Tweets are joined with a space so that the last word of one tweet does not
# fuse with the first word of the next.
non_verified_user = ' '.join(df.loc[df['user_verified'] == 0, 'tidy_tweets'])

In [None]:
# Download the Twitter-logo image used as the word-cloud mask.
# A timeout prevents the cell from hanging forever on a dead connection, and
# raise_for_status() surfaces HTTP errors instead of letting PIL fail on an
# HTML error page.
mask_response = requests.get('http://clipart-library.com/image_gallery2/Twitter-PNG-Image.png', stream=True, timeout=30)
mask_response.raise_for_status()
# Have the raw stream undo any gzip/deflate transfer encoding before PIL reads it.
mask_response.raw.decode_content = True
Mask = np.array(Image.open(mask_response.raw))

# Colour function that derives word colours from the mask image.
image_colors = ImageColorGenerator(Mask)

# NOTE(review): the wordcloud docs state that width/height are ignored when a
# mask is supplied; they are kept here unchanged.
wc = WordCloud(background_color='black', height=1500, width=5000, mask=Mask).generate(non_verified_user)

In [None]:
# Draw the word cloud, recoloured with the mask image's colours, without axes.
fig, ax = plt.subplots(figsize=(10, 20))
recolored_cloud = wc.recolor(color_func=image_colors)
ax.imshow(recolored_cloud, interpolation='hamming')
ax.axis('off')
plt.show()

In [None]:
# Concatenate the cleaned tweets of verified accounts (user_verified == 1)
# into one corpus string for the word cloud.
# `combine` is never defined in this notebook; the data lives in `df`.
# Tweets are joined with a space so that the last word of one tweet does not
# fuse with the first word of the next.
verified_user = ' '.join(df.loc[df['user_verified'] == 1, 'tidy_tweets'])

In [None]:
# Download the Twitter-logo image used as the word-cloud mask.
# A timeout prevents the cell from hanging forever on a dead connection, and
# raise_for_status() surfaces HTTP errors instead of letting PIL fail on an
# HTML error page.
mask_response = requests.get('http://clipart-library.com/image_gallery2/Twitter-PNG-Image.png', stream=True, timeout=30)
mask_response.raise_for_status()
# Have the raw stream undo any gzip/deflate transfer encoding before PIL reads it.
mask_response.raw.decode_content = True
Mask = np.array(Image.open(mask_response.raw))

# Colour function that derives word colours from the mask image.
image_colors = ImageColorGenerator(Mask)

# NOTE(review): the wordcloud docs state that width/height are ignored when a
# mask is supplied; they are kept here unchanged.
wc = WordCloud(background_color='black', height=1500, width=4000, mask=Mask).generate(verified_user)

In [None]:
# Draw the word cloud, recoloured with the mask image's colours, without axes.
fig, ax = plt.subplots(figsize=(10, 20))
recolored_cloud = wc.recolor(color_func=image_colors)
ax.imshow(recolored_cloud, interpolation='hamming')
ax.axis('off')
plt.show()

# Impact of Hashtag 

In [None]:
def Hastags_Extract(x):
    """Collect the hashtags found in each string of ``x``.

    Returns a list with one entry per input string; each entry is the list of
    hashtag words (without the leading '#') found in that string, in order of
    appearance. Strings without hashtags contribute an empty list.
    """
    return [re.findall(r'#(\w+)', tweet) for tweet in x]

In [None]:
# Hashtags used by verified accounts (user_verified == 1), one list per tweet.
verified_rows = df['user_verified'] == 1
ht_positive = Hastags_Extract(df.loc[verified_rows, 'tidy_tweets'])

ht_positive

# List Unnesting

In [None]:
# Flatten the per-tweet hashtag lists into one flat list.
# A comprehension runs in linear time; sum(lists, []) rebuilds the accumulated
# list on every addition and is quadratic in the number of hashtags.
ht_positive_unnest = [tag for tweet_tags in ht_positive for tag in tweet_tags]

In [None]:
# Display the flattened list of hashtags taken from verified users' tweets.
ht_positive_unnest

# A nested list of all hashtags from non_verified_user

In [None]:
# Hashtags used by non-verified accounts (user_verified == 0), one list per tweet.
non_verified_rows = df['user_verified'] == 0
ht_negative = Hastags_Extract(df.loc[non_verified_rows, 'tidy_tweets'])

ht_negative

In [None]:
# Flatten the per-tweet hashtag lists into one flat list.
# A comprehension runs in linear time; sum(lists, []) rebuilds the accumulated
# list on every addition and is quadratic in the number of hashtags.
ht_negative_unnest = [tag for tweet_tags in ht_negative for tag in tweet_tags]
ht_negative_unnest


# Counting Frequency of words by verified_user

In [None]:
# Tally how many times each hashtag occurs in the verified users' flat list.
word_freq_positive = nltk.FreqDist(ht_positive_unnest)

# Display the resulting frequency distribution.
word_freq_positive

# DataFrame of the most frequently used hashtags by verified_user

In [None]:
# Tabulate the hashtag counts: one row per hashtag with its frequency.
positive_pairs = list(word_freq_positive.items())
df_positive = pd.DataFrame(positive_pairs, columns=['Hashtags', 'Count'])
df_positive.head(10)

# Barplot for the 20 most frequent words used for hashtags

In [None]:
import seaborn as sns
# The 20 hashtags with the highest counts among verified users.
df_positive_plot = df_positive.nlargest(n=20, columns='Count')

# Horizontal bar chart: hashtag on the y-axis, frequency on the x-axis.
sns.barplot(x='Count', y='Hashtags', data=df_positive_plot)
sns.despine()

# Similarly for non_verified_user

In [None]:
# Tally how many times each hashtag occurs in the non-verified users' flat list.
word_freq_negative = nltk.FreqDist(ht_negative_unnest)

# Tabulate the hashtag counts: one row per hashtag with its frequency.
negative_pairs = list(word_freq_negative.items())
df_negative = pd.DataFrame(negative_pairs, columns=['Hashtags', 'Count'])
df_negative.head(10)

In [None]:
# The 20 hashtags with the highest counts among non-verified users.
df_negative_plot = df_negative.nlargest(n=20, columns='Count')

# Horizontal bar chart: hashtag on the y-axis, frequency on the x-axis.
sns.barplot(x='Count', y='Hashtags', data=df_negative_plot)
sns.despine()

# Using CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: drops English stop words, ignores terms appearing in
# more than 90% of tweets or in fewer than 2 tweets, and keeps at most the 325
# most frequent remaining terms.
bow_vectorizer = CountVectorizer(max_df = 0.90, min_df = 2, max_features=325, stop_words='english')

# `combine` is never defined in this notebook (NameError); the cleaned tweets
# live in `df`.
bow = bow_vectorizer.fit_transform(df['tidy_tweets'])
df_bow = pd.DataFrame(bow.todense())
df_bow

In [None]:
# Alias the bag-of-words matrix under a training-set name (no copy is made).
train_bow = bow
# Display the matrix in dense form.
train_bow.todense()

In [None]:
# References
# Towards Data Science Newsletter
