# Importing Libraries

In [None]:
!pip install tweepy

In [None]:
import pandas as pd
import numpy as np
import tweepy
from textblob import TextBlob
from wordcloud import WordCloud
import re
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import plotly.graph_objects as go
import plotly.express as px
from collections import Counter
import string
from nltk.corpus import stopwords

# Preprocessing

In [None]:
# Load the raw Twitter dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/twitter-data-for-data-analysis/Test Data.csv')
df  # last expression of the cell: displays the loaded frame

### Cleaning the text

In [None]:
def cleantext(txt):
    """Strip Twitter-specific noise from a raw tweet.

    Removes @mentions, '#' symbols (the hashtag word itself is kept),
    the retweet marker ``RT`` and http/https links.

    Args:
        txt: Raw tweet text.

    Returns:
        The tweet text with the noise removed. Whitespace left behind by
        the removals is not collapsed.
    """
    # Handles may contain underscores; without '_' in the class a mention
    # such as "@john_doe" would leave "_doe" behind.
    txt = re.sub(r'@[a-zA-Z0-9_]+', '', txt)
    txt = re.sub(r'#', '', txt)
    # Only drop a standalone "RT" token. Without the word boundaries the
    # pattern also eats the tail of words such as "SMART " or "ALERT ".
    txt = re.sub(r'\bRT\b[\s]+', '', txt)
    txt = re.sub(r'https?:\/\/\S+', '', txt)
    return txt

In [None]:
# Run the tweet cleaner over every row of the text column.
df['text'] = df['text'].apply(cleantext)
df['text']

In [None]:
# Lower-case the text so the same word in different cases is counted once.
df['text'] = df['text'].str.lower()
df['text'].head()

# Knowing About the Data

In [None]:
# Empty working frame; the tweet text and sentiment columns are added below.
tweets = pd.DataFrame()

In [None]:
# Seed the working frame with the cleaned, lower-cased tweet text.
tweets['tweet'] = df['text']
tweets  # last expression of the cell: displays the frame

In [None]:
# functions that return the subjectivity and polarity of a text

def getSubjectivity(text):
    """Return the subjectivity score TextBlob assigns to ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

def getPolarity(text):
    """Return the polarity score TextBlob assigns to ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

##### How does the TextBlob sentiment algorithm work? By default, it calculates the average polarity and subjectivity over each word in a given text, using a dictionary of adjectives and their hand-tagged scores.

In [None]:
# Score every tweet with TextBlob and store both measures as new columns.
tweet_text = tweets['tweet']
tweets['subjectivity'] = tweet_text.apply(getSubjectivity)
tweets['polarity'] = tweet_text.apply(getPolarity)
tweets

### word cloud of all tweets

In [None]:
# Word cloud built from the text of all tweets joined into one string.
all_words = ' '.join(tweets['tweet'])
cloud_options = dict(width=500, height=300, random_state=42, max_font_size=150)
word_cloud = WordCloud(**cloud_options).generate(all_words)

# Render the cloud as an image without axis ticks.
plt.imshow(word_cloud)
plt.axis('off')
plt.show()

##### So we can clearly see that ```covaxin``` is the word people are tweeting about the most, and that the tweets are generally about the coronavirus vaccine.

In [None]:
# function for negative, neutral, and positive analysis
def getSentiment(score):
    """Map a polarity score to a sentiment label.

    Returns 'Negative' for scores below zero, 'Neutral' for a score of
    exactly zero, and 'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'
    

In [None]:
# Turn each polarity score into a Negative / Neutral / Positive label.
tweets['sentiment'] = tweets['polarity'].map(getSentiment)
tweets

### 10 most +ve and -ve tweets

In [None]:
# Order the tweets from the highest polarity to the lowest.
sorted_df = tweets.sort_values('polarity', ascending=False)
sorted_df

In [None]:
# The ten highest-polarity tweets, numbered from 1.
for rank, tweet_text in enumerate(sorted_df['tweet'].head(10), start=1):
    print(str(rank) + '] ' + tweet_text)
    print()

In [None]:
# The ten lowest-polarity tweets, most negative first, numbered from 1.
for rank, tweet_text in enumerate(sorted_df['tweet'].tail(10)[::-1], start=1):
    print(str(rank) + '] ' + tweet_text)
    print()

### plot of polarity and subjectivity

In [None]:
# Scatter plot with one point per tweet: polarity on x, subjectivity on y.
plt.figure(figsize=(12, 10))
plt.scatter(tweets['polarity'], tweets['subjectivity'])

# Title spelling fixed ("Anlysis" -> "Analysis") to match the pie chart title.
plt.title('Sentiment Analysis')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.show()

In [None]:
# Pie chart of how many tweets carry each sentiment label.
# An alternative view is a bar chart of
# tweets.groupby('sentiment')['tweet'].count().
plt.figure(figsize=(12, 10))
plt.title('Sentiment Analysis')
tweets['sentiment'].value_counts().plot.pie()
plt.show()

#### With the help of the above graphs, we can easily see that most of the tweets are either *Neutral* or *Positive*.

# Most Common Words

In [None]:
# removing words with length less than 4
def removeWord(txt):
    """Drop punctuation and words shorter than four characters.

    Args:
        txt: Tweet text.

    Returns:
        The text with "…", "’" and ASCII punctuation deleted, tabs turned
        into spaces, and every word of one to three characters removed.
        The whitespace around removed words is left in place.
    """
    txt = re.sub(r'[…’]', '', txt)
    # A tab separates words, so replace it with a space rather than delete
    # it; deleting it glued the neighbouring words into one bogus token.
    # The tab is spelled as an escape so it is visible in the source.
    txt = re.sub(r'\t', ' ', txt)
    txt = re.sub('[%s]' % re.escape(string.punctuation), '', txt)
    txt = re.sub(r'\b\w{1,3}\b', '', txt)
    return txt

In [None]:
# Strip punctuation and short words, then tokenise each tweet on whitespace.
tweets['tweet'] = tweets['tweet'].apply(removeWord)

tweets['temp_words'] = tweets['tweet'].apply(lambda x: str(x).split())
# Count every token across all tweets and keep the ten most frequent.
top = Counter(item for sublist in tweets['temp_words'] for item in sublist)
# Name the columns in the constructor so the frame keeps them even when there
# are no words at all (assigning .columns on an empty frame raises).
temp = pd.DataFrame(top.most_common(10), columns=['Common_words', 'count'])
temp.style.background_gradient(cmap='Blues')

### remove stopwords

In [None]:
# Stop-word sets keyed by language, filled on first use so the NLTK corpus
# is read once instead of once per tweet.
_STOPWORD_SETS = {}


def _stopword_set(language):
    """Return NLTK's stop words for ``language`` as a frozenset, loaded once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def remove_stopwords(txt):
    """Filter English stop words out of a list of tokens.

    Args:
        txt: Iterable of word tokens (one tweet's words).

    Returns:
        A new list with the tokens that are not English stop words, in
        their original order.
    """
    # Set membership is O(1); the original scanned a list for every token.
    cached_words = _stopword_set('english')
    return [word for word in txt if word not in cached_words]

In [None]:
# Drop English stop words from every tweet's token list.
tweets['temp_words'] = tweets['temp_words'].apply(remove_stopwords)

In [None]:
# Recount the tokens now that stop words are gone; keep the ten most frequent.
top = Counter(item for sublist in tweets['temp_words'] for item in sublist)
# Name the columns in the constructor so the frame keeps them even when there
# are no words at all (assigning .columns on an empty frame raises).
temp = pd.DataFrame(top.most_common(10), columns=['Common_words', 'count'])
temp.style.background_gradient(cmap='Blues')

In [None]:
# Treemap in which each tile is one of the most common words, sized by count.
fig = px.treemap(
    temp,
    path=['Common_words'],
    values='count',
    title='Tree of Most Common Words',
)
fig.show()

## Sentiment Wise Most Common Words

In [None]:
# One frame per sentiment label, selected with a boolean mask on the label.
sentiment_label = tweets['sentiment']
Positive = tweets.loc[sentiment_label == 'Positive']
Negative = tweets.loc[sentiment_label == 'Negative']
Neutral = tweets.loc[sentiment_label == 'Neutral']

### most common positive words

In [None]:
# Ten most frequent words across the positive tweets, shown as a treemap.
pos_counter = Counter(item for sublist in Positive['temp_words'] for item in sublist)
# Name the columns in the constructor so the frame keeps them even if there
# are no positive tweets (assigning .columns on an empty frame raises).
pos_temp = pd.DataFrame(pos_counter.most_common(10), columns=['common_words', 'count'])
fig = px.treemap(pos_temp, path=['common_words'], values='count', title='Most Common +ve Words')
fig.show()

### most common negative words

In [None]:
# Ten most frequent words across the negative tweets, shown as a treemap.
neg_counter = Counter(item for sublist in Negative['temp_words'] for item in sublist)
# Name the columns in the constructor so the frame keeps them even if there
# are no negative tweets (assigning .columns on an empty frame raises).
neg_temp = pd.DataFrame(neg_counter.most_common(10), columns=['common_words', 'count'])
fig = px.treemap(neg_temp, path=['common_words'], values='count', title='Most Common -ve Words')
fig.show()

### most common neutral words

In [None]:
# Ten most frequent words across the neutral tweets, shown as a treemap.
neut_counter = Counter(item for sublist in Neutral['temp_words'] for item in sublist)
# Name the columns in the constructor so the frame keeps them even if there
# are no neutral tweets (assigning .columns on an empty frame raises).
neut_temp = pd.DataFrame(neut_counter.most_common(10), columns=['common_words', 'count'])
fig = px.treemap(neut_temp, path=['common_words'], values='count', title='Most Common Neutral Words')
fig.show()

#### So there is an ambiguity: some words that are common in one sentiment are also common in the others. For clarity, let's find the unique words for each sentiment.

## Sentiment Wise Most Common Unique Words

### unique positive sentiment

In [None]:
# Prune pos_counter in place down to the words that appear in neither the
# negative nor the neutral counter, then chart the ten most frequent.
shared_words = [
    word for word in pos_counter
    if word in neg_counter or word in neut_counter
]
for word in shared_words:
    del pos_counter[word]
unique_positive = pd.DataFrame(pos_counter.most_common(10), columns=['words', 'count'])
donut = go.Pie(labels=unique_positive['words'], values=unique_positive['count'], hole=.5)
fig = go.Figure(data=[donut])
fig.show()

### unique negative sentiment

In [None]:
# Words used in negative tweets and in no other sentiment class.
# The comparison vocabulary is rebuilt from the Positive/Neutral frames:
# pos_counter is pruned in place earlier in the notebook, so checking against
# it would let words shared with positive tweets through as "unique".
other_words = {
    word
    for frame in (Positive, Neutral)
    for words in frame['temp_words']
    for word in words
}
for key in list(neg_counter):
    if key in other_words:
        del neg_counter[key]
unique_negative = pd.DataFrame(neg_counter.most_common(10), columns = ['words','count'])
fig = go.Figure(data=[go.Pie(labels=unique_negative.words, values = unique_negative['count'], hole=.5)])
fig.show()

### unique neutral sentiment

In [None]:
# Words used in neutral tweets and in no other sentiment class.
# The comparison vocabulary is rebuilt from the Positive/Negative frames:
# pos_counter and neg_counter are pruned in place earlier in the notebook, so
# checking against them would remove nothing and every neutral word would
# count as "unique".
other_words = {
    word
    for frame in (Positive, Negative)
    for words in frame['temp_words']
    for word in words
}
for key in list(neut_counter):
    if key in other_words:
        del neut_counter[key]
unique_neutral = pd.DataFrame(neut_counter.most_common(10), columns = ['words','count'])
fig = go.Figure(data=[go.Pie(labels=unique_neutral.words, values = unique_neutral['count'], hole=.5)])
fig.show()