![](https://indiawantscrypto.net/_next/static/images/indiawantscrypto-banner-1a65d8b23cbce894669d71c75bddf5f7.png)

### This notebook analyses the tweets posted with the trending #IndiaWantsCrypto hashtag and classifies them into positive, negative and neutral tweets.

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:gold; border:0' role="tab" aria-controls="home" color=black><center>Quick navigation</center></h3>

* [1. Required Libraries](#1)
* [2. Dataset Quick Overview](#2)
* [3. User Statistics](#3)
* [4. Hashtag analysis](#4)
* [5. Tweets text analysis](#5)
* [6. Classification of tweets](#6)
* [7. Data set and References](#7)

    Kindly, Upvote the notebook!

<a id="1"></a>
<h2 style='background:gold; border:0; color:black'><center>Required Libraries</center></h2>

In [None]:
import numpy as np 
import pandas as pd 
import os
import itertools

#plots
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.colors import n_colors
from plotly.subplots import make_subplots

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import CountVectorizer

from PIL import Image
from nltk.corpus import stopwords
stop=set(stopwords.words('english'))
from nltk.util import ngrams

import missingno as mno

import re
from collections import Counter

import nltk
from nltk.corpus import stopwords

import requests
import json

import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})

import warnings
warnings.filterwarnings("ignore")

# Walk the Kaggle input directory and print the full path of every file found,
# so the available input files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        

# TextBlob - Python library for processing textual data
from textblob import TextBlob

# Load the #IndiaWantsCrypto tweets CSV into the main DataFrame used by the
# rest of the notebook.
tweets_df = pd.read_csv("../input/india-wants-crypto-tweets/IndiaWantsCrypto.csv")

<a id="2"></a>
<h2 style='background:gold; border:0; color:black'><center>Dataset Quick Overview</center></h2>

## Let's take a quick peek at how the data looks!

In [None]:
# Print the column names, non-null counts and dtypes of the loaded tweets.
tweets_df.info()

### The data consists of 13 features, 3 integer value features, 2 binary value features, and 8 string features

<a id="3"></a>
<h2 style='background:gold; border:0; color:black'><center>User statistics</center></h2>

## Most frequent users

In [None]:
def most_frequent_values(data):
    """Summarise the most frequent value of every column in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One column per input column and four rows:
        ``Total`` (non-null count), ``Most frequent item``, ``Frequence``
        (how often that item occurs) and ``Percent from total``.
        A column without any non-null value reports ``NaN`` as its item,
        ``0`` as its frequency and ``NaN`` as its percentage.
    """
    total = data.count()
    tt = pd.DataFrame(total)
    tt.columns = ['Total']
    items = []
    vals = []
    for col in data.columns:
        # Count once per column; value_counts() is sorted most frequent first.
        counts = data[col].value_counts()
        if counts.empty:
            # All-null column: there is no "most frequent" value to report.
            items.append(np.nan)
            vals.append(0)
        else:
            items.append(counts.index[0])
            vals.append(counts.iloc[0])
    tt['Most frequent item'] = items
    tt['Frequence'] = vals
    # Align the frequencies with the per-column totals before dividing.
    share = pd.Series(vals, index=total.index) / total * 100
    tt['Percent from total'] = np.round(share, 3)
    return np.transpose(tt)

# Display the per-column "most frequent value" summary for the tweets.
most_frequent_values(tweets_df)

### Looks like the user named Crypto India has posted the highest number of tweets with the IndiaWantsCrypto hashtag!

In [None]:
# The account repeats the same message while tagging different people,
# presumably for wider reach; print its first two tweets to compare them.
crypto_india_texts = tweets_df.loc[tweets_df['user_name'] == 'Crypto India', 'text']
for position in (0, 1):
    print(crypto_india_texts.iloc[position])

### Digging into the distribution of the user_name feature!

In [None]:
def plot_count(feature, title, df, size=1, ordered=True):
    """Draw a count plot of ``df[feature]`` with a percentage label on each bar.

    Parameters
    ----------
    feature : str
        Name of the column to count.
    title : str
        Human-readable name inserted into the plot title.
    df : pandas.DataFrame
        Frame holding ``feature``.
    size : int or float, default 1
        Width multiplier; the figure is ``4 * size`` inches wide and 4 high.
        Tick labels are rotated when ``size > 2``.
    ordered : bool, default True
        If true, show only the 20 most frequent values, most frequent first.
    """
    f, ax = plt.subplots(1, 1, figsize=(4*size, 4))
    total = float(len(df))
    # order=None lets seaborn pick its default category order.
    order = df[feature].value_counts().index[:20] if ordered else None
    # Pass the data by keyword (positional data vectors are not accepted by
    # current seaborn) and draw on the axes created above.
    g = sns.countplot(x=df[feature], order=order, palette='Set3', ax=ax)
    g.set_title("Number and percentage of {}".format(title))
    if size > 2:
        plt.xticks(rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        # Percentages are relative to all rows of df, including nulls.
        ax.text(p.get_x() + p.get_width() / 2.,
                height + 3,
                '{:1.2f}%'.format(100 * height / total),
                ha="center")
    plt.show()

# Count plot of the most frequent user names (wide figure, size=4).
plot_count("user_name", "User name", tweets_df,4)

### Let's visualize the top 20 users by number of tweets

In [None]:
# Number of tweets per user, most active users first.
ds = tweets_df['user_name'].value_counts().reset_index()
ds.columns = ['user_name', 'tweets_count']
ds = ds.sort_values(['tweets_count'], ascending=False)

# Attach each user's tweet total to every tweet. Mapping (instead of merging)
# keeps this cell idempotent: re-running it overwrites the column rather than
# creating tweets_count_x / tweets_count_y, and rows with a missing user_name
# are kept (with a NaN count) instead of being dropped by an inner join.
tweets_df['tweets_count'] = tweets_df['user_name'].map(
    ds.set_index('user_name')['tweets_count']
)

top_users = ds.head(20)
# seaborn's keyword is `orient`; `orientation` is a matplotlib bar argument.
fig = sns.barplot(
    x=top_users["tweets_count"],
    y=top_users["user_name"],
    orient='h'
).set_title('Top 20 users by number of tweets')

### Similarly, let's see how the user_location feature is distributed!

In [None]:
# Count plot of the most frequent user locations (wide figure, size=4).
plot_count("user_location", "User location", tweets_df,4)


<a id="4"></a>
<h2 style='background:gold; border:0; color:black'><center>Hashtag analysis</center></h2>

### Total number of tweets for users and number of hashtags in every tweet

In [None]:
# Treat a missing hashtag list as the empty list literal.
tweets_df['hashtags'] = tweets_df['hashtags'].fillna('[]')

# Hashtags per tweet = number of comma-separated entries, with the empty
# list '[]' counted as zero.
tweets_df['hashtags_count'] = [
    0 if raw_tags == '[]' else len(raw_tags.split(','))
    for raw_tags in tweets_df['hashtags']
]

# Relate the hashtags used in a tweet to how many tweets its author posted.
hashtag_scatter = sns.scatterplot(
    x=tweets_df['hashtags_count'],
    y=tweets_df['tweets_count']
)
fig = hashtag_scatter.set_title('Total number of tweets for users and number of hashtags in every tweet')

* As the number of tweets per user increases from 0 to 50, there is a substantial decrease in the number of hashtags used per tweet
* Users who post fewer than 10 tweets use anywhere from 0 to a maximum of 23 hashtags!

## Number of tweets on #Indiawantscrypto hashtag each day!

In [None]:
# Calendar date (no time component) of every tweet.
tweets_df['tweet_date'] = pd.to_datetime(tweets_df['date']).dt.date

# Tweets per day. Naming the index and the counts explicitly keeps the
# resulting columns ('date', 'count') the same on pandas 1.x and 2.x, whose
# value_counts() output is labelled differently.
tweet_date = (
    tweets_df['tweet_date']
    .value_counts()
    .rename_axis('date')
    .reset_index(name='count')
)
tweet_date['date'] = pd.to_datetime(tweet_date['date'])
tweet_date = tweet_date.sort_values('date', ascending=False)

# Earliest and latest day covered by the data. min()/max() are used because
# indexing a sorted Series with [0] looks up the row *labelled* 0, not the
# first element.
first_day = tweet_date['date'].min()
last_day = tweet_date['date'].max()

fig = go.Figure(go.Scatter(x=tweet_date['date'],
                           y=tweet_date['count'],
                           mode='markers+lines',
                           name="Submissions",
                           marker_color='dodgerblue'))

fig.update_layout(
    title_text='Tweets per Day : ({} - {})'.format(first_day.strftime("%d/%m/%Y"),
                                                   last_day.strftime("%d/%m/%Y")),
    template="plotly_dark",
    title_x=0.5)

fig.show()

### The highest number of tweets were recorded on Feb 13, 2021!

## Tweet distribution - Hourly

In [None]:
# Hour of day at which each tweet was posted.
tweets_df['hour'] = pd.to_datetime(tweets_df['date']).dt.hour

# Tweets per hour, sorted by hour so the bars read chronologically
# instead of in descending-count order.
ds = tweets_df['hour'].value_counts().sort_index().reset_index()
ds.columns = ['hour', 'count']
ds['hour'] = 'Hour ' + ds['hour'].astype(str)

# seaborn's keyword is `orient`; `orientation` is a matplotlib bar argument.
fig = sns.barplot(
    x=ds["hour"],
    y=ds["count"],
    orient='v',
).set_title('Tweets distribution over hours')
plt.xticks(rotation='vertical')

## Top 10 hashtags used!

In [None]:
def split_hashtags(x):
    """Turn a stringified list such as ``"['a', 'b']"`` into its comma-separated parts.

    Square brackets are removed and the remainder is split on commas; quotes
    and surrounding spaces are left in place for the caller to clean up.
    """
    drop_brackets = str.maketrans('', '', '[]')
    return str(x).translate(drop_brackets).split(',')

# Build the one-row-per-hashtag view in its own frame. Exploding tweets_df
# itself would leave every tweet duplicated once per hashtag, which skews
# any later per-tweet statistic computed from tweets_df.
hashtags_df = tweets_df[['hashtags']].copy()
hashtags_df['hashtag'] = hashtags_df['hashtags'].apply(split_hashtags)
hashtags_df = hashtags_df.explode('hashtag')

# Normalise each tag: lower-case, and drop the quotes and spaces left over
# from the stringified list.
hashtags_df['hashtag'] = (
    hashtags_df['hashtag']
    .astype(str)
    .str.lower()
    .str.replace("'", '', regex=False)
    .str.replace(" ", '', regex=False)
)
hashtags_df.loc[hashtags_df['hashtag'] == '', 'hashtag'] = 'NO HASHTAG'

# Frequency of every hashtag, most used first.
ds = hashtags_df['hashtag'].value_counts().reset_index()
ds.columns = ['hashtag', 'count']
ds = ds.sort_values(['count'], ascending=False)

top_hashtags = ds.head(10)
# seaborn's keyword is `orient`; `orientation` is a matplotlib bar argument.
fig = sns.barplot(
    x=top_hashtags["count"],
    y=top_hashtags['hashtag'],
    orient='h',
).set_title('Top 10 hashtags')

<a id="5"></a>
<h2 style='background:gold; border:0; color:black'><center>Tweets text analysis</center></h2>

In [None]:
# Stop words for the word clouds. Kept under its own name so it does not
# rebind `stopwords`, which the import cell uses for the nltk corpus reader.
WORDCLOUD_STOPWORDS = set(STOPWORDS)


def show_wordcloud(data, title=None):
    """Render a word cloud of the words found in ``data``.

    Parameters
    ----------
    data : str or iterable
        Either a single text, or an iterable (e.g. a pandas Series) of items.
        Null items are skipped and the remaining ones are converted with
        ``str`` and joined with spaces. Joining is required because ``str()``
        of a long Series is only its truncated printed form, not its content.
    title : str, optional
        Title drawn above the cloud.
    """
    if isinstance(data, str):
        text = data
    else:
        text = ' '.join(pd.Series(data).dropna().astype(str))

    wordcloud = WordCloud(
        background_color='white',
        stopwords=WORDCLOUD_STOPWORDS,
        max_words=50,
        max_font_size=40,
        scale=5,
        random_state=1  # fixed seed so the layout is reproducible
    ).generate(text)

    fig = plt.figure(1, figsize=(10, 10))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

# Let's identify the prevalent words in tweets!!

In [None]:
# Word cloud built from the tweet text column.
show_wordcloud(tweets_df['text'], title = 'Prevalent words in tweets')

## Let's identify the prevalent words in tweets from India!

In [None]:
# Keep only tweets whose user_location is exactly "India".
india_df = tweets_df[tweets_df['user_location'] == "India"]
show_wordcloud(india_df['text'], title='Prevalent words in tweets from India')

## Let's identify the prevalent words in tweets from New Delhi

In [None]:
# Keep only tweets whose user_location is exactly "New Delhi, India".
# (The frame is named after the location it actually holds.)
delhi_df = tweets_df.loc[tweets_df.user_location=="New Delhi, India"]
show_wordcloud(delhi_df['text'], title = 'Prevalent words in tweets from New delhi')

## Identifying the prevalent words in tweets from Mumbai using wordcloud

In [None]:
# Keep only tweets whose user_location is exactly "Mumbai, India".
# (The frame is named after the location it actually holds.)
mumbai_df = tweets_df.loc[tweets_df.user_location=="Mumbai, India"]
show_wordcloud(mumbai_df['text'], title = 'Prevalent words in tweets from Mumbai')

# Let's identify the prevalent words in hashtags!

In [None]:
# Missing hashtag lists become a one-item placeholder list; literal "\N"
# markers are stripped from the raw strings.
tweets_df['hashtags'] = tweets_df['hashtags'].fillna("['None']")
tweets_df['hashtags'] = tweets_df['hashtags'].apply(lambda x: x.replace('\\N',''))

# Extract the bare tags from the stringified list. Splitting on ',' alone
# leaves brackets, quotes and spaces attached ("['btc'", " 'btc']"), which
# makes the same tag look like several distinct ones.
tweets_df['hashtags_individual'] = tweets_df['hashtags'].apply(
    lambda x: re.findall(r'\w+', x)
)

# Unique hashtags, compared case-insensitively.
all_hashtags = {
    tag.lower()
    for tag in itertools.chain.from_iterable(tweets_df['hashtags_individual'])
}
print(f"There are totally: {len(all_hashtags)}")


show_wordcloud(tweets_df['hashtags_individual'], title = 'Prevalent words in hashtags')

## Refining the text (an important step) and visualizing its length distribution with a violin plot

In [None]:
def remove_tag(string):
    """Return ``string`` with every ``<...>`` tag (shortest match) removed."""
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', string)
def remove_mention(text):
    """Return ``text`` with every ``@handle`` (``@`` plus word characters) removed."""
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', text)
def remove_hash(text):
    """Return ``text`` with every ``#hashtag`` (``#`` plus word characters) removed."""
    hashtag_pattern = re.compile(r'#\w+')
    return hashtag_pattern.sub('', text)

def remove_newline(string):
    """Replace each run of line breaks in ``string`` with a single space.

    A space is substituted (rather than nothing) so that the last word of one
    line and the first word of the next are not fused into a single token.
    Both ``\\n`` and ``\\r`` are treated as line breaks.
    """
    return re.sub(r'[\r\n]+', ' ', string)
def remove_url(string):
    """Return ``string`` with every ``http://`` / ``https://`` URL removed.

    The pattern is written as a raw string so that ``\\(`` and ``\\)`` reach the
    regex engine as-is instead of being invalid Python string escapes.
    """
    return re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '',
        string,
    )
def remove_number(text):
    """Return ``text`` with every run of ASCII digits removed."""
    digit_run = re.compile(r'[0-9]+')
    return digit_run.sub('', text)
def remove_punct(text):
    """Return ``text`` with the punctuation characters listed in the pattern removed.

    The character class is quantified with ``+`` so only real runs of
    punctuation are matched; ``*`` also matched the empty string at every
    position, which did needless work without changing the result.
    """
    return re.sub(r'[!"\$%&\'()*+,\-.\/:;=#@?\[\\\]^_`{|}~]+', '', text)
def text_strip(string):
    """Trim ``string`` and collapse every run of two or more whitespace characters to one space.

    The pattern is a raw string so ``\\s`` is passed to the regex engine rather
    than being an invalid Python string escape.
    """
    return re.sub(r'\s{2,}', ' ', string.strip())
def remove_thi_amp_ha_words(string):
    """Replace the standalone words ``amp``, ``thi`` and ``ha`` with a single space."""
    leftover_word = re.compile(r'\b(?:amp|thi|ha)\b')
    return leftover_word.sub(' ', string)

In [None]:
# Cleaning steps, applied to every tweet in exactly this order.
cleaning_steps = (
    remove_tag,
    remove_mention,
    remove_hash,
    remove_newline,
    remove_url,
    remove_number,
    remove_punct,
    remove_thi_amp_ha_words,
    text_strip,
)

# Lower-case first, then force every value to str (missing text becomes the
# string 'nan') so the regex helpers always receive a string.
refined_text = tweets_df['text'].str.lower().map(str)
for cleaning_step in cleaning_steps:
    refined_text = refined_text.map(cleaning_step)
tweets_df['refine_text'] = refined_text

# Length of each cleaned tweet, measured in whitespace-separated words.
tweets_df['text_length'] = tweets_df['refine_text'].str.split().map(len)

In [None]:
# Violin (with embedded box plot and mean line) of the per-tweet word counts.
length_violin = go.Violin(
    y=tweets_df['text_length'],
    x0='Tweet Text Length',
    box_visible=True,
    meanline_visible=True,
    line_color='black',
    fillcolor='royalblue',
    opacity=0.6,
)
fig = go.Figure(data=length_violin)

fig.update_layout(
    title="Distribution of Text length",
    template='ggplot2',
    yaxis_zeroline=False,
)
fig.show()

* Average length of the tweet: 13.41
* Median length of the  tweet:13
* Interquartile range lies between: 7 and 18
* Min: 1
* Max: 55

## Listing below the top N-gram word sequences used in the #IndiaWantsCrypto tweets

In [None]:
def ngram_df(corpus, nrange, n=None):
    """Return the most frequent n-grams of ``corpus`` as a DataFrame.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams in.
    nrange : tuple of int
        ``(min_n, max_n)`` passed to ``CountVectorizer`` as ``ngram_range``.
    n : int, optional
        Number of rows to keep; ``None`` keeps every n-gram.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``count``, sorted by ``count`` descending.
        English stop words are excluded by the vectorizer.
    """
    vectorizer = CountVectorizer(stop_words='english', ngram_range=nrange)
    term_matrix = vectorizer.fit_transform(corpus)
    # Column sums = corpus-wide occurrences of each vocabulary entry.
    totals = np.asarray(term_matrix.sum(axis=0)).ravel()
    ranked = sorted(
        ((term, totals[column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(ranked[:n], columns=['text', 'count'])
# Top 20 unigrams, bigrams and trigrams of the cleaned tweets.
unigram_df = ngram_df(tweets_df['refine_text'], (1, 1), 20)
bigram_df = ngram_df(tweets_df['refine_text'], (2, 2), 20)
trigram_df = ngram_df(tweets_df['refine_text'], (3, 3), 20)

fig = make_subplots(
    rows=3, cols=1, subplot_titles=("Unigram", "Bigram", 'Trigram'),
    specs=[[{"type": "scatter"}],
           [{"type": "scatter"}],
           [{"type": "scatter"}]
          ])

# One horizontal bar chart per n-gram size, one subplot row each.
ngram_panels = [("Unigram", unigram_df), ("Bigram", bigram_df), ("Trigram", trigram_df)]
for panel_row, (panel_name, panel_df) in enumerate(ngram_panels, start=1):
    # Reverse once so the most frequent n-gram ends up at the top, and take
    # x, y and the bar labels from the same reversed frame so every label
    # matches its own bar.
    ranked = panel_df.iloc[::-1]
    fig.add_trace(go.Bar(
        y=ranked['text'],
        x=ranked['count'],
        marker={'color': "blue"},
        text=ranked['count'],
        textposition="outside",
        orientation="h",
        name=panel_name,
    ), row=panel_row, col=1)

fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_layout(title_text='Top N Grams', xaxis_title=" ", yaxis_title=" ",
                  showlegend=False, title_x=0.5, height=1200, template="plotly_dark")
fig.show()

* Crypto, India and Ban are the most used unigrams
* Crypto ban, Just emailed and crypto regulations are the most used bi-grams
* Crypto regulations india,regulate dont ban,support progressive crypto are the most used tri-grams

<a id="6"></a>
<h2 style='background:gold; border:0; color:black'><center>Classification of tweets</center></h2>

### Classifying the refined tweets based on the polarity of the tweet

In [None]:
def getTextSubjectivity(txt):
    """Return the TextBlob subjectivity score of ``txt``."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.subjectivity

def getTextPolarity(txt):
    """Return the TextBlob polarity score of ``txt``."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.polarity


# Score every cleaned tweet for subjectivity and polarity.
cleaned_tweets = tweets_df['refine_text']
tweets_df['Subjectivity'] = cleaned_tweets.map(getTextSubjectivity)
tweets_df['Polarity'] = cleaned_tweets.map(getTextPolarity)


In [None]:

# Negative / neutral / positive labelling of a polarity score.
def getTextAnalysis(a):
    """Map a polarity score to ``"Negative"``, ``"Neutral"`` or ``"Positive"``.

    Exactly zero is neutral, anything below zero is negative and every other
    value is positive.
    """
    if a == 0:
        return "Neutral"
    return "Negative" if a < 0 else "Positive"
# Label every tweet from its polarity score.
tweets_df['Score'] = tweets_df['Polarity'].map(getTextAnalysis)


In [None]:
# Tweets per sentiment label, in alphabetical label order.
score_counts = tweets_df['Score'].value_counts().sort_index()
labels = score_counts.index.values
values = score_counts.values

# x and y must be passed by keyword; current seaborn rejects them positionally.
sns.barplot(x=labels, y=values)

# Share of each label among all rows; a label that never occurs prints 0.0.
total_tweets = tweets_df.shape[0]
for heading, score in (
    ('Percentage of positive tweets:', 'Positive'),
    ('Percentage of Negative tweets:', 'Negative'),
    ('Percentage of Neutral tweets:', 'Neutral'),
):
    print(heading, round(int(score_counts.get(score, 0)) / total_tweets * 100, 2))

#### Prevalent words in the positive tweets

In [None]:
# Tweets labelled "Positive" by the polarity score.
pos_df = tweets_df[tweets_df['Score'] == "Positive"]
show_wordcloud(pos_df['refine_text'], title='Prevalent words in the positive tweets')

#### Prevalent words in the negative tweets

In [None]:
# Tweets labelled "Negative" by the polarity score.
neg_df = tweets_df[tweets_df['Score'] == "Negative"]
show_wordcloud(neg_df['refine_text'], title='Prevalent words in the Negative tweets')

#### Prevalent words in the Neutral tweets

In [None]:
# Tweets labelled "Neutral" by the polarity score.
neutral_df = tweets_df[tweets_df['Score'] == "Neutral"]
show_wordcloud(neutral_df['refine_text'], title='Prevalent words in the Neutral tweets')

#### Some of the negative tweets

In [None]:
# Cleaned text of the tweets sharing the lowest polarity score. Series.min()
# is vectorised and ignores NaN, unlike the builtin min() over a Series.
lowest_polarity = tweets_df['Polarity'].min()
tweets_df.loc[tweets_df['Polarity'] == lowest_polarity, 'refine_text']

### Some of the positive tweets

In [None]:
# Cleaned text of the tweets sharing the highest polarity score; the filter
# is computed once and the first and last matches are printed. Series.max()
# is vectorised and ignores NaN, unlike the builtin max() over a Series.
highest_polarity = tweets_df['Polarity'].max()
most_positive_texts = tweets_df.loc[tweets_df['Polarity'] == highest_polarity, 'refine_text']
print(most_positive_texts.iloc[0])
print(most_positive_texts.iloc[-1])


<h2 style='background:gold; border:0; color:black'><center> The data set and the notebook will be updated on a daily basis! So stay tuned!</center></h2>

<a id='7'></a>
## Data set details:

https://www.kaggle.com/kaushiksuresh147/india-wants-crypto-tweets

## References:

* https://www.kaggle.com/kaushiksuresh147/ipl2020-twitter-analysis-eda

<h2 style='background:black; border:0; color:gold'><center>Kindly upvote the notebook and the dataset!</center></h2>