# IPL 2020 Tweets Sentiment Analysis

### Installing and importing Required libraries

In [None]:
!pip install vaderSentiment


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Taking a glance at the data

In [None]:
# Load the IPL 2020 tweets CSV from the Kaggle input mount and preview the first rows.
df = pd.read_csv('/kaggle/input/ipl2020-tweets/IPL2020_Tweets.csv')
df.head()

### Finding more details about the data set

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

We can see that some tweets don't include any hashtags. For some tweets the user location, user description, or tweet source is also missing, but the text column, which is the one required for sentiment analysis, doesn't contain any null values.

### Creating a copy of text so that we can make changes and perform EDA

In [None]:
# Duplicate the tweet text into `senttext`: later cells rewrite `text` in place,
# while the sentiment cell reads `senttext`.
df['senttext'] = df['text']


### Converting all the data to lowercase

In [None]:
# Lowercase every column so the location/hashtag matching below is case-insensitive.
# astype(str) also turns missing values into the literal string 'nan', which the
# later cells filter on.
# `senttext` is restored to its original casing afterwards: it is the copy fed to
# VADER, which treats ALL-CAPS words as emphasis, so lowercasing it would change
# the sentiment scores.
original_senttext = df['senttext'].astype(str)
df = df.apply(lambda column: column.astype(str).str.lower())
df['senttext'] = original_senttext

In [None]:
# Leave out rows with no location (missing values were stringified to 'nan')
# and chart the 20 most frequent user locations.
loc_df = df.loc[df['user_location'] != 'nan']
loc_df['user_location'].value_counts().nlargest(20).plot.bar(figsize=(20, 8))

### Indian cities dominate the top 20 user locations as expected. 'global' and 'worldwide' locations also make it in top 20.

In [None]:
# Count the tweets whose user location mentions each metro city.
# str.contains flags a row once no matter how often the city is repeated in it
# (str.count would add up every repetition), and the \b word boundaries stop a
# city name from matching inside a longer word.
indian_metros = ['mumbai', 'bangalore', 'delhi', 'kolkata', 'chennai', 'ahmedabad', 'hyderabad']
indian_cities = {
    city: df.user_location.str.contains(rf'\b{re.escape(city)}\b', regex=True).sum()
    for city in indian_metros
}


In [None]:
# Bar chart of the per-city counts: dict keys on the x axis, counts as bar heights.
city_names, city_counts = zip(*indian_cities.items())
plt.figure(figsize=(20, 8))
plt.bar(city_names, city_counts)
plt.show()

### Users in Indian metro cities that have IPL teams are the most active. Other cities such as Ahmedabad have far fewer tweets

In [None]:
# Count the tweets whose user location mentions each country, then chart them.
# Matching is done on whole words/phrases (\b boundaries): a plain substring
# search would also count 'uk' inside 'ukraine' or 'usa' inside 'thousand'.
# str.contains also counts each tweet at most once per country.
countries = ['usa', 'uk', 'united arab emirates', 'canada', 'australia', 'south africa', 'pakistan']
ipl_countries = {
    country: df.user_location.str.contains(rf'\b{re.escape(country)}\b', regex=True).sum()
    for country in countries
}

plt.figure(figsize=(20, 8))
plt.bar(*zip(*ipl_countries.items()))

plt.show()

### Comparing tweets from countries with top cricket teams and countries with a large Indian population.

In [None]:
# Verified vs. unverified accounts, as a bar chart with horizontal tick labels.
df['user_verified'].value_counts().plot.bar(rot=0)

### Most of the users are not verified on Twitter

In [None]:
# The ten most common tweet sources (client applications).
df['source'].value_counts().nlargest(10).plot.bar(rot=0, figsize=(20, 8))

### Users used the Twitter app on Android the most. iPhone users come after website/app users

In [None]:
# Ignore tweets without hashtags (stringified to 'nan') and chart the ten most
# frequent hashtag values.
hashtag_df = df.loc[df['hashtags'] != 'nan']
hashtag_df['hashtags'].value_counts().nlargest(10).plot.bar(rot=0, figsize=(20, 8))

### People used the tournament hashtag the most, with some specific match based and team hashtags also making an appearance

## Removing all the stop words, mentions, hashtags and URLs to create a word cloud

In [None]:
from nltk.corpus import stopwords

# Drop English stop words from the tweet text.
stop_words = stopwords.words('english')
# Membership tests against a set are O(1); scanning the list for every word of
# every tweet is needlessly slow on a large corpus.
stop_word_set = set(stop_words)
df.text = df.text.apply(
    lambda tweet: ' '.join(word for word in tweet.split() if word not in stop_word_set)
)

In [None]:
# Remove @mentions and #hashtags, then collapse the leftover whitespace.
# \w is used instead of [A-Za-z0-9] because Twitter handles and hashtags may
# contain underscores; the narrower class stopped at the first underscore and
# left fragments such as '_kohli' behind.
df.text = df.text.apply(lambda tweet: ' '.join(re.sub(r'[@#]\w+', ' ', tweet).split()))

In [None]:
# Strip URLs first, then punctuation, then collapse whitespace.
# Order matters: once '.', ':' and '-' are replaced by spaces, a link such as
# 'https://t.co/abc' is broken into 'https //t co/abc', and a URL pattern applied
# afterwards can only remove the leading 'https', leaving the rest in the text.
# The patterns are raw strings so the backslash escapes reach the regex engine
# unchanged instead of being (invalid) Python string escapes.
def _strip_urls_and_punctuation(tweet):
    """Return `tweet` without links and without the characters . , ! ? : ; - ="""
    without_urls = re.sub(r'http\S+', ' ', tweet)
    without_punctuation = re.sub(r'[.,!?:;\-=]', ' ', without_urls)
    return ' '.join(without_punctuation.split())


df.text = df.text.apply(_strip_urls_and_punctuation)

In [None]:
# Delete every token that starts with 'http' and squeeze runs of whitespace.
df.text = df.text.map(lambda tweet: ' '.join(re.sub(r'http\S+', '', tweet).split()))

In [None]:
# Build the cloud from the full cleaned corpus. str(df['text']) would only give
# the truncated Series repr (a handful of rows plus index numbers and the
# 'Name: text, ...' footer), not the tweets themselves.
corpus = ' '.join(df['text'].astype(str))

wordcloud = WordCloud(
    background_color='white',
    colormap='Blues',
    max_words=200,
    max_font_size=40,
    random_state=42,  # fixed seed so the layout is reproducible
).generate(corpus)

plt.imshow(wordcloud)
plt.axis('off')
plt.show()

### Time wise plot of all tweets

In [None]:
# Number of tweets per calendar day.
df['tweet_date'] = pd.to_datetime(df['date']).dt.date

# rename_axis / reset_index(name=...) label the two columns explicitly. Renaming
# the defaults ('index' and 'tweet_date') only works on pandas < 2.0: from 2.0 on
# value_counts() yields 'tweet_date' and 'count', so that rename produced two
# 'count' columns and no 'date' column.
tweet_date = (
    df['tweet_date']
    .value_counts()
    .rename_axis('date')
    .reset_index(name='count')
)
tweet_date['date'] = pd.to_datetime(tweet_date['date'])
tweet_date = tweet_date.sort_values('date', ascending=False)


In [None]:
# Line chart of daily tweet volume. The axis labels describe the data rather
# than the placeholder texts 'xlabel' / 'ylabel'.
plt.figure(figsize=(50, 20))
plt.xlabel('Date', fontsize=18)
plt.ylabel('Number of tweets', fontsize=16)
plt.plot(tweet_date['date'], tweet_date['count'])
plt.show()

### Tweets started flowing in when the tournament announcement was made, and saw a steep increase once the tournament started

### Extracting the emojis from the tweets

In [None]:
# Match any character outside the Basic Multilingual Plane (U+10000 and above).
# Python 3 strings are always full Unicode, so the surrogate-pair fallback that
# narrow (UCS-2) builds needed is not required.
# NOTE(review): symbols below U+10000 are not matched by this range.
emoji_pattern = re.compile('[\U00010000-\U0010ffff]')

# findall collects every match in a tweet. search() stops at the first one, so
# tweets containing several emojis were under-counted in the frequency ranking.
emojis = []
for tweet in df.text:
    emojis.extend(emoji_pattern.findall(tweet))

### Top 10 Emojis used in tweets

In [None]:
# Rank the collected emojis by frequency and show the ten most common.
dfe = pd.DataFrame({'text': emojis})
emoji_tokens = ' '.join(dfe['text']).lower().split()
pd.Series(emoji_tokens).value_counts().head(10)


### Using Word2vec to find out similar words

In [None]:
# Set values for various parameters
import gensim
from gensim.models import word2vec

# Hyper-parameters for the Word2Vec model
num_features = 400    # Word vector dimensionality
min_word_count = 5    # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# One token list per tweet.
wt = [tweet.split() for tweet in df.text]

# gensim 4.0 renamed the `size` argument to `vector_size`; pick the keyword that
# matches the installed version so the cell runs on both 3.x and 4.x.
gensim_major = int(gensim.__version__.split('.')[0])
size_keyword = 'vector_size' if gensim_major >= 4 else 'size'

print("Training model...")
wv_model = word2vec.Word2Vec(
    wt,
    workers=num_workers,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
    **{size_keyword: num_features},
)

# NOTE(review): init_sims is reported as deprecated on gensim 4 (it is no longer
# needed there) - confirm it can be dropped once gensim 3 support is not required.
wv_model.init_sims(replace=True)

In [None]:
# Query through the model's KeyedVectors (`.wv`): the model-level most_similar
# shortcut was removed in gensim 4, while `.wv.most_similar` exists in 3.x and 4.x.
wv_model.wv.most_similar('hardik', topn=15)

### We can see words like 'kung' and 'fu', as he is commonly known as Kung Fu Pandya. 'Brothers' also pops up, as he and his brother play for the same team

In [None]:
# Query through the model's KeyedVectors (`.wv`): the model-level most_similar
# shortcut was removed in gensim 4, while `.wv.most_similar` exists in 3.x and 4.x.
wv_model.wv.most_similar('yorker', topn=15)

### Words like peach, toe length are getting correctly associated whereas bouncer is the exact opposite

In [None]:
# Query through the model's KeyedVectors (`.wv`): the model-level most_similar
# shortcut was removed in gensim 4, while `.wv.most_similar` exists in 3.x and 4.x.
wv_model.wv.most_similar('virat', topn=15)

### The word 'king' is similar to 'virat', as Virat Kohli is also known as King Kohli. ABD, who is a teammate, also appears

## Now applying the VADER sentiment analyzer

In [None]:
# Score every tweet with VADER; each entry of `sentiment_score` is the dict
# returned by polarity_scores for that tweet's `senttext`.
analyser = SentimentIntensityAnalyzer()

df['sentiment_score'] = [analyser.polarity_scores(str(tweet)) for tweet in df['senttext']]


In [None]:
def sentiment_func(sentiment):
    """Label a polarity-score dict as 'positive', 'negative' or 'neutral'.

    The label is the component with the largest share among 'pos', 'neg' and
    'neu'. Whenever one component is above 0.5 this gives the same answer as a
    "> 0.5" threshold test, but it also labels tweets where no component
    crosses 0.5 (e.g. pos=0.45, neu=0.35, neg=0.20), which would otherwise end
    up as None and silently drop out of any value_counts().

    Args:
        sentiment: mapping with numeric 'pos', 'neg' and 'neu' entries; any
            other keys (such as 'compound') are ignored.

    Returns:
        'positive', 'negative' or 'neutral'. A tie for the largest share is
        reported as 'neutral', since no polarity clearly dominates.

    Raises:
        KeyError: if 'pos', 'neg' or 'neu' is missing from `sentiment`.
    """
    labels = {'pos': 'positive', 'neg': 'negative', 'neu': 'neutral'}
    shares = {key: sentiment[key] for key in labels}

    top_share = max(shares.values())
    leaders = [key for key, share in shares.items() if share == top_share]

    # More than one component shares the maximum: no clear winner.
    if len(leaders) > 1:
        return 'neutral'
    return labels[leaders[0]]

# Turn each score dict into its text label.
df['sentiment'] = df['sentiment_score'].map(sentiment_func)

In [None]:
# Number of tweets per sentiment label.
df['sentiment'].value_counts().plot.bar(rot=0)

### Most of the tweets are neutral. This may be because most tweets just contain score updates or match updates.

### The number of positive tweets is higher than the number of negative ones. It seems people were very happy that the tournament took place at such a difficult time, and were excited to see their favorite cricketers back on the pitch.

# PLEASE UPVOTE ^^^