In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Load the Dataset**

In [None]:
# Read the tweet CSV into a DataFrame, print its shape, and preview the first rows.
tweets_csv_path = "/kaggle/input/indianeedsoxygen-tweets/IndiaWantsOxygen.csv"
df_tweets = pd.read_csv(tweets_csv_path)
print(df_tweets.shape)
df_tweets.head()

# **EDA**

## **Top 12 locations with highest number of Tweets**

In [None]:
# The 12 most frequent user locations, most common first; used as the bar order.
top_locations = df_tweets['user_location'].value_counts().head(12).index

plt.figure(figsize=(14,8))
sns.set(style='darkgrid')
sns.countplot(
    y='user_location',
    data=df_tweets,
    order=top_locations,
    palette="Set2",
)
plt.xlabel("Number of Tweets", weight='bold')
plt.ylabel("User Location", weight='bold')
plt.show()

## **% of verified or non-verified accounts among the tweeters**

In [None]:
# Count tweets from verified and non-verified accounts.
# Comparing with .eq() yields a count of 0 for a class that does not occur instead
# of failing, and always produces exactly two wedges, matching the two-element
# `explode` list below. Missing values count toward neither class.
verified_flags = df_tweets['user_verified']
dict_temp = {
    'User Verified': int(verified_flags.eq(True).sum()),
    'User Non-Verified': int(verified_flags.eq(False).sum()),
}

plt.figure(figsize=(7,7))
# Pass concrete lists (not dict views) so sizes and labels are plain sequences.
plt.pie(x=list(dict_temp.values()), labels=list(dict_temp.keys()), autopct='%1.1f%%', shadow=True,
        startangle=0, explode=[0.1, 0])
plt.show()

## **% of tweets by platform (source)**

In [None]:
# Tweet counts per source, most frequent first.
platform_counts = df_tweets['source'].value_counts().to_dict()

# Keep the three most frequent sources and fold every remaining one into 'Others'.
dict_temp = dict(list(platform_counts.items())[:3])
dict_temp['Others'] = 0
dict_temp['Others'] += sum(
    count for platform, count in platform_counts.items() if platform not in dict_temp
)

plt.figure(figsize=(7,7))
plt.pie(
    x=dict_temp.values(),
    labels=dict_temp.keys(),
    autopct='%1.1f%%',
    shadow=True,
    startangle=0,
)
plt.show()

## **Heatmap (correlation) among different features**

In [None]:
# Pairwise correlation between the account-level columns.
account_features = ['user_followers', 'user_friends', 'user_favourites', 'user_verified']
correlations = df_tweets[account_features].corr()

plt.figure(figsize=(7,7))
sns.heatmap(
    correlations,
    center=0,
    annot=True,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
plt.show()

## **Number of Tweets on each date**

In [None]:
from datetime import datetime
# Reduce the timestamp strings to calendar days (the part before the first space).
# The dtype guard keeps this cell re-runnable: once `date` holds datetimes, the
# `.str` accessor is no longer available and the conversion must be skipped.
if not pd.api.types.is_datetime64_any_dtype(df_tweets['date']):
    df_tweets['date'] = pd.to_datetime(df_tweets['date'].str.split(' ', expand=True)[0], format='%Y-%m-%d')

plt.figure(figsize=(14,8))
sns.set(style='darkgrid')
# One `datetime.date` per tweet; rows without a parsed date (NaT) are left out so
# the chronological sort below only compares real dates.
list_ = df_tweets['date'].dropna().dt.date.tolist()
sns.countplot(x=list_, order=sorted(set(list_)), palette="Set2")
plt.xlabel("Date", weight='bold')
plt.ylabel("Number of Tweets", weight='bold')
plt.show()

## **WordCloud of Hashtags**

In [None]:
import itertools
from wordcloud import WordCloud

# Each non-null `hashtags` cell is a stringified list such as "['a', 'b']":
# strip the brackets and quotes, then split on ", " to get the individual tags.
hashtags_per_tweet = (
    df_tweets['hashtags']
    .dropna()
    .str.lstrip('[')
    .str.rstrip(']')
    .str.replace("'", "")
    .str.split(', ')
)
# Flatten the per-tweet lists into one list of tags.
list_hashtags = [tag for tags in hashtags_per_tweet for tag in tags]

hashtag_cloud = WordCloud(
    background_color="black",
    width=800,
    height=500,
    max_font_size=80,
    max_words=100,
    collocations=False,
    colormap='Set2',
)
wordcloud = hashtag_cloud.generate(" ".join(list_hashtags))
plt.figure(figsize=(16,12))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## **WordCloud of Tweets**

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import string

# Tokens to discard outright: English stopwords plus single punctuation characters.
list_stopwords = set(stopwords.words('english') + list(punctuation))
# Translation table that deletes punctuation characters; built once and reused
# for every token.
punctuation_table = str.maketrans('', '', string.punctuation)


def _clean_tokens(text):
    """Tokenize one lower-cased tweet and return its cleaned tokens.

    Steps, in order: tokenize with `word_tokenize`, drop tokens found in
    `list_stopwords`, delete punctuation characters inside the remaining
    tokens, and keep only tokens longer than one character.
    """
    kept = (token for token in word_tokenize(text) if token not in list_stopwords)
    stripped = (token.translate(punctuation_table) for token in kept)
    return [token for token in stripped if len(token) > 1]


df_temp = df_tweets[['text']].copy()
# Missing tweets become empty strings so they tokenize to an empty list.
df_temp['text'] = df_temp['text'].fillna('').str.lower().apply(_clean_tokens)

In [None]:
# Flatten the per-tweet token lists into a single list of tokens.
list_text = [token for tokens in df_temp['text'] for token in tokens]

plt.figure(figsize=(16,12))
tweet_cloud = WordCloud(
    background_color="black",
    width=800,
    height=500,
    max_font_size=80,
    max_words=100,
    collocations=False,
    colormap='Set2',
)
wordcloud = tweet_cloud.generate(' '.join(list_text))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# **Feel free to <span style="color:red"> Upvote </span> and give <span style="color:blue"> Feedback </span>.**