![](https://pbs.twimg.com/media/Ezx0LvZUcAQ6nuH.jpg)

## Installing Necessary Packages

In [None]:
!pip -q install bs4

## Libraries Import

In [None]:
import re
import spacy
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from tqdm import tqdm
import warnings
# Install a global "ignore" filter: every warning raised after this line is suppressed.
warnings.filterwarnings('ignore')
# Apply matplotlib's 'ggplot' style sheet to the figures drawn below.
plt.style.use('ggplot')

## Reading The Data & Preparing It

In [None]:
# Load the tweets and split the raw `date` column into separate `Date` and `Time` columns.
from dateutil import parser

dataset = pd.read_csv('../input/indianeedsoxygen-tweets/IndiaWantsOxygen.csv', engine='python')

# Parse the raw timestamp strings once and reuse the result for both derived columns.
timestamps = pd.to_datetime(dataset['date'])

# Calendar day of each tweet, stored as datetime.date objects.
# (The earlier date -> str -> dateutil -> date round trip produced exactly these
# values, at the cost of one Python-level parse per row.)
dataset['Date'] = timestamps.dt.date

# Time of day of each tweet.
# NOTE(review): parser.parse() on a bare "HH:MM:SS" string fills the missing date
# fields from the current day, so this column holds full datetimes whose date part
# is the day the notebook was run. Kept unchanged because later cells may rely on
# the datetime dtype -- confirm whether plain `timestamps.dt.time` is wanted instead.
dataset['Time'] = timestamps.dt.time.apply(lambda x: parser.parse(str(x)))

# The raw column is fully represented by Date/Time now; drop it without mutating in place.
dataset = dataset.drop(columns=['date'])
dataset.head(5)

## Number of Tweets Each Day

In [None]:
# Figure defaults used by this and the following plots.
plt.rcParams.update({'figure.figsize': [10, 6], 'figure.dpi': 90})

sns.set(style='darkgrid')

# One entry per tweet; the bars are laid out in chronological order.
dates = dataset['Date'].tolist()
day_order = sorted(set(dates))

ax = sns.countplot(x=dates, order=day_order, palette="Set2")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Tweets")
ax.set_title('Number of Tweets Each Day')
plt.xticks(rotation=50)
plt.show()

## Frequency of Tweets Each Day

In [None]:
# Count the tweets posted on each calendar day (index: Date, one row per day).
daily_counts = dataset.groupby('Date')['Date'].count()
data = daily_counts.to_frame(name='Number of Tweets Each Day')

# Draw the daily totals as a single red line.
sns.lineplot(
    data=data,
    x=data.index,
    y="Number of Tweets Each Day",
    color='red',
    linewidth=1.5,
)
plt.title('Frequency of Tweets Each Day')
plt.show()

## Top Thirteen Locations With Max Number Of Tweets

In [None]:
# The thirteen most frequent user-entered locations, most common first.
top_locations = dataset['user_location'].value_counts().index[:13]

ax = sns.countplot(data=dataset, x='user_location', order=top_locations)
ax.set_ylabel("Number of Tweets")
ax.set_xlabel("User Location")
# Slanted, right-aligned tick labels so long location names stay readable.
plt.xticks(rotation=50, horizontalalignment='right', x=1.0)
ax.set_title('Top Thirteen Locations With Max Number Of Tweets')
plt.show()

## Popular HashTags

In [None]:
# Five most frequent values of the `hashtags` column, computed once.
top_hashtags = dataset['hashtags'].value_counts().head()

# Pass only the aggregated vectors. Supplying `data=dataset` alongside 5-element
# x/y vectors is rejected by recent seaborn releases, which require vectors to
# match the length of `data`.
sns.catplot(kind="bar", x=top_hashtags.index.tolist(), y=top_hashtags.to_numpy())
plt.ylabel("Number of Tweets")
plt.xlabel("Popular Hashtags")
# Slanted, right-aligned tick labels so long hashtag lists stay readable.
plt.xticks(rotation=50, horizontalalignment='right', x=1.0)
plt.title('Popular HashTags')
plt.show()

## Top Platform Used To Make Tweet

In [None]:
# Tweet counts per client application, most common first (computed once).
source_counts = dataset['source'].value_counts()

# Keep the three most common sources and fold every remaining one into 'Others'.
platform = source_counts.iloc[:3].to_dict()
platform['Others'] = int(source_counts.iloc[3:].sum())
# Full source -> count mapping, kept under its original name for any later cell that reads it.
dict_ = source_counts.to_dict()

plt.pie(
    x=list(platform.values()),
    labels=list(platform.keys()),
    autopct='%1.2f%%',  # percentage labels with two decimals
    shadow=False,
    startangle=0,
)
plt.legend(bbox_to_anchor=(.9, .9))
plt.title('Top Platform Used To Make Tweet', x=0.5, y=0.95)
plt.show()

## Text Cleaning

In [None]:
!pip -q install contractions
import contractions
import unicodedata

In [None]:
def remove_accented(x):
  """Return `x` reduced to plain ASCII.

  The string is NFKD-normalised so accented letters split into a base letter
  plus combining marks; encoding to ASCII with errors ignored then drops the
  marks and every other non-ASCII character.
  """
  decomposed = unicodedata.normalize('NFKD', x)
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

In [None]:
def get_clean(X):
    """Normalise one raw tweet into lowercase, ASCII-only, space-separated words.

    Steps are ordered so each one still sees the characters it depends on:

    1. HTML tags/entities are removed while ``<``, ``>``, ``&`` and ``;`` are intact.
    2. URLs are removed while ``://`` and ``.`` are intact.
    3. Contractions are expanded while apostrophes are intact, then the text
       is lowercased (expansion can introduce capitals).
    4. Separators (``\\``, ``_``, ``.``) become spaces; ``:`` and ``#`` are dropped.
    5. Accents and other non-ASCII characters are removed via ``remove_accented``.
    6. Remaining punctuation is stripped.
    7. Any character repeated three or more times in a row collapses to one.
    8. Whitespace is squeezed to single spaces and trimmed.

    Stop words are not removed.

    Parameters
    ----------
    X : object
        Raw tweet text. Non-string values are converted with ``str``;
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``''`` for a missing value.
    """
    # Missing cells would otherwise become the literal word "nan" / "none".
    if X is None or (isinstance(X, float) and X != X):
        return ''
    X = str(X)

    # 1. Strip markup and decode entities such as "&amp;" before punctuation is touched.
    X = BeautifulSoup(X, 'lxml').get_text()

    # 2. Remove URLs; a space is substituted so neighbouring words cannot fuse.
    X = re.sub(
        r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
        ' ',
        X,
        flags=re.IGNORECASE,
    )

    # 3. Expand contractions while apostrophes still exist, then lowercase.
    X = contractions.fix(X).lower()

    # 4. Turn separator characters into spaces; drop ':' and the hashtag marker.
    X = X.replace('\\', ' ').replace('_', ' ').replace('.', ' ').replace(':', '')
    X = X.replace('#', "")

    # 5. Reduce to ASCII.
    X = remove_accented(X)

    # 6. Drop everything that is neither a word character nor whitespace.
    X = re.sub(r'[^\w\s]+', '', X)

    # 7. Collapse runs of 3+ identical characters down to one.
    X = re.sub("(.)\\1{2,}", "\\1", X)

    # 8. Normalise whitespace.
    return ' '.join(X.split())

In [None]:
# Replace every tweet with its cleaned form (get_clean is applied element-wise).
dataset['text'] = dataset['text'].apply(get_clean)

## Word Cloud Representation

In [None]:
# Build the corpus from every cleaned tweet. str(Series) would only yield the
# truncated printed preview (a few head/tail rows plus index labels and the
# dtype footer), not the full text.
all_text = ' '.join(dataset['text'].astype(str))

word_cloud = WordCloud(width=700, height=600, max_font_size=180).generate(all_text)
plt.imshow(word_cloud)
plt.title('Word Cloud Representation')
plt.axis('off')
plt.show()