In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and echo the full path of every file found in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Imports and one-time setup for the whole notebook.
# NOTE(review): pandas and numpy were already imported in the first cell, and
# `plt` is imported twice in this cell; the repeats are harmless but redundant.
import pandas as pd
import numpy as np
from IPython.display import display

import matplotlib.pyplot as plt 
import re
import string

# NLTK pieces. In the cells visible here only `stopwords` and
# `SentimentIntensityAnalyzer` are actually used.
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# NOTE(review): star import hides where names come from; consider importing
# only the helpers that are needed (none appear to be used in the visible cells).
from nltk.sentiment.util import *
# Fetch the NLTK resources named 'stopwords' and 'vader_lexicon'
# (presumably required by stopwords.words(...) and the VADER analyzer below).
nltk.download('stopwords')
nltk.download('vader_lexicon')
from collections import Counter

from matplotlib import pyplot as plt
from matplotlib import ticker
import seaborn as sns
import plotly.express as px

# Apply seaborn's "darkgrid" style to subsequent plots.
sns.set(style="darkgrid")

# **Importing the Dataset**


In [None]:
# `os` is already imported in the first cell; re-importing here is harmless.
import os
# Show the entries directly under the Kaggle input directory.
os.listdir('/kaggle/input/')

In [None]:
# Load the tweet CSV from the read-only "../input/" directory into `df`.
df = pd.read_csv("../input/nigeria-endsars-tweets/NigeriaEndSars data.csv")
# Preview the first five rows.
df.head(5)

Let's check the shape of the dataframe.

In [None]:
# (number of rows, number of columns) of the freshly loaded frame.
df.shape

Let's keep only the columns we need.

In [None]:
# Keep only the columns used by the rest of the notebook.
needed_columns = ['username', 'date', 'content']
# .copy() makes `df` an independent frame instead of a slice of the loaded data,
# so the later column assignments (df.content = ..., df['content'] = ...) do not
# raise SettingWithCopyWarning or risk writing into a temporary object.
df = df[needed_columns].copy()
df.head()

Picking out the tweet texts

In [None]:
# Work on the tweet text column by itself.
contents = df['content']
contents

Removing URLs from tweets

In [None]:
def remove_url(x):
    """Return str(x) with every 'http' + following non-space characters removed."""
    return re.sub(r'http\S+', '', str(x))


contents_lr = contents.apply(remove_url)
contents_lr

Converting all tweets to lowercase

In [None]:
def to_lower(x):
    """Return the lowercase form of a single tweet string."""
    return x.lower()


contents_lr_lc = contents_lr.apply(to_lower)
contents_lr_lc

Removing punctuation

In [None]:
# Translation table that deletes every character in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)


def remove_puncs(x):
    """Return the tweet with all ASCII punctuation characters deleted."""
    return x.translate(punctuation_table)


contents_lr_lc_np = contents_lr_lc.apply(remove_puncs)
contents_lr_lc_np

Removing stopwords

In [None]:
# Extra stop words specific to this corpus. Entries must be spelled the way tokens
# look *after* the earlier steps (lowercased, punctuation stripped): the previous
# 'U' and '#epitwitter' entries could never match a token, so they are replaced by
# their normalized forms.
more_words = ['say', 'going', 'like', 'u', 'hey', 'epitwitter', 'amp']
stop_words = set(stopwords.words('english'))
stop_words.update(more_words)

# Drop every whitespace-separated token that is a stop word, then re-join with
# single spaces (this also collapses runs of whitespace).
remove_words = lambda x: ' '.join(word for word in x.split() if word not in stop_words)

# The stray chained alias (`... = r = ...`) was removed; nothing in the visible
# cells reads `r`.
contents_lr_lc_np_ns = contents_lr_lc_np.apply(remove_words)
contents_lr_lc_np_ns

In [None]:
# Flatten all cleaned tweets into one list of whitespace-separated tokens.
words_list = ' '.join(contents_lr_lc_np_ns).split()
words_list[:5]

In [None]:
# How many of the most frequent words to chart.
TOP_N_WORDS = 50

# (word, frequency) pairs, most frequent first.
word_counts = Counter(words_list).most_common(TOP_N_WORDS)
# Naming the columns in the constructor (instead of assigning .columns afterwards)
# keeps this cell working when the word list is empty.
word_df = pd.DataFrame(word_counts, columns=['word', 'frq'])
display(word_df.head(5))
px.bar(word_df, x='word', y='frq', title='Most common words')

Put the cleaned text back into the main dataframe

In [None]:
# Show the frame before and after replacing the raw text with the cleaned text.
display(df.head(5))
df['content'] = contents_lr_lc_np_ns
display(df.head(5))

Additional cleaning

In [None]:
def clean_content(content):
    '''Strip bracketed text, links, tags, punctuation, newlines and number-bearing words.

    The steps run in this order:
      1. text enclosed in square brackets (non-greedy),
      2. URLs starting with http://, https:// or www.,
      3. angle-bracket tags such as <b>,
      4. every character in string.punctuation,
      5. newline characters (removed without inserting a space),
      6. any word that contains at least one digit.

    Case is left untouched; this function does not lowercase its input.

    Args:
        content: the text of one tweet (str).

    Returns:
        The cleaned text (str). Surrounding whitespace is not collapsed.
    '''
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being treated as
    # invalid string escapes, which newer Python versions warn about.
    content = re.sub(r'\[.*?\]', '', content)
    content = re.sub(r'https?://\S+|www\.\S+', '', content)
    content = re.sub(r'<.*?>+', '', content)
    content = re.sub('[%s]' % re.escape(string.punctuation), '', content)
    content = re.sub(r'\n', '', content)
    content = re.sub(r'\w*\d\w*', '', content)
    return content
# Run the extra cleaning pass over every tweet and show the result.
cleaned_column = df['content'].apply(clean_content)
df['content'] = cleaned_column
display(df)

In [None]:
# Compiled once when the cell runs instead of on every call.
# The previous pattern ended with the range U+24C2..U+1F251, which spans most of
# the Basic Multilingual Plane (CJK, Hangul, and more), so ordinary non-Latin text
# was deleted along with emoji. The ranges below are restricted to symbol blocks.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics (incl. flag letters), enclosed ideographs
    "\U000025A0-\U000027BF"  # geometric shapes, miscellaneous symbols, dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U00003030\U0000303D\U00003297\U00003299"  # individual emoji located in CJK symbol blocks
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(content):
    """Return ``content`` with emoji and pictographic symbols removed.

    Only characters in the symbol ranges of ``_EMOJI_PATTERN`` are deleted;
    letters of any script (including CJK and Hangul) are preserved.

    Args:
        content: text to clean (str).

    Returns:
        The text with every run of matching characters deleted (str).
    """
    return _EMOJI_PATTERN.sub('', content)

In [None]:
# Strip emoji from every tweet and show the result.
emoji_free_column = df['content'].apply(remove_emoji)
df['content'] = emoji_free_column
display(df)

**Sentiment Analysis**

Getting the polarity scores for each tweet

In [None]:
# One analyzer instance is shared by all tweets.
sid = SentimentIntensityAnalyzer()


def ps(x):
    """Return the analyzer's polarity scores for one tweet."""
    return sid.polarity_scores(x)


# NOTE(review): the text scored here has already been lowercased and stripped of
# punctuation and stop words by the earlier cells - confirm that is intended.
sentiment_scores = df['content'].apply(ps)
sentiment_scores

In [None]:
# Expand the per-tweet score records into one column per score.
# Passing the original index keeps every row aligned with `df`, which the later
# index-based join relies on; building from a bare list would silently renumber
# rows 0..n-1 and misalign them if `df` ever has a non-default index.
sentiment_df = pd.DataFrame(list(sentiment_scores), index=sentiment_scores.index)
display(sentiment_df)

Labeling the scores based on the compound polarity value

In [None]:
def labelize(x):
    """Map a compound score to 'neutral' (exactly 0), 'positive' (> 0) or 'negative'."""
    if x == 0:
        return 'neutral'
    if x > 0:
        return 'positive'
    return 'negative'


sentiment_df['label'] = sentiment_df['compound'].apply(labelize)
display(sentiment_df.head(10))

Join the two dataframes

In [None]:
display(df.head(5))
# Attach the sentiment label to each tweet; DataFrame.join aligns rows on the index.
label_column = sentiment_df['label']
data = df.join(label_column)
display(data.head(5))

Plotting the sentiment score counts

In [None]:
# Number of tweets per sentiment label.
# value_counts().reset_index() yields different column names depending on the
# pandas version ('index'/'label' before 2.0, 'label'/'count' from 2.0 on).
# Naming both columns explicitly pins the layout the plotting cell expects:
#   'index' -> the sentiment label, 'label' -> how many tweets carry it.
counts_df = (
    data['label']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='label')
)
display(counts_df)

In [None]:
# Bar chart of tweets per sentiment label.
# The two columns of counts_df are picked by position (category first, count
# second) because their names differ between pandas versions; hard-coding
# 'index'/'label' raises on pandas >= 2.0.
fig, ax = plt.subplots(figsize=(8, 5))
category_col, count_col = counts_df.columns[:2]
sns.barplot(x=category_col, y=count_col, data=counts_df, ax=ax)
ax.set(xlabel='sentiment', ylabel='number of tweets', title='Tweets per sentiment label');

Count the number of tweets, grouped by:
* date
* positive,neutral,negative

In [None]:
# Columns needed for the per-date sentiment aggregation.
agg_columns = ['username', 'date', 'label']
data_agg = data.loc[:, agg_columns]
display(data_agg.head(5))

In [None]:
# Count tweets per (date, label).
# The previous version only renamed the columns, which relabelled 'username' as
# 'date', 'date' as 'label' and 'label' as 'counts' without aggregating anything.
# Built from `data` directly so the cell is safe to re-run.
# Assumes the 'date' column holds timestamps pandas can parse - TODO confirm;
# it is reduced to the calendar day so tweets from the same day share a group.
data_agg = (
    data.assign(date=pd.to_datetime(data['date'], utc=True).dt.date)
        .groupby(['date', 'label'])
        .size()
        .reset_index(name='counts')
)
display(data_agg.head())

In [None]:
# Renumber rows 0..n-1. drop=True discards the old index instead of inserting it
# as a new column, so re-running this cell neither piles up 'index'/'level_0'
# columns nor eventually raises on a name clash.
data_agg = data_agg.reset_index(drop=True)
display(data_agg.head(5))

In [None]:
from wordcloud import WordCloud

In [None]:
# Build one long string from all cleaned tweets; WordCloud.generate takes a single text.
cut_content = " ".join(df.content)
# Upper bound on the number of words drawn (also used in the title below).
max_words=100
word_cloud = WordCloud(
                    background_color='white',
                    # same stop-word set that was used when cleaning the tweets
                    stopwords=set(stop_words),
                    max_words=max_words,
                    max_font_size=30,
                    scale=5,
    colormap='magma',
                    # fixed seed - presumably keeps the layout reproducible between runs
                    random_state=1).generate(cut_content)
fig = plt.figure(1, figsize=(50,50))
# Hide the axes; only the rendered cloud image should be visible.
plt.axis('off')
plt.title('Word Cloud for Top '+str(max_words)+' words with # ENDSars on Twitter\n', fontsize=100,color='blue')
# NOTE(review): top=2.3 is outside the usual 0-1 figure fraction - looks like a
# manual layout tweak to enlarge the image; confirm it renders as intended.
fig.subplots_adjust(top=2.3)
plt.imshow(word_cloud)
plt.show()