In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
import warnings
warnings.simplefilter("ignore")
from wordcloud import WordCloud, STOPWORDS



![](https://thumbs.dreamstime.com/b/word-writing-text-news-analysis-business-concept-measurement-various-qualitative-quantitative-magnifying-glass-138955945.jpg)

In [None]:
# Nested-list `parse_dates` (merging columns inside read_csv) is deprecated in
# recent pandas releases, so load the raw columns and combine them explicitly.
df = pd.read_csv("../input/breaking-news-from-twitter-20102021/tweets_bbc.csv")
# Join the separate `date` and `time` columns into a single `date_time` column
# placed first, removing the two source columns. Values that cannot be parsed
# become NaT instead of raising.
df.insert(
    0,
    "date_time",
    pd.to_datetime(
        df.pop("date").astype(str) + " " + df.pop("time").astype(str),
        errors="coerce",
    ),
)
df.head()

# Data info and cleaning

In [None]:
# Make sure the combined timestamp column holds real datetimes; anything that
# cannot be parsed becomes NaT.
df['date_time'] = pd.to_datetime(df['date_time'], errors='coerce')

# Expand the timestamp into one column per calendar/clock component.
timestamps = df['date_time'].dt
for component in ('year', 'month', 'day', 'dayofweek', 'hour', 'minute', 'dayofyear'):
    df[component] = getattr(timestamps, component)
# Calendar date without the time-of-day part.
df['date_only'] = timestamps.date

In [None]:
print(f"data shape: {df.shape}")
print("--------------------")
df.info()

### Lots of columns are null so we can drop them

In [None]:
df['cashtags'].value_counts()

In [None]:
df['hashtags'].value_counts()

### The value '[]' in the hashtags column carries no information, so we replace it with a null value

In [None]:
# The literal string '[]' marks a tweet without hashtags; store it as missing.
df['hashtags'] = df['hashtags'].replace('[]', np.nan)

In [None]:
df.hashtags.value_counts()

### The cashtags column doesn't contribute anything to the dataset, so we can delete it

In [None]:
df.columns

In [None]:
# Columns left out of the analysis frame.
columns_to_drop = [
    'retweet_date', 'translate', 'trans_src', 'trans_dest', 'near', 'geo',
    'source', 'user_rt_id', 'user_rt', 'retweet_id', 'place', 'thumbnail',
    'quote_url', 'id', 'conversation_id', 'link', 'urls', 'photos',
    'user_id', 'cashtags',
]
data = df.drop(columns=columns_to_drop)

In [None]:
data.head()

# Exploratory data analysis

In [None]:
def missing_data(data):
    """Summarise missing values for every column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and three rows: ``Total`` (number of
        null entries), ``Percent`` (null entries as a percentage of all rows)
        and ``Types`` (the column dtype rendered as a string).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    null_percent = null_counts / null_mask.count() * 100
    summary = pd.concat([null_counts, null_percent], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    # Transpose so that the original columns run across the top.
    return summary.T

In [None]:
missing_data(data)

### Percentage of missing data in hashtags is 65.7539%

In [None]:
def unique_values(data):
    """Count non-null entries and distinct values for every column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and two rows: ``Total`` (non-null
        entries) and ``Uniques`` (number of distinct non-null values).
    """
    summary = data.count().to_frame(name='Total')
    summary['Uniques'] = [data[name].nunique() for name in data.columns]
    # Transpose so that the original columns run across the top.
    return summary.T

In [None]:
unique_values(data)

### The dataframe above shows the number of unique values in each column of the dataset

In [None]:
def most_frequent_values(data):
    """Report the most frequent value of every column and how often it occurs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and four rows: ``Total`` (non-null
        entries), ``Most frequent values``, ``Frequency`` (occurrences of that
        value) and ``Percent from total`` (frequency as a percentage of the
        non-null entries, rounded to two decimals). A column with no non-null
        entries gets NaN as its most frequent value and a frequency of 0.
    """
    total = data.count()
    dataframe = pd.DataFrame(total)
    dataframe.columns = ['Total']
    items = []
    vals = []
    for col in data.columns:
        # Count once per column and reuse the result for both value and frequency.
        counts = data[col].value_counts()
        if counts.empty:
            # All-null column: there is no "first" entry to index into.
            items.append(np.nan)
            vals.append(0)
        else:
            items.append(counts.index[0])
            vals.append(counts.iloc[0])
    dataframe['Most frequent values'] = items
    dataframe['Frequency'] = vals
    dataframe['Percent from total'] = np.round(vals / total * 100, 2)
    # Transpose so that the original columns run across the top.
    return np.transpose(dataframe)

In [None]:
most_frequent_values(data)

### The dataframe above shows the most frequent value in each column of the dataset

# Data visualisation

In [None]:
# Helper function by GM Gabriel Preda
def plot_count(feature, title, df, size=1, ordered=True):
    """Draw a count plot of one column and label each bar with its percentage.

    Parameters
    ----------
    feature : str
        Name of the column of ``df`` to count.
    title : str
        Text inserted into the plot title.
    df : pandas.DataFrame
        Frame holding the column.
    size : int or float, default 1
        Width multiplier: the figure is ``5 * size`` wide and 5 high. When
        ``size`` is greater than 2 the x tick labels are rotated by 90 degrees.
    ordered : bool, default True
        If True, show only the 20 most frequent values, most frequent first;
        otherwise show every value in seaborn's default order.
    """
    f, ax = plt.subplots(1, 1, figsize=(5*size, 5))
    # Percentages are relative to the frame that was passed in, not to a
    # notebook-level global.
    total = float(len(df))
    # Pass the series as `x=` and draw on the axes created above; a positional
    # series is not interpreted as the x variable by newer seaborn releases.
    if ordered:
        g = sns.countplot(x=df[feature], order=df[feature].value_counts().index[:20],
                          palette='Set2', ax=ax)
    else:
        g = sns.countplot(x=df[feature], palette='Set3', ax=ax)
    g.set_title("Number and percentage of {}".format(title))
    if(size > 2):
        plt.xticks(rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        # Centre the percentage label on top of each bar.
        ax.text(p.get_x()+p.get_width()/2.,
                height,
                '{:1.2f}%'.format(100*height/total),
                ha="center")
    plt.show()

In [None]:
plot_count("language", "Language", data,4)

### English-language tweets account for 99.57% of the total

In [None]:
plot_count("hashtags", "Hashtags", data,4)

### Top Hashtags are Syria, Ukraine, ge2015

In [None]:
plot_count("year", "tweets / year", data, size=3, ordered=False)

### Tweets by year: 2013 has the highest number of tweets.

In [None]:
plot_count("month", "tweets / month", data, size=3, ordered=False)

### Tweets by month: March has the highest number of tweets.

In [None]:
plot_count("dayofweek", "tweets / day of week", data, size=3, ordered=False)

### Tweets by day of week: Thursday has the highest number of tweets, followed by Wednesday.

In [None]:
plot_count("hour", "tweets / hour", data,size=4, ordered=False)

### Tweets by hour: the 15th hour (3 p.m.) has the highest number of tweets.

In [None]:
plot_count("minute", "tweets / minute", data,size=5, ordered=False)

In [None]:
plot_count("timezone", "Timezone", data,4)

In [None]:
from wordcloud import WordCloud, STOPWORDS
def show_wordcloud(data, title=""):
    """Display a word cloud built from the non-null entries of ``data``.

    Parameters
    ----------
    data : pandas.Series
        Text entries; null entries are dropped and the rest are joined with
        single spaces into one string.
    title : str, default ""
        Figure title.
    """
    corpus = " ".join(data.dropna())
    # Default word-cloud stop words plus a few extra tokens to leave out.
    ignored_words = set(STOPWORDS) | {"t", "co", "https", "say", "says", "amp"}
    cloud = WordCloud(
        stopwords=ignored_words,
        scale=4,
        max_font_size=50,
        max_words=500,
        background_color="black",
    ).generate(corpus)
    fig = plt.figure(1, figsize=(16,16))
    plt.axis('off')
    fig.suptitle(title, fontsize=20)
    fig.subplots_adjust(top=2.3)
    plt.imshow(cloud, interpolation='bilinear')
    plt.show()

In [None]:

show_wordcloud(data['tweet'], title = 'Most common words in tweets')

### The most common words are Police, Killed, died, London, UK, Syria, Details, report, shooting and jailed, so the news is predominantly negative.

# Sentiment Analysis with NLTK SentimentIntensityAnalyzer

![](https://www.kdnuggets.com/images/sentiment-fig-1-689.jpg)

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer


In [None]:
sa = SentimentIntensityAnalyzer()
def sentiment(news):
    """Classify a text as "Positive", "Negative" or "Neutral".

    The label comes from the sign of the ``compound`` score returned by the
    notebook-level analyzer ``sa``: above zero is positive, below zero is
    negative, exactly zero is neutral.

    Parameters
    ----------
    news : str
        Text to score.

    Returns
    -------
    str
        One of "Positive", "Negative" or "Neutral".
    """
    # Score the text once and reuse the value for both comparisons.
    compound = sa.polarity_scores(news)["compound"]
    if compound > 0:
        return "Positive"
    if compound < 0:
        return "Negative"
    return "Neutral"

In [None]:
#Helpful function by Gabriel Preda https://www.kaggle.com/gpreda/tokyo-2020-tweets-sentiment-analysis

def plot_sentiment(df, feature, title):
    """Plot the distribution of a sentiment column as counts and percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the sentiment labels.
    feature : str
        Name of the column with the labels.
    title : str
        Text inserted into the axis labels and the figure title.
    """
    counts = df[feature].value_counts()
    # Scale to 0-100 so the values match the "Percentage" axis label.
    percent = counts / counts.sum() * 100

    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

    counts.plot(kind='bar', ax=ax1, color='green')
    percent.plot(kind='bar', ax=ax2, color='blue')
    ax1.set_ylabel(f'Counts : {title} sentiments', size=12)
    ax2.set_ylabel(f'Percentage : {title} sentiments', size=12)
    plt.suptitle(f"Sentiment analysis: {title}")
    plt.tight_layout()
    plt.show()

In [None]:
# Label every tweet with its sentiment class, then chart the class distribution.
data['text_sentiment'] = data['tweet'].apply(sentiment)
plot_sentiment(data, 'text_sentiment', 'Text')

In [None]:
show_wordcloud(data.loc[data['text_sentiment']=='Positive', 'tweet'], 
               title = 'Most common words in texts (Positive sentiment)')

In [None]:
show_wordcloud(data.loc[data['text_sentiment']=='Negative', 'tweet'], 
               title = 'Most common words in texts (Negative sentiment)')

In [None]:
show_wordcloud(data.loc[data['text_sentiment']=='Neutral', 'tweet'], 
               title = 'Most common words in texts (Neutral sentiment)')

Another great source of sentiment analysis and text analytics is done by Grandmaster Gabriel Preda
https://www.kaggle.com/gpreda/tokyo-2020-monitor-tweets-frequency

By Master Thomas Konstantin
https://www.kaggle.com/thomaskonstantin/exploring-internet-news-headlines

## Upvote if you like it or fork it

![](https://www.journal-leader.com/wp-content/uploads/2021/04/Thumbs-up-masked-image.jpg)