In [98]:
# import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import wordcloud as cld
import nltk
from textblob import TextBlob
import plotly.express as px

In [99]:
# read all files into their respective segments
seg_1 = pd.read_csv('csv-1700-1830.csv', encoding = "ISO-8859-1", parse_dates=['date(yyyyMMddHHmmss)'])
seg_2 = pd.read_csv('csv-1831-2000.csv', encoding = "ISO-8859-1", parse_dates=['date(yyyyMMddHHmmss)'])
seg_3 = pd.read_csv('csv-2001-2131.csv', encoding = "ISO-8859-1", parse_dates=['date(yyyyMMddHHmmss)'])

# concat vertically as df
df = pd.concat([seg_1, seg_2, seg_3], ignore_index=True)

In [100]:
# remove unimportant authors
# list of top authors to exclude
exclude = ['choconibbs', 'POK', 'Clevvah4Evah', 'KronosQuoth']

df = df[~df['author'].isin(exclude)]

In [101]:
polarities = []
sentiments = []
bias = []

def sentiment_bias(message, sent_threshold, bias_threshold):
    text_blob = TextBlob(message)

    polarities.append(text_blob.polarity)

    if abs(text_blob.polarity) < sent_threshold:
        sentiments.append('neutral')
    elif text_blob.polarity <= -sent_threshold:
        sentiments.append('negative')
    else:
        sentiments.append('positive')

    if text_blob.subjectivity < bias_threshold:
        bias.append('unbiased')
    else:
        bias.append('biased')

for m in df['message'].tolist():
    sentiment_bias(m, 0.001, 0.1)

df['polarity'] = polarities
df['sentiment'] = sentiments
df['bias'] = bias

In [102]:
df[['negative', 'neutral', 'positive']] = pd.get_dummies(df['sentiment'])
df[['biased', 'unbiased']] = pd.get_dummies(df['bias'])

#### Bias throughout the Night

In [103]:
agg_5min = df.groupby(pd.Grouper(key='date(yyyyMMddHHmmss)', freq='5min'))

In [104]:
bias_5min = agg_5min[['biased', 'unbiased']].sum()
bias_5min = bias_5min.reset_index()
bias_5min['time'] = bias_5min['date(yyyyMMddHHmmss)'].apply(lambda x: (x.value - pd.to_datetime('2014-01-23 17:00:00').value) // 10**9)

In [105]:
bias_5min = pd.melt(
    bias_5min,
    id_vars=['time'],
    value_vars=['biased', 'unbiased'],
    var_name='bias',
    value_name='count'
)

In [106]:
fig = px.bar(
    bias_5min,
    x='bias',
    y='count',
    color='bias',
    color_discrete_sequence=['red', 'green'],
    animation_frame='time',
    labels={
        'bias': 'Bias',
        'count': 'Count'
    },
    title='Bias Counts Throughout the Evening (5 pm to 9:30 pm)'
)
fig.update_layout(yaxis_range=[0, 150])
fig.show()

In [107]:
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string


# Step 1: Tokenize Messages
stop_words = set(stopwords.words('english') + ['rt', 'abila', 'RT', 'Abila', 'AbilaPost', 'KronosStar', 'CentralBulletin', 'hi', 'Hi', 'HomelandIlluminations', 'HI'])
stop_words.update('rt', 'abila', 'abilapost', 'abilapost')
df['tokens'] = df['message'].apply(lambda x: [word.lower() for word in word_tokenize(x) if word.isalpha() and word not in stop_words and word not in string.punctuation])

In [108]:
df['tokens']

1       [do, miss, moment, follow, live, coverage, pok...
2       [come, join, us, park, music, tonight, city, p...
3       [pok, rally, start, city, park, pok, leader, s...
4       [pok, rally, set, take, place, city, park, pok...
5                   [pok, rally, park, tonight, pokrally]
                              ...                        
4058          [unknown, explosion, heard, dancingdolphin]
4059      [explosion, heard, dancing, dolphin, apartment]
4060    [there, explosion, inside, apartment, building...
4061                               [redisrad, what, boom]
4062      [explosion, heard, dancing, dolphin, apartment]
Name: tokens, Length: 2528, dtype: object

In [109]:
# Group into 5 min bins and concat messages
df_grouped = df.groupby(pd.Grouper(key='date(yyyyMMddHHmmss)', freq='5min'))['tokens'].apply(lambda x: ' '.join(' '.join(words) for words in x)).reset_index()

# get word frequenceis
df_grouped['word_frequency'] = df_grouped['tokens'].apply(lambda x: FreqDist(word_tokenize(x)))

# select top 3 words
top_words_overall = FreqDist()
for frequency_dist in df_grouped['word_frequency']:
    top_words_overall += frequency_dist

top_n_words = [word for word, count in top_words_overall.most_common(3)]


In [110]:
# create a new df for top words
df_top_words = pd.DataFrame(index=df_grouped['date(yyyyMMddHHmmss)'], columns=top_n_words)

# fill df with top words in each 5 min time bin
for index, row in df_grouped.iterrows():
    time_bin = row['date(yyyyMMddHHmmss)']
    word_frequency = row['word_frequency']
    for word in top_n_words:
        df_top_words.at[time_bin, word] = word_frequency.get(word, 0)


In [111]:
# plot
fig = px.line(df_top_words,
     x=df_top_words.index, y=top_n_words, title='Top 3 Most Frequent Words Over Time',
     color_discrete_sequence=['red', 'black', 'blue'],
              labels={'index': 'Time', 'value': 'Word Count'}, line_shape='linear')

fig.show()

In [112]:
import pandas as pd
import plotly.express as px
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from textblob import TextBlob  


# Sentiment Analysis
df['sentiment'] = df['message'].apply(lambda x: TextBlob(x).sentiment.polarity)