In [12]:
# import pandas as pd
import re
import collections
from parser import get_messages
import pandas as pd
import matplotlib.pyplot as plt
import sys


# get path to the file to be analyzed
# filepath = 'sample.txt'

with open('./filepath') as f:
    filepath = f.read()

# get messages in dataframe
msgs = get_messages(filepath)


In [13]:
# convert utc timestamp to datetime
msgs['date'] = pd.to_datetime(msgs['timestamp'],unit='s')
msgs = msgs.drop('timestamp', axis=1)
msgs = msgs.drop('platform', axis=1)
msgs = msgs.drop('language', axis=1)

In [14]:
# get number of messages of each user
participants = list(msgs['senderName'].unique())
msg_counts = {}

for participant in participants:
    msg_counts[participant] = len(msgs[msgs['senderName'] == participant])

print(msg_counts)

In [15]:
# get timeseries of message/activity over time
# x-axis: time in months, y-axis: number of messages 
msgs2 = msgs
msgs2.index = msgs2.date
msgs_over_time_values = msgs2.date.resample('M').count()
msgs_over_time_values.plot(x="time", y="count")
plt.ylim(ymin=0)
plt.show()

In [16]:
# generate a single string of all messages
total_log = ' '.join(msgs['text'])

In [17]:
# generate wordcloud of conversation
# %pip install wordcloud
from wordcloud import WordCloud

# generate and display the word cloud
wordcloud = WordCloud().generate(total_log)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [18]:
# carry out topic modelling using LDA to generate topics
# %pip install scikit-learn

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, MiniBatchNMF, LatentDirichletAllocation
from time import time

n_samples = 20000
n_features = 1000
n_components = 10
n_top_words = 15
batch_size = 128
init = "nndsvda"


def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()



# print("Loading dataset...")
t0 = time()
data = msgs['text']
data_samples = data #[:n_samples]
# print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
# print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
# print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
# print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)

print(
    "\n" * 2,
    "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)
t0 = time()
lda.fit(tf)
# print("done in %0.3fs." % (time() - t0))

tf_feature_names = tf_vectorizer.get_feature_names_out()
plot_top_words(lda, tf_feature_names, n_top_words, "Topics in LDA model")


In [20]:
counts_by_hour = msgs.groupby([(msgs.date.dt.hour)]).count()
hours = list(counts_by_hour['date'].index)

def suffix(time):
    if time >= 12:
        return "pm"
    else:
        return "am"
def transform_time(time):
    if time%12 == 0:
        return 12
    else:
        return time%12
hours = [str(transform_time(h)) + suffix(h) for h in hours]
counts = list(counts_by_hour['text'])

plt.figure(figsize=(15,5))
plt.plot(hours, counts)
plt.ylim(ymin=0)
print("Message activity by time of day")
plt.show()

In [21]:
# %pip install emoji

In [22]:
# get most common emojis
import emoji

def extract_emojis(s):
  return ''.join(c for c in s if c in emoji.UNICODE_EMOJI['en'])

import re
import collections
emoji_counts = {}

# Iterate over the messages and count the emojis used by each user
for index, row in msgs.iterrows():
    sender = row['senderName']
    emojis = emoji.distinct_emoji_list(row['text'])

    for item in emojis:
        if sender not in emoji_counts:
            emoji_counts[sender] = collections.defaultdict(int)
        emoji_counts[sender][item] += 1

# Print the most common emoji for each user
import operator
for sender in emoji_counts.keys():
    vals = emoji_counts[sender]
    res = {k: v for k, v in sorted(vals.items(), key=lambda item: item[1])}
    most_common = list(res.keys())
    most_common.reverse()
    print(f"Most used emojis by {sender}: {most_common[:5]}")

In [1]:
from transformers import pipeline
classifier = pipeline('sentiment-analysis')


  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 629/629 [00:00<00:00, 151kB/s]


RuntimeError: Failed to import transformers.models.distilbert.modeling_distilbert because of the following error (look up to see its traceback):
Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

In [None]:
# Iterate over the messages and run sentiment analysis on each line
sentiments = []
for index, row in msgs.iterrows():
    sender = row['senderName']
    
    # make API call to get sentiment
    resp = requests.post(url, data={'text': row['text']})
    # sentiment = classifier(row['text'])
    sentiment = resp.json()

    sentiments.append({'date': msgs['date'], 'type': sentiment[0]['label'], 'score': sentiment[0]['score']})


In [None]:
# process the sentiments records
for idx, sentiment in enumerate(sentiments):
    if sentiments[idx]['type'] == 'NEGATIVE':
        sentiments[idx]['score'] *= -1

# plot the sentiment over time
xaxis = [x['date'] for x in sentiments]
yaxis = [x['score'] for x in sentiments]
plt.bar(xaxis, yaxis, color = list(map(lambda x: 'g' if x > 0 else 'r', yaxis))) # blue/red for positive/negative bars
plt.title("sentiment over time")
plt.show()