In [1]:
#Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

#Progress bar
from tqdm import tqdm

In [2]:
#Read output.csv (decoded as UTF-8) into the dataframe df
with open('output.csv', encoding='utf8') as csv_handle:
    df = pd.read_csv(csv_handle)

In [3]:
#Generate list of all users (in order of first appearance in df)
users = df['sender'].unique()

#Map each user to their row/column position in the graph.
#Missing senders (NaN) are left out so they can never match a lookup.
userIndex = {user: i for i, user in enumerate(users) if not pd.isna(user)}

#Map each message id to the sender of that message. Building this once lets
#every reply be resolved with a dictionary lookup instead of a scan of df.
#setdefault keeps the first row seen when a msg_id occurs more than once.
senderByMessageID = {}
for messageID, messageSender in zip(df['msg_id'], df['sender']):
    if not pd.isna(messageID):
        senderByMessageID.setdefault(messageID, messageSender)

#Create empty graph of users; graph[a, b] counts replies from user a to user b
graph = np.zeros((len(users), len(users)))

#Add an additional edge from user a to user b if user a replies to user b.
#Wrapping the iterable in tqdm closes the progress bar even if the loop raises.
for sender, repliedMessageID in tqdm(zip(df['sender'], df['reply_to_msg_id']), total=len(df)):
    #A reply id of -1 (or a missing value) is not treated as a reply
    if pd.isna(repliedMessageID) or repliedMessageID == -1:
        continue
    #Skip replies to messages that do not exist in the df
    if repliedMessageID not in senderByMessageID:
        continue
    a = userIndex.get(sender)
    b = userIndex.get(senderByMessageID[repliedMessageID])
    #Either side can be None when the sender is missing (NaN); no edge is added then
    if a is None or b is None:
        continue
    #Add an edge from a -> b
    graph[a, b] += 1


print(graph)

In [30]:
#Flatten the reply graph, row by row, into [count, user of row, user of column] entries
leaderboard = [
    [count, users[i], users[j]]
    for i, counts in enumerate(graph)
    for j, count in enumerate(counts)
]
#Largest counts first; the sort is stable, so tied entries keep their graph order
leaderboard.sort(key=lambda entry: entry[0], reverse=True)

#Turn the leaderboard into a table with one "<user> & <user>" label per pairing
commonPairings = pd.DataFrame(leaderboard)
commonPairings.columns = ['Messages', 'Pair', 'Pair1']
commonPairings['Pair'] = commonPairings['Pair'] + ' & ' + commonPairings['Pair1']
commonPairings = commonPairings.drop(columns='Pair1')

#Keep only the first 10 rows, show them, and save them
reduced = commonPairings.head(10)
print(reduced)

reduced.to_csv("CommonPairings.csv", index=False)

In [37]:

from datetime import date
#Build a table of messages per day: one row per day, with a column "all"
#followed by one column per user.

#Add day column to df with just the first 10 characters of df['date']
df['day'] = df['date'].str[:10]

#Days in order of first appearance in df; this fixes the row order of the table
days = df['day'].unique()

#Count messages per (day, sender) in one pass instead of incrementing a cell per message.
#reindex restores the first-appearance order of days and users and fills
#combinations with no messages with 0.
dailyActivity = pd.crosstab(df['day'], df['sender']).reindex(index=days, columns=users, fill_value=0)

#The first column is the total number of messages sent on each date
dailyActivity.insert(0, 'all', df['day'].value_counts().reindex(days, fill_value=0))

#Label index (and clear the column-axis name left behind by crosstab)
dailyActivity = dailyActivity.rename_axis(index='Date', columns=None)

#Group the rows of dailyActivity by month
#If the first 7 characters of the row name are the same, then they are in the same month
#Add up the columns for all rows in the same month
dailyActivityByMonth = dailyActivity.groupby(lambda x: x[:7]).sum()
dailyActivityByMonth.index.name = 'Month'

dailyActivity.to_csv("DailyActivity.csv")
dailyActivityByMonth.to_csv("DailyActivityByMonth.csv")




In [4]:
#Calculate hourly activity: one row per hour 0-23, with a column "all"
#followed by one column per user.

#Add new columns "time" (characters 11:19 of df['date']) and "hour" (its first two characters as an int)
df['time'] = df['date'].str[11:19]
df['hour'] = df['time'].str[:2].astype(int)

#Every hour from 0 to 23 gets a row, even if no message was sent in it
hours = list(range(24))

#Count messages per (hour, sender) in one pass instead of incrementing a cell per message.
#reindex guarantees all 24 rows and the original user column order, with 0 for empty cells.
hourlyActivity = pd.crosstab(df['hour'], df['sender']).reindex(index=hours, columns=users, fill_value=0)

#The first column is the total number of messages sent in each hour
hourlyActivity.insert(0, 'all', df['hour'].value_counts().reindex(hours, fill_value=0))

#Label index (and clear the column-axis name left behind by crosstab)
hourlyActivity = hourlyActivity.rename_axis(index='Hour', columns=None)
hourlyActivity.to_csv("HourlyActivity.csv")

In [82]:
from emoji import UNICODE_EMOJI
def is_emoji(s):
    """Return True if ``s`` is a key of ``UNICODE_EMOJI['en']``, otherwise False."""
    english_emoji = UNICODE_EMOJI['en']
    return s in english_emoji

#Text of every message. Missing bodies (NaN) become '' so they count as zero
#characters; str(NaN) would otherwise be the 3-character string 'nan' and
#inflate messageLength.
messageText = df['msg_content'].fillna('').astype(str)

#Add a column with the number of characters in the message
df['messageLength'] = messageText.str.len()
#Add a column with the number of capital characters in each message
df['capitalLetters'] = messageText.apply(lambda text: sum(1 for c in text if c.isupper()))
#Add a column with the number of emojis in each message
#(is_emoji is checked against one character at a time)
df['emojiCount'] = messageText.apply(lambda text: sum(1 for c in text if is_emoji(c)))
#Add a column with 1 if message is all caps, 0 otherwise
df['allCaps'] = messageText.str.isupper().astype(int)

#Make a new dataframe where each row is a user
userData = pd.DataFrame()
for user in users:
    #Select this user's messages once and reuse the selection for every statistic
    userMessages = df[df['sender'] == user]
    numMessages = len(userMessages)
    #Total characters sent by this user; denominator of the per-character percentages
    totalCharacters = userMessages['messageLength'].sum()

    #Add column for num messages sent
    userData.loc[user, 'numMessages'] = numMessages
    #Add column for emojis as a percentage of all characters sent
    userData.loc[user, 'emojiPercentage'] = (userMessages['emojiCount'].sum() / totalCharacters) * 100
    #Add column for capital letters as a percentage of all characters sent
    userData.loc[user, 'capitalLettersPercentage'] = (userMessages['capitalLetters'].sum() / totalCharacters) * 100
    #Add column for percentage of messages that are all caps
    userData.loc[user, 'allCapsPercentage'] = (userMessages['allCaps'].sum() / numMessages) * 100
userData.index.name = 'User'
userData.to_csv("UserData.csv")
