# WhatsApp Group Chat Data Analysis
Purpose of project

Find the most active users in the group. Find the times when most users are active.

Scope

Sentiment analysis of individual members. Sentiment analysis of the overall group chat.

In [None]:
import os
import pandas as pd
import numpy as np
import re
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

In [None]:
# Load the exported chat so that every non-blank line of the file becomes one
# row in a single unnamed column (column 0). "delimiter" is a sentinel
# separator that is not expected to occur in the chat text.
# Separators longer than one character are interpreted as regular expressions
# and require the Python parser, so the engine is requested explicitly instead
# of relying on the automatic fallback (which emits a ParserWarning).
df = pd.read_csv(
    'chat.txt',
    sep="delimiter",
    skip_blank_lines=True,
    header=None,
    engine="python",
)
df.head(3)

In [None]:
def getdate(x):
    """Return the first timestamp found in a raw chat line.

    A timestamp is ``<digits>/<digits>/<digits>`` followed by a space and
    three two-digit groups separated by any single character (the pattern
    uses an unescaped ``.``, so ``10.15.30`` and ``10:15:30`` both match).

    Args:
        x: One raw line of the chat export.

    Returns:
        The matched timestamp text, or an empty string when *x* contains none.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    res = re.search(r"\d+/\d+/\d+ \d{2}.\d{2}.\d{2}", x)
    if res is not None:
        return res.group()
    return ""

# Extract the timestamp of every raw line (column 0); lines without one get "".
df["Datetime"] = [getdate(raw_line) for raw_line in df.iloc[:, 0]]
# The former `df['Datetime'].dropna(inplace=True)` was dropped: getdate() only
# ever returns strings (never NaN), and an in-place dropna on a column
# selection does not modify `df` itself.
# Split "<date> <time>" at the first space into two columns.
new = df["Datetime"].str.split(" ", n=1, expand=True)
df["Date"] = new[0]
df["time"] = new[1]
# The combined column is no longer needed once it has been split.
df.drop(columns=["Datetime"], inplace=True)

In [None]:
def getsender(x):
    """Return the text in front of the first ``": "`` of *x*, minus its first character.

    The shortest leading run that ends in ``": "`` is matched; the first
    character of that run and the trailing ``": "`` are cut off.

    Returns an empty string when *x* contains no ``": "``.
    """
    match = re.search(".*?: ", x)
    if match is None:
        return ""
    prefix = match.group()
    # Drop the leading character and the two-character ": " suffix.
    return prefix[1:-2]
# Header part of every raw line (everything before the first ": ").
df["sender"] = [getsender(raw_line) for raw_line in df.iloc[:, 0]]
# The former `df['sender'].dropna(inplace=True)` was dropped: getsender() only
# ever returns strings (never NaN), and an in-place dropna on a column
# selection does not modify `df` itself.
# Whatever follows the first "]" of the header is kept as the sender
# ("pengirim"); headers without a "]" yield a missing value here.
new = df["sender"].str.split("]", n=1, expand=True)
df["pengirim"] = new[1]
df.drop(columns=["sender"], inplace=True)


In [None]:
def getmessage(x):
    """Return everything after the first ``": "`` in *x* (up to the end of that line).

    Returns ``None`` when *x* contains no ``": "``.
    """
    found = re.search(": .*", x)
    # Skip the two-character ": " prefix of the match.
    return found.group()[2:] if found is not None else None

# Message body of every raw line (None where the line has no ": ").
df["Message"] = [getmessage(raw_line) for raw_line in df.iloc[:, 0]]
df.head(3)

In [None]:
# Discard every row that has a missing value in any column, e.g. rows for
# which getmessage() returned None.
df = df.dropna()

In [None]:
# Messages per sender, computed in a single pass. value_counts() returns the
# counts already sorted from most to least active, so no separate sort or
# re-indexing is needed afterwards. (Missing senders are not counted; the
# previous implementation listed them with a count of 0.)
sender_counts = df["pengirim"].value_counts()
group_members = sender_counts.index.tolist()
n_message = sender_counts.tolist()
## Create a dataframe to store above values (index runs 0..n-1, most active first)
activity_data = pd.DataFrame({"pengirim": group_members, "n_count": n_message})

In [None]:
# Bar chart of the (up to) ten most active senders.
X = activity_data["pengirim"][:10]
Y = activity_data.n_count[:10]

plt.figure(figsize=[10, 10])

plt.title("Top 10 Active Members", size=16)

plt.bar(x=X, height=Y, color="seagreen")
plt.xticks(rotation=90, size=12)
plt.yticks(size=12)

# Write the count above each bar. Iterating over Y itself (instead of a fixed
# range of 10) keeps this working for groups with fewer than ten senders.
# The label is passed positionally because the keyword was renamed from `s`
# to `text` in matplotlib and `s=` is no longer accepted.
for i, count in enumerate(Y):
    plt.annotate(str(count), xy=(i - 0.25, count + 5), size=12)

plt.show()

In [None]:
# Convert the date strings into pandas datetime values.
# NOTE(review): no format/dayfirst is given, so an ambiguous date such as
# 03/04/20 is read month-first by default - confirm this matches the export.
df['Date']= pd.to_datetime(df['Date'])

In [None]:
# Weekday name of each message date.
df['dayOfWeek'] = df['Date'].dt.day_name() 
# Display the new column.
df['dayOfWeek']

In [None]:
# Bar chart of the number of messages per weekday ("Hari" = "day").
# The keyword must be lowercase `color`: current matplotlib no longer accepts
# property names case-insensitively, so `COLOR=` raises an error.
df['dayOfWeek'].value_counts().plot(kind='bar', title='Hari', color='seagreen')

In [None]:
# Number of messages per calendar date, busiest dates first.
df['Date'].value_counts()

In [None]:
# First two characters of the time string - presumably the hour of day
# ("jam" is Indonesian for "hour").
df['jam'] = df['time'].str[:2]
# Display the new column.
df['jam']

In [None]:
# Bar chart of the number of messages per value of the 'jam' (hour) column.
# - `color` must be lowercase; current matplotlib rejects `COLOR=`.
# - The title was copy-pasted from the weekday chart ('Hari' = day); this
#   chart shows hours, so it is titled 'Jam'.
df['jam'].value_counts().plot(kind='bar', title='Jam', color='seagreen')

In [None]:
# Case folding: lowercase every message using pandas' Series.str.lower().
# NOTE: from here on the message text is all lowercase, so any later
# comparison against message text has to use lowercase as well.
df['Message'] = df['Message'].str.lower()


# Show the first few messages after case folding.
print('Case Folding Result : \n')
print(df['Message'].head(5))
print('\n\n\n')

In [None]:
import emoji
import regex

def split_count(text):
    """Return the emoji found in *text*, in order of appearance.

    *text* is split into extended grapheme clusters (``\\X``), so an emoji
    made of several code points is kept as one item. A cluster is reported
    when at least one of its characters is a known emoji.

    Args:
        text: The message text to scan.

    Returns:
        A list of grapheme clusters (strings) that contain an emoji; the list
        is empty when there are none. Repeated emoji appear repeatedly.
    """
    # Pick the emoji lookup table that the installed `emoji` package offers:
    #   * EMOJI_DATA    - current releases (UNICODE_EMOJI no longer exists)
    #   * UNICODE_EMOJI - older releases; either keyed by language ('en', ...)
    #                     or, in the oldest ones, keyed directly by emoji.
    emoji_table = getattr(emoji, "EMOJI_DATA", None)
    if emoji_table is None:
        legacy_table = emoji.UNICODE_EMOJI
        emoji_table = legacy_table.get("en", legacy_table)

    return [
        cluster
        for cluster in regex.findall(r'\X', text)
        if any(char in emoji_table for char in cluster)
    ]

# Overall totals for the whole chat.
total_messages = df.shape[0]
# Rows whose text is the media placeholder. The comparison is done on the
# lowercased text: the messages have been case-folded earlier, so comparing
# against the mixed-case '<Media omitted>' never matched and always gave 0.
is_media = df['Message'].str.lower() == '<media omitted>'
media_messages = df[is_media].shape[0]
# List of emoji per message, and the total number of emoji sent.
df["emoji"] = df["Message"].apply(split_count)
emojis = sum(df['emoji'].str.len())
# Number of http(s) links per message, and the total number of links.
URLPATTERN = r'(https?://\S+)'
df['urlcount'] = df['Message'].str.count(URLPATTERN)
links = np.sum(df.urlcount)

In [None]:
# Separate media placeholders from real text messages. The comparison is done
# on the lowercased text: the messages have been case-folded earlier, so
# comparing against the mixed-case '<Media omitted>' never matched and left
# media_messages_df empty.
media_messages_df = df[df['Message'].str.lower() == '<media omitted>']
messages_df = df.drop(media_messages_df.index)

In [None]:
# Length of every text message in characters ...
messages_df['Letter_Count'] = messages_df['Message'].map(len)
# ... and in space-separated pieces (one more than the number of spaces).
messages_df['Word_Count'] = messages_df['Message'].map(lambda text: text.count(' ') + 1)
messages_df.head(3)

In [None]:
# Print a small report for every distinct sender of a text message.
l = messages_df.pengirim.unique()

for i, author in enumerate(l):
    # Text messages written by this sender only.
    req_df = messages_df[messages_df["pengirim"] == author]
    print(f'Stats of {author} -')
    # One row per message, so the row count is the number of messages sent.
    print('Messages Sent', len(req_df))
    # Total words across the sender's messages divided by the message count.
    words_per_message = req_df['Word_Count'].sum() / len(req_df)
    print('Words per message', words_per_message)
    # Media placeholders sent by the same sender.
    media = len(media_messages_df[media_messages_df['pengirim'] == author])
    print('Media Messages Sent', media)
    # Total number of emoji in the sender's messages.
    emojis = sum(req_df['emoji'].str.len())
    print('Emojis Sent', emojis)
    # Total number of links in the sender's messages.
    links = sum(req_df["urlcount"])
    print('Links Sent', links)
    print()

In [None]:
# Distinct emoji used across all text messages, and how many there are.
total_emojis_list = list({symbol for found in messages_df.emoji for symbol in found})
total_emojis = len(total_emojis_list)
print(total_emojis)


In [None]:
import collections
from collections import Counter

In [None]:
# Every emoji occurrence in the text messages (duplicates kept).
total_emojis_list = [symbol for found in messages_df.emoji for symbol in found]
# (emoji, count) pairs ordered from most to least frequent.
emoji_dict = Counter(total_emojis_list).most_common()

emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])
emoji_df

In [None]:
import plotly
# Print the version of the installed plotly package.
print(plotly.__version__)

In [None]:
import plotly
import plotly.express as px

In [None]:
# Pie chart showing each emoji's share of all emoji sent.
fig = px.pie(
    emoji_df,
    names='emoji',
    values='count',
    title='Emoji Distribution',
)
# Put the percentage and the emoji label inside each slice.
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

In [None]:
# Per-date count of non-missing values in every column, with "Date" turned
# back into a regular column.
date_df = messages_df.groupby("Date").count().reset_index()
date_df.head()

In [None]:
# Per-date count of non-missing values in every column; the Letter_Count
# column therefore holds the number of text messages on that date.
date_df = messages_df.groupby("Date").count().reset_index()

# Line chart of that per-date count.
fig = px.line(date_df, title='Message', x="Date", y="Letter_Count")
fig.update_xaxes(nticks=20)
fig.show()

In [None]:
# Horizontal bar chart of the ten dates with the most text messages.
messages_df['Date'].value_counts().iloc[:10].plot(kind='barh')
plt.xlabel('Number of Messages')
plt.ylabel('Date')

In [None]:
# Horizontal bar chart of the ten most frequent values of the 'jam' column.
messages_df['jam'].value_counts().iloc[:10].plot(kind='barh')
plt.xlabel('Number of messages')
plt.ylabel('jam')

In [None]:
def dayofweek(i):
    """Return the English weekday name for index *i* (0 = Monday ... 6 = Sunday).

    Regular sequence indexing applies, so an out-of-range *i* raises
    ``IndexError``.
    """
    names = (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    )
    return names[i]
# Number of text messages per weekday, drawn as a closed polar chart.
day_df = pd.DataFrame(messages_df["Message"])
day_df['day_of_date'] = messages_df['Date'].dt.weekday
day_df['day_of_date'] = day_df["day_of_date"].apply(dayofweek)
day_df["messagecount"] = 1

# Calendar order of the weekday names (Monday first).
weekday_order = [dayofweek(n) for n in range(7)]
# Sum only the counter column (summing the whole frame would also "sum" the
# message strings), then put the days into calendar order instead of the
# alphabetical order produced by groupby. Days without messages get 0.
day = (
    day_df.groupby("day_of_date")["messagecount"]
    .sum()
    .reindex(weekday_order, fill_value=0)
    .rename_axis("day_of_date")
    .reset_index()
)

fig = px.line_polar(
    day,
    r='messagecount',
    theta='day_of_date',
    line_close=True,
    category_orders={"day_of_date": weekday_order},
)
fig.update_traces(fill='toself')
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            # Keep the original 0-6000 scale, but extend it when a weekday
            # exceeds 6000 messages so that the trace is not cut off.
            range=[0, max(6000, int(day['messagecount'].max()))]
        )),
    showlegend=False
)
fig.show()
