In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the whole chat export as UTF-8 text. The context manager closes the
# file handle as soon as the contents are in memory, so nothing is left open
# for the rest of the session.
with open('Minerskape.txt', 'r', encoding = 'utf-8') as f:
    data = f.read()

In [None]:
# Preview only the start of the export; printing the full chat floods the
# notebook output and stores every message in the saved file.
print(data[:1000])

In [None]:
# Refer regex101 for using regular expression
import re

In [None]:
# Timestamp prefix that starts every exported line, e.g. "12/01/21, 10:30 - ".
# Raw string: \d and \s are regex escapes, not Python string escapes.
pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

In [None]:
# Split the export at every timestamp prefix. The piece in front of the first
# prefix is dropped, so entry k is the text that follows the k-th timestamp.
chunks = re.split(pattern, data)
message = chunks[1:]
len(message)

In [None]:
# Show a handful of chunks instead of printing every message in the chat.
message[:5]

In [None]:
# Collect the timestamp prefixes themselves, in order of appearance.
dates = [match.group(0) for match in re.finditer(pattern, data)]
len(dates)

In [None]:
# Show a handful of prefixes instead of printing every timestamp.
dates[:5]

In [None]:
# Pair each message chunk with its timestamp prefix, then parse the prefix
# (day/month/two-digit year, 24-hour clock) into a datetime column.
timestamp_format = '%d/%m/%y, %H:%M - '
df = pd.DataFrame({'user_message': message, 'date': dates})
df = df.assign(date=pd.to_datetime(df['date'], format=timestamp_format))
df.head(n=10)

In [None]:
# Identifying the user
# A line sent by a user has the shape "<sender>: <text>"; a line without that
# separator is recorded under the pseudo-user 'group notification'.
user = []
messages = []
for chat in df['user_message']:
    # maxsplit=1 cuts only at the FIRST ": ", so colons inside the message
    # body stay in the text instead of truncating it.
    entry = re.split(r'([\w\W]+?):\s', chat, maxsplit=1)
    if len(entry) == 3:
        # entry == [text before the match, sender, message text]
        user.append(entry[1])
        messages.append(entry[2])
    else:
        # No "<sender>: " separator. This also covers a ':' that is not
        # followed by whitespace, which previously raised IndexError.
        user.append('group notification')
        messages.append(chat)
df['user_message'] = messages
df['user'] = user

In [None]:
# Show the frame now that sender and message text live in separate columns.
df

## Identifying Year, Month, Day, Hour and Minute of message

In [None]:
# Break the timestamp into calendar and clock components, one column each.
stamp = df['date'].dt
df['year'] = stamp.year
df['month'] = stamp.month_name()
df['day'] = stamp.day
df['hour'] = stamp.hour
df['minute'] = stamp.minute

In [None]:
# Show the frame with the year / month / day / hour / minute columns added.
df

In [None]:
# (rows, columns) of the message frame.
df.shape

In [None]:
# Number of messages sent by 'Mayank Cu'
# Summing the boolean mask counts the matching rows directly.
int((df['user'] == 'Mayank Cu').sum())

In [None]:
# Flatten every message into one list of whitespace-separated tokens.
# A comprehension keeps its loop variable private, so the module-level
# `message` list (the raw split of the export used to build `df`) is no
# longer overwritten by the last message string.
words = [token for text in df['user_message'] for token in text.split()]

In [None]:
# Total number of tokens collected from all messages.
len(words)

In [None]:
# Peek at the first tokens only; the full list is as long as the whole chat.
words[:20]

In [None]:
## Identifying the media messages
# Count rows whose text is exactly the export's media placeholder.
media_placeholder = "<Media omitted>\n"
int(df['user_message'].eq(media_placeholder).sum())

# !pip install urlextract

In [None]:
from urlextract import URLExtract
# One shared extractor instance, reused for every message when collecting links.
extractor = URLExtract()

In [None]:
## Identifying the links
# Collect every URL found in any message. The comprehension keeps its loop
# variable local, so the module-level `message` list is not overwritten.
links = [url for text in df['user_message'] for url in extractor.find_urls(text)]
len(links)

In [None]:
## Finding and plotting count of each user
# value_counts() orders users by descending message count; keep the first five.
user_counts = df['user'].value_counts()
x = user_counts.iloc[:5]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bar labels (user names) and bar heights (message counts) for the chart.
name, count = x.index, x.values

In [None]:
# Bar chart of the most active users.
plt.bar(name, count)
plt.xticks(rotation = 'vertical')
plt.xlabel('User')
plt.ylabel('Messages sent')
plt.title('Most active users')
# Render explicitly so the cell does not echo the return value of xticks().
plt.show()

In [None]:
# Message count for every user, not just the top five plotted above.
df['user'].value_counts()

In [None]:
## Finding the most active user via percentage of messages sent
# Name the index and the values explicitly instead of renaming whatever labels
# reset_index() happens to generate: those labels differ between pandas 1.x
# ('index' / 'user') and pandas 2.x ('user' / 'count').
(
    df['user'].value_counts()
    .div(df.shape[0])
    .mul(100)
    .round(2)
    .rename_axis('Name')
    .rename('Percentage')
    .reset_index()
)

In [None]:
## Finding the most frequent words
from collections import Counter
# Twenty most common tokens as (word, count) pairs, shown as a two-column table.
top_words = Counter(words).most_common(20)
pd.DataFrame(top_words).rename(columns = {0: 'Word', 1: 'Count'})

## Excluding group notifications, omitted-media placeholders and deleted messages

In [None]:
# Drop, in turn, group notifications, media placeholders and deleted-message
# stubs; each intermediate frame is kept under its original name.
df1 = df.loc[df['user'].ne('group notification')]
df2 = df1.loc[df1['user_message'].ne('<Media omitted>\n')]
temp = df2.loc[df2['user_message'].ne('This message was deleted\n')]

In [None]:
# Show the frame that remains after the three filters above.
temp

## Removing stop words from user messages and labelling admin mentions

In [None]:
# Load the stop-word list as a set of whole words. Keeping the raw file text
# would turn `word in stop_words` into a substring search, which discards any
# token that merely occurs somewhere inside that text.
# The encoding is given explicitly (as for the chat export) and the context
# manager closes the handle.
with open('stop_hinglish.txt', 'r', encoding = 'utf-8') as f:
    stop_words = set(f.read().split())
# Preview a few entries rather than echoing the entire file.
print(sorted(stop_words)[:20])

In [None]:
# Size of the loaded stop-word collection.
len(stop_words)

In [None]:
# Collect lower-cased tokens from real user messages, skipping stop words.
# Membership is checked against whole words: while `stop_words` is the raw
# file text, `in` is a substring test and silently drops ordinary words that
# happen to occur inside it.
if isinstance(stop_words, str):
    stop_word_set = set(stop_words.split())
else:
    stop_word_set = set(stop_words)

words = []
# The loop variable is `text`, not `message`, so the module-level `message`
# list is left untouched.
for text in temp['user_message']:
    for word in text.lower().split():
        if word in stop_word_set:
            continue
        # Mentions of this specific number are folded into a single label.
        words.append('admin tagged' if word == '@917506941616' else word)

In [None]:
# Peek at the first filtered tokens only rather than dumping the whole list.
words[:20]

In [None]:
# Number of tokens left after stop-word removal.
len(words)

In [None]:
from collections import Counter
# Top 20 tokens after stop-word removal, with the same column labels as the
# earlier unfiltered frequency table.
pd.DataFrame(Counter(words).most_common(20)).rename(columns = {0: 'Word', 1: 'Count'})

## Finding emojis in user messages

# !pip install emoji

In [None]:
import emoji

In [None]:
## emoji's code -> emoji's unicode data -> number for machine and symbol for human
# Keep every character that the emoji package lists in EMOJI_DATA.
emojis = [
    char
    for text in df['user_message']
    for char in text
    if char in emoji.EMOJI_DATA
]

In [None]:
# Peek at the first collected emojis only rather than dumping the whole list.
emojis[:20]

In [None]:
# Frequency table of every emoji seen. most_common() with no argument already
# returns all entries, so the Counter only has to be built once.
emoji_counts = Counter(emojis)
pd.DataFrame(emoji_counts.most_common()).rename(columns = {0: 'Emoji', 1: 'Count'})

In [None]:
# Re-display the message frame before further date columns are added.
df

In [None]:
# Adding month number
# The numeric month sits alongside the month name so that rows can later be
# sorted chronologically rather than alphabetically.
df['month_num'] = df['date'].dt.month

In [None]:
# Show the frame with the new month_num column.
df

## Plotting timeline for user messages

In [None]:
# Messages per (year, month) bucket; month_num is carried along so the rows
# can be ordered chronologically afterwards.
timeline = (
    df.groupby(['year', 'month', 'month_num'])['user_message']
    .count()
    .reset_index()
)

In [None]:
# Inspect the per-month message counts.
timeline

In [None]:
# Build "<Month> - <Year>" labels for the x-axis. The vectorised form works
# row by row regardless of index labels, whereas looking rows up with
# timeline['month'][i] requires a clean 0..n-1 index.
time = (timeline['month'] + " - " + timeline['year'].astype(str)).tolist()

In [None]:
# Inspect the generated "<Month> - <Year>" labels.
time

In [None]:
# Attach the labels as a column so they can be used directly as the plot's x values.
timeline['time'] = time

In [None]:
# Inspect the timeline with its new label column.
timeline

In [None]:
# Order the buckets chronologically. Sorting on (year, month_num) handles
# every year present in the chat instead of only 2021 and 2022, and the fresh
# index avoids the duplicate labels that concatenating per-year slices produced.
timeline = timeline.sort_values(['year', 'month_num']).reset_index(drop=True)
timeline

In [None]:
# Monthly message volume over time.
plt.plot(timeline['time'], timeline['user_message'])
plt.xticks(rotation='vertical')
plt.xlabel('Month')
plt.ylabel('Messages')
plt.title('Messages per month')
plt.show()

## Counting user messages per calendar date

In [None]:
# Keep just the calendar date of each timestamp, used below as the grouping key.
df['only_date'] = df['date'].dt.date

In [None]:
# Show the frame with the new only_date column.
df

In [None]:
# Messages per calendar day.
timeline_date = (
    df.groupby('only_date')['user_message']
    .count()
    .reset_index()
)

In [None]:
# Inspect the per-day message counts.
timeline_date