# Chatting users

Chatting users are defined as users who use many of their tweets to chat with other users. These chatting tweets can be identified by their contents, for example pronouns like *ik* (*I*) and *jij* (*you*) and specific verb forms like *ben* (*am*).

In [57]:
import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os
import pandas as pd
import re
from IPython.display import clear_output

In [5]:
# Directory with the tweet files; each file is read with pd.read_csv and is
# expected to provide the columns "id_str", "user" and "text".
DATADIR = "../data/text/"
# Whole-word pattern of Dutch first/second-person pronouns and verb forms.
# It is applied case-insensitively; a tweet that matches counts as a
# "chatting" tweet for its author.
QUERY_USERS = r'\b(ik|mij|me|jij|je|jou|jouw|jullie|heb|ben)\b'

In [33]:
def squeal(text=None):
    """Replace the cell's current output with ``text``.

    The output area is cleared with ``wait=True``, i.e. the clear is deferred
    until new output is available, which avoids flicker when this is called
    repeatedly as a progress indicator.

    Args:
        text: Message to print after clearing. When ``None`` (the default)
            nothing is printed.
    """
    clear_output(wait=True)
    if text is not None:
        print(text)

## 1. Select users

In [38]:
# Scan every file whose name starts with "202103" and tally, per user, the
# total number of tweets (totals) and the number of tweets whose text matches
# QUERY_USERS (matches).
files = sorted(os.listdir(DATADIR))
pattern = "^202103"
totals = {}
matches = {}
for file_name in files:
    if not re.search(pattern, file_name):
        continue
    squeal(file_name)
    df = pd.read_csv(os.path.join(DATADIR, file_name), index_col="id_str")
    user_counts = df["user"].value_counts()
    for user, count in user_counts.items():
        totals[user] = totals.get(user, 0) + count
    # na=False: a tweet with missing text counts as "no match" instead of
    # producing NaN in the mask, which boolean indexing would reject.
    matching_rows = df["text"].str.contains(QUERY_USERS, flags=re.IGNORECASE, na=False)
    matching_counts = df.loc[matching_rows, "user"].value_counts()
    for user, count in matching_counts.items():
        matches[user] = matches.get(user, 0) + count

20210331-23.out.gz


In [42]:
# A user is a "chatting user" when at least half of their tweets matched the
# chat pattern and they have more than 10 matching tweets in total.
chatting_users = {
    user: True
    for user, match_count in matches.items()
    if match_count >= 0.5 * totals[user] and match_count > 10
}

In [43]:
# Number of users that satisfied the chatting-user criteria.
len(chatting_users)

39112

## 2. Count tweets

In [54]:
# Whole-word topic pattern ("blij" is Dutch for "happy"/"glad"); applied
# case-insensitively to the tweet text when counting tweets.
QUERY_TOPIC = r'\b(blij)\b'

In [56]:
# For every selected file, record the file's time stamp once for each tweet
# that matches QUERY_TOPIC and was written by a chatting user.
dates_found = []
for file_name in files:
    if not re.search(pattern, file_name):
        continue
    squeal(file_name)
    # Escape the dots and anchor at the end so that only the literal
    # ".out.gz" suffix is stripped from the file name.
    date = re.sub(r"\.out\.gz$", "", file_name)
    df = pd.read_csv(os.path.join(DATADIR, file_name), index_col="id_str")
    # na=False: a tweet with missing text counts as "no match" instead of
    # producing NaN in the mask, which boolean indexing would reject.
    matching_rows = df["text"].str.contains(QUERY_TOPIC, flags=re.IGNORECASE, na=False)
    # Only the user column is needed, so iterate over it directly instead of
    # materialising a Series per row with iterrows().
    hits = sum(user in chatting_users for user in df.loc[matching_rows, "user"])
    dates_found.extend([date] * hits)

20210331-23.out.gz


## 3. Make graph

In [64]:
# Collapse the per-file stamps in dates_found into per-day counts by dropping
# the trailing "-NN" part of each stamp; the resulting dict is displayed.
dates = {}
for hour in dates_found:
    date = re.sub("-[0-9][0-9]$", "", hour)
    dates[date] = dates.get(date, 0) + 1
dates

{'20210301': 993,
 '20210302': 946,
 '20210303': 1038,
 '20210304': 1032,
 '20210305': 1087,
 '20210306': 1009,
 '20210307': 1084,
 '20210308': 1012,
 '20210309': 928,
 '20210310': 955,
 '20210311': 1102,
 '20210312': 1077,
 '20210313': 966,
 '20210314': 994,
 '20210315': 1127,
 '20210316': 1167,
 '20210317': 1816,
 '20210318': 1710,
 '20210319': 1159,
 '20210320': 1076,
 '20210321': 983,
 '20210322': 945,
 '20210323': 1041,
 '20210324': 1267,
 '20210325': 992,
 '20210326': 996,
 '20210327': 943,
 '20210328': 864,
 '20210329': 1235,
 '20210330': 1102,
 '20210331': 1083}