In [None]:
import os
import geopandas.datasets
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as geopd
import sklearn.cluster as clustering
import numpy as np
from matplotlib.ticker import FormatStrFormatter, PercentFormatter
# Directory holding the assignment data. Set the ADM_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell; the default is the original path.
DATA_DIR = os.environ.get("ADM_DATA_DIR", "/Volumes/ExtraHDD2/DS_Assignments_Data/ADM_2")
os.chdir(DATA_DIR)

In [None]:
# Only the first N_POSTS_ROWS rows are loaded.
# Column 5 is parsed as datetime (presumably the `cts` timestamp used in the cells below).
# `infer_datetime_format` is intentionally not passed: it is deprecated since pandas 2.0,
# where format inference became the default behaviour, and it only triggers a warning.
N_POSTS_ROWS = 100_000
posts = pd.read_csv("instagram_posts.csv", delimiter="\t", parse_dates=[5], nrows=N_POSTS_ROWS)

In [None]:
# RQ2 [H1]

# Most common time in which users publish their posts [H2]

# This is pretty straightforward to do.
# It should be stressed that the dataset gives us no information about timezones, which is a crucial piece of information.
# The timestamps probably all refer to a single timezone, so it is difficult to talk about the "most common time": to check
# something like this, we would need each timestamp expressed in the local timezone of the account that posted.
# With this limitation in mind, let's answer the question.
# Share of posts published in each hour of the day, computed once and reused below.
# Reindexing over all 24 hours keeps hours without any post on the axis (as 0%), so the
# bars are always labelled with the hour they represent: the bar labels come from the
# index, not from the bar positions.
hourly_share = (
    posts.cts.dt.hour
    .value_counts(normalize=True)
    .reindex(range(24), fill_value=0.0)
)

# Textual summary, most active hour first, rounded to two decimals for readability
print((hourly_share.sort_values(ascending=False) * 100).round(2).astype("str").add("%"))

fig, ax = plt.subplots(figsize=(9, 6))
hourly_share.plot.bar(ax=ax, rot=0)
ax.set_xlabel("Hour")
ax.set_ylabel("Percentage")
ax.set_title("Barplot for posts frequency conditioned on time")
# The shares are fractions in [0, 1]; show them as whole percentages on the y axis
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))
plt.show()
# The most active time range seems to be the one from early/mid afternoon to 9-10 PM.

In [None]:
# Is there any post without a timestamp? (True means at least one value is missing)
posts["cts"].isna().any()

In [None]:
# Define a function that receives a list of time intervals and shows a bar plot with the number of posts in each interval

def time_interval_obs(interval_list:list[tuple[str, str]], time_df: pd.DataFrame, time_col: str|None = None) -> None:
    if not time_col:
        dtypes_series = time_df.dtypes.index[time_df.apply(pd.api.types.is_datetime64_any_dtype)]
        if dtypes_series.empty:
            raise ValueError("The dataframe has no columns for time, you need to pass a dataframe which has at least one")
        elif len(dtypes_series) != 1:
            raise TypeError("The dataframe has more than one column with datetime 64 dtype.\n\
                            You need to explicitly tell the function which one to use")
        else:
            time_col = dtypes_series[0]

    try:
        time_df = pd.Series(np.zeros(len(time_df))).set_axis(time_df[time_col])
    except KeyError:
        raise KeyError("The column you specified does not exist in the dataframe")

    n_posts_int_list = [len(time_df.between_time(*interval)) for interval in interval_list]

    plt.figure(figsize=(13, 7))
    plt.bar(["|".join(x) for x in interval_list], n_posts_int_list)
    plt.xlabel("Time Interval")
    plt.ylabel("Number of posts")
    plt.xticks(rotation = 20)
    plt.show()

    return None

In [None]:
# Every post must have a timestamp. A bare expression in the middle of a cell is
# discarded without being shown, so the check is made explicit with an assertion.
assert not posts.cts.isna().any(), "some posts have no timestamp"

# Time-of-day intervals to compare, as (start, end) pairs
test_list = [
    ("06:00:00", "10:59:59"),
    ("11:00:00", "13:59:59"),
    ("14:00:00", "16:59:59"),
    ("17:00:00", "19:59:59"),
    ("20:00:00", "23:59:59"),
    ("00:00:00", "02:59:59"),
    ("03:00:00", "05:59:59"),
]

time_interval_obs(test_list, posts)