# **Data Exploration**

## Basic Info

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly
import plotly.express as px

# Raw tweet dataset, pinned to a specific commit of the project repository.
RAW_DATASET_URL = 'https://github.com/CS-132-Group-43/Marcos-Achievements/blob/5a16757bb02aaa0571b8b2dc41c005d5de709d0f/dataset/Dataset%20-%20Group%2043.csv?raw=true'
df = pd.read_csv(RAW_DATASET_URL)

# RESOURCES
# Time series handling in pandas:
#   https://pandas.pydata.org/docs/user_guide/timeseries.html
#   https://www.shiksha.com/online-courses/articles/working-with-date-and-time-in-pandas/
# Frequency counts over two columns:
#   https://stackoverflow.com/questions/33271098/python-get-a-frequency-count-based-on-two-columns-variables-in-pandas-datafra

In [None]:
# Dimensions of the raw dataset, printed as (rows, columns).
n_rows, n_cols = df.shape
print((n_rows, n_cols))

# Summary statistics; as the last expression this is rendered as the cell output.
df.describe()

## Sample Tweets

In [None]:
# Preview the first five rows (the default for head()).
df.head()

## Finding and Filling Missing Values

In [None]:
# Boolean mask of missing cells over the first 115 rows of an independent copy.
df_copy = df.copy(deep=True)
new_copy = df_copy.iloc[:115].isna()
features = ["Timestamp", "Tweet URL", "Tweet", "Date posted"]

# Per-column missing counts can be inspected with:
#   for feature in features:
#       print(new_copy[feature].sum())
# Finding: one "Tweet" entry and one "Date posted" entry were missing.
# Those values were filled in manually; the corrected file is loaded below.

UPDATED_DATASET_URL = 'https://github.com/CS-132-Group-43/Marcos-Achievements/blob/1f4d3f0bfb0fdc5e41e3b4e50db60568c8e16091/dataset/Dataset%20-%20Group%2043%20(Updated).csv?raw=true'
df_new = pd.read_csv(UPDATED_DATASET_URL)

## Visualization

In [None]:
# Years covered by the analysis, and stray years present in the data that are
# excluded from the plots.
STUDY_YEARS = (2021, 2022)
OUT_OF_SCOPE_YEARS = [2016, 2019]

# Parse the posting dates as UTC, then express them in Philippine time so the
# Year/Month/Day buckets follow the local calendar. tz_convert returns a new
# Series rather than modifying in place, so the result must be assigned back.
# NOTE(review): utc=True treats timestamps without an offset as UTC - confirm
# that this matches how the dataset was recorded.
posted_utc = pd.to_datetime(df_new["Date posted"], dayfirst=True, utc=True)
df_new["Date posted"] = posted_utc.dt.tz_convert("Asia/Manila")
df_new["Year"] = df_new["Date posted"].dt.year
df_new["Month"] = df_new["Date posted"].dt.month
df_new["Day"] = df_new["Date posted"].dt.day

# Tweets per (year, month) and per (year, month, day).
twt_count = df_new.value_counts(["Year", "Month"])
twt_count_w_day = df_new.value_counts(["Year", "Month", "Day"])

# Give every month of the study period an entry. Only months that are absent
# from the counts are zero-filled, so an observed count is never overwritten.
# The same empty months get a zero for each real calendar day (1..28/29/30/31).
for year in STUDY_YEARS:
    for month in range(1, 13):
        if (year, month) in twt_count.index:
            continue
        twt_count[year, month] = 0
        days_in_month = pd.Period(year=year, month=month, freq="M").days_in_month
        for day in range(1, days_in_month + 1):
            twt_count_w_day[year, month, day] = 0

# Chronological order, restricted to the study period. errors="ignore" keeps
# the cell runnable when an out-of-scope year is not present in the data.
twt_count = twt_count.sort_index().drop(labels=OUT_OF_SCOPE_YEARS, errors="ignore")
fig1 = plt.figure("Line")
twt_count.plot(kind="line", marker="o", xlabel='(Year, Month)', ylabel='Tweet Count', color="maroon")
plt.title("Number of Tweets Per Month During 2021-2022")

# Yearly totals for the same period.
twt_count_year = (
    df_new.value_counts(["Year"])
    .sort_index()
    .drop(labels=OUT_OF_SCOPE_YEARS, errors="ignore")
)
fig2 = plt.figure("Bar")
twt_count_year.plot(kind="bar", xlabel='Year', ylabel='Tweet Count', color="maroon")
plt.title("Number of Tweets During 2021 vs. 2022")

In [None]:
# One (year, month) pair per month of 2021-2022, stored as floats:
# twelve 2021.0 entries followed by twelve 2022.0 entries, with months 1.0..12.0
# repeated for each year.
year_list = [float(year) for year in (2021, 2022) for _ in range(12)]
month_list = [float(month) for _ in range(2) for month in range(1, 13)]

# "YYYY-M" labels, e.g. "2021-1"; converted to real datetimes further down.
date_labels = [f"{int(year)}-{int(month)}" for year, month in zip(year_list, month_list)]

# Monthly counts are taken positionally from twt_count, which is expected to
# hold exactly one entry per month in chronological order.
data_to_plot = pd.DataFrame(
    {
        'Year': year_list,
        'Month': month_list,
        'Date': date_labels,
        'Count': list(twt_count),
    }
)

data_to_plot['Date'] = pd.to_datetime(data_to_plot['Date'])

print(data_to_plot)
print()

# Interactive line chart of the monthly tweet counts.
fig = px.line(
    data_to_plot,
    x="Date",
    y="Count",
    title='Number of Tweets Per Month During 2021-2022',
    width=1000,
)
fig.update_traces(line={"color": 'darkred'})
fig.show()
# To export the chart as an embeddable HTML <div>:
# print(plotly.offline.plot(fig, include_plotlyjs=False, output_type='div'))
# print(plotly.offline.plot(fig, include_plotlyjs=False, output_type='div'))

# **Data Modeling**


## Linear Regression

In [None]:
import numpy as np
import scipy.stats as stats
import seaborn as sns

# Analysis plan for this section, kept as a bare string literal.
# NOTE(review): being the last expression in the cell, this string is echoed
# as the cell's output; a markdown cell would be a cleaner home for it.
"""
Suggestion:
Fit segmented regression models (pre-announcement, post-announcement periods)

Apply t-test to the slopes of each model and check for significant difference
"""
# RESOURCES
# Splitting a DataFrame into segments:
# https://datagy.io/split-pandas-dataframe/
# Extracting slope and intercept from a plotly OLS trendline:
# https://stackoverflow.com/questions/59975797/python-plotly-how-to-extract-m-and-b-from-ols-line
# Independent two-sample t-test:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html

In [None]:
# Pre-announcement segment: the first ten months of the series.
pre_announce = data_to_plot.head(10)

# Scatter of monthly counts with an ordinary-least-squares trendline.
fig_1 = px.scatter(
    pre_announce,
    x="Date",
    y="Count",
    title='Number of Tweets Per Month Pre-Announcement',
    opacity=0.75,
    trendline='ols',
    trendline_color_override='darkslategray',
)
fig_1.update_traces(marker={"color": 'firebrick'})
fig_1.show()
# To export the chart as an embeddable HTML <div>:
# print(plotly.offline.plot(fig_1, include_plotlyjs=False, output_type='div'))

print()
# Fitted model behind the trendline; params[0] is the intercept, params[1] the slope.
pre_fit = px.get_trendline_results(fig_1).px_fit_results.iloc[0]
pre_intercept, pre_slope = pre_fit.params[0], pre_fit.params[1]
print(f"Slope of linear fit pre-announcement: {pre_slope}")

In [None]:
# Post-announcement segment: row 9 through the end of the series.
# NOTE(review): row 9 is also the last row of the pre-announcement segment
# (iloc[:10]), so that month is part of both fits - confirm this is intended.
post_announce = data_to_plot.iloc[9:]

# Scatter of monthly counts with an ordinary-least-squares trendline.
fig_2 = px.scatter(
    post_announce,
    x="Date",
    y="Count",
    title='Number of Tweets Per Month Post-Announcement',
    opacity=0.75,
    trendline='ols',
    trendline_color_override='darkslategray',
)
fig_2.update_traces(marker={"color": 'firebrick'})
fig_2.show()
# To export the chart as an embeddable HTML <div>:
# print(plotly.offline.plot(fig_2, include_plotlyjs=False, output_type='div'))

print()
# Fitted model behind the trendline; params[0] is the intercept, params[1] the slope.
post_fit = px.get_trendline_results(fig_2).px_fit_results.iloc[0]
post_intercept, post_slope = post_fit.params[0], post_fit.params[1]
print(f"Slope of linear fit post-announcement: {post_slope}")

In [None]:
# Overlay both segment fits on the monthly line chart.
# Work on a copy of `fig`: adding traces and shapes to the original would make
# this cell non-idempotent, stacking duplicates every time it is re-run.
fig_combined = plotly.graph_objects.Figure(fig)
fig_combined.add_traces(list(fig_1.select_traces()))
fig_combined.add_traces(list(fig_2.select_traces()))

# Dashed vertical line at row 9, the boundary between the two segments.
fig_combined.add_vline(x=data_to_plot['Date'][9], line_width=1, line_dash="dash")

# Shaded bands over three date ranges, given as (start row, end row) pairs.
HIGHLIGHTED_ROW_RANGES = [(8, 10), (11, 13), (14, 16)]
for start_row, end_row in HIGHLIGHTED_ROW_RANGES:
    fig_combined.add_vrect(
        x0=data_to_plot['Date'][start_row],
        x1=data_to_plot['Date'][end_row],
        line_width=0,
        fillcolor="red",
        opacity=0.2,
    )
fig_combined.show()

# Emit the chart as an embeddable HTML <div> (without the plotly.js bundle).
print(plotly.offline.plot(fig_combined, include_plotlyjs=False, output_type='div'))

## Hypothesis Testing

In [None]:
# Monthly tweet counts for each segment.
pre_count = pre_announce['Count']
post_count = post_announce['Count']

for segment_counts in (pre_count, post_count):
    print(segment_counts)

# Independent two-sample t-test on the monthly counts of the two segments.
t_stat, p_val = stats.ttest_ind(pre_count, post_count)
print((t_stat, p_val))

# Decision at the 5% significance level.
alpha = 0.05
verdict = "Reject the null hypothesis" if p_val < alpha else "Fail to reject the null hypothesis"
print(verdict)

In [None]:
# How to read the summaries printed below:
#   const - the constant term, i.e. the y-intercept
#   x1    - the x term, i.e. the slope, which is what we are comparing
for fit_result in (pre_fit, post_fit):
    print(fit_result.summary())

In [None]:
# Standard error of the slope (index 1) for the pre- and post-announcement fits.
for fit_result in (pre_fit, post_fit):
    print(fit_result.bse[1])

In [None]:
# Two-sided t-test for a difference between the two regression slopes.
#
# pre_slope, post_slope : fitted slopes of the two segments
# n1, n2                : number of observations behind each fit
# s1, s2                : standard errors of the slopes, from the OLS fit results
#
# NOTE(review): any figures quoted in the interpretation text should be
# refreshed from this cell's output after re-running.

alpha = 0.05

n1 = len(pre_announce)
n2 = len(post_announce)
s1 = pre_fit.bse[1]
s2 = post_fit.bse[1]

# s1 and s2 are already standard errors of the slope estimates, so the standard
# error of their difference is sqrt(s1^2 + s2^2); dividing by the sample sizes
# again would understate it and inflate the t-statistic.
slope_diff_se = np.sqrt(s1**2 + s2**2)
t_statistic = (pre_slope - post_slope) / slope_diff_se

# Each fit estimates two parameters (intercept and slope), hence n1 + n2 - 4.
# Named `dof` so the raw DataFrame `df` loaded at the top is not overwritten.
dof = n1 + n2 - 4

# Two-sided p-value; the survival function keeps precision for tiny tail areas
# where 1 - cdf would round to zero.
p_value = 2 * stats.t.sf(np.abs(t_statistic), dof)

# Two-sided critical value at the chosen significance level.
t_critical = stats.t.ppf(1 - alpha / 2, dof)

# Output results
print("T-Statistic:", t_statistic)
print("P-Value:", p_value)
print("Degrees of Freedom:", dof)
print("Critical Value (alpha=0.05):", t_critical)

# Compare p-value with alpha
if p_value <= alpha:
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")

## Data Interpretation


---
We split our dataset into two groups: the first containing tweets posted before
Marcos' announcement and the second containing those posted after. Using linear regression, we found a line of best fit for each group to visualize the general trend in the tweet count before and after the announcement. We then compared the slopes of the two regression models using a t-test and obtained a p-value of 1.47e-10, which is far below our significance level of 0.05.

This means that the difference between the slopes of the two models is statistically significant. In addition, the first model has a positive slope while the second has a negative slope.

---

