In [1]:
import pandas as pd
import os

In [2]:
path = os.getcwd()
path_trump = path + "\\data\\hashtag_donaldtrump.csv"
trump = pd.read_csv(path_trump, lineterminator="\n")
path_biden = path + "\\data\\hashtag_joebiden.csv"
biden = pd.read_csv(path_biden, lineterminator="\n")
trump["source"] = "Trump"
biden["source"] = "Biden"
# Concatenate and remove duplicates
df = pd.concat([trump, biden], ignore_index=True)
df = df.drop_duplicates()
df = df[df["country"].isin(["United States of America", "United States"])].dropna(
    subset=["state"]
)
## Lower case all tweets
df["tweet"] = df["tweet"].str.lower()

In [3]:
frequent_names_trump = [
    "Trump",
    "Donald" "Donald Trump",
    "@realDonaldTrump",
    "The Donald",
    "45",
    "Donald J. Trump",
    "DJT",
    "The Trump Administration",
    "Trumpster",
    "POTUS",
    "@POTUS",
    "Republican",
    "Republicans",
    "GOP",
    "MAGA",
    "Right Wing",
    "the Right",

]
frequent_names_biden = [
    "Biden",
    "Joe Biden",
    "@JoeBiden",
    "The Biden",
    "46",
    "Joseph R. Biden",
    "JRB",
    "The Biden Administration",
    "Bidenster",
    "Joe",
    "Joseph",
    "Joseph Biden",
    "Sleepy Joe",
    "Uncle Joe",
    "Dems",
    "Democrat",
    "Democrats",
    "Left Wing",
    "The Left",
]

##Lower case all names
frequent_names_trump = [name.lower() for name in frequent_names_trump]
frequent_names_biden = [name.lower() for name in frequent_names_biden]

# Create regex patterns from the lists of names
pattern_trump = "|".join(frequent_names_trump)
pattern_biden = "|".join(frequent_names_biden)

# Create boolean masks where tweets contain any of the frequent names
mask_trump = df["tweet"].str.contains(pattern_trump, case=False, na=False)
mask_biden = df["tweet"].str.contains(pattern_biden, case=False, na=False)

# Combine the masks to filter for tweets containing Trump names but not Biden names or vice versa
filtered_tweets = df[(mask_trump & ~mask_biden) | (mask_biden & ~mask_trump)]

# Count the number of filtered tweets
count_filtered = filtered_tweets.shape[0]

# Total number of tweets for percentage calculation
total_tweets = len(df["tweet"])

# Print the results
print(
    f"Number of tweets containing Trump names but not Biden names, or vice versa: {count_filtered}"
)
print(f"Percentage of filtered tweets: {count_filtered / total_tweets * 100:.2f}%")


filtered_tweets_trump = df[mask_trump & ~mask_biden]
filtered_tweets_biden = df[mask_biden & ~mask_trump]
count_filtered_trump = filtered_tweets_trump.shape[0]
count_filtered_biden = filtered_tweets_biden.shape[0]
print(
    f"\nNumber of tweets containing Trump names but not Biden names: {count_filtered_trump}"
)
print(
    f"Percentage of filtered tweets Trump: {count_filtered_trump / total_tweets * 100:.2f}%"
)
print(
    f"\nNumber of tweets containing Biden names but not Trump names: {count_filtered_biden}"
)
print(
    f"Percentage of filtered tweets Biden: {count_filtered_biden / total_tweets * 100:.2f}%"
)

Number of tweets containing Trump names but not Biden names, or vice versa: 199341
Percentage of filtered tweets: 59.96%

Number of tweets containing Trump names but not Biden names: 114507
Percentage of filtered tweets Trump: 34.44%

Number of tweets containing Biden names but not Trump names: 84834
Percentage of filtered tweets Biden: 25.52%


In [4]:
##show random 100 tweets in a new file  
import random
random.seed(42)
random_trump_index = random.sample(range(0, count_filtered_trump), 100)
random_biden_index = random.sample(range(0, count_filtered_biden), 100)
with open("filtered_tweets.txt" ,"w",encoding="UTF-8") as file:
    file.write("Trump Tweets\n")
    for i in random_trump_index:
        file.write(filtered_tweets_trump.iloc[i]["tweet"] + "\n")
    file.write("\nBiden Tweets\n")
    for i in random_biden_index:
        file.write(filtered_tweets_biden.iloc[i]["tweet"] + "\n")
## Looking at the tweets there are some problems
## 1. Some tweets are not in English
## 2. Some tweets are empty ## never mind, I checked and there are no empty tweets
## 3. Some hashtags are multiple words and are not separated by spaces
## 4. Some tweets does not mention either Trump or Biden


In [5]:
#combine the two dataframes
df = pd.concat([filtered_tweets_trump, filtered_tweets_biden], ignore_index=True)
# show stats for each state
state_stats = df.groupby("state").size().sort_values(ascending=False)
print("\nState stats:")
print(state_stats)
# average, standard deviation, and median of tweets per state
state_avg = df.groupby("state").size().mean()
state_std = df.groupby("state").size().std()
state_median = df.groupby("state").size().median()
print(f"\nAverage tweets per state: {state_avg:.2f}")
print(f"Standard deviation: {state_std:.2f}")
print(f"Median tweets per state: {state_median}")
# show counrty stats
country_stats = df.groupby("country").size().sort_values(ascending=False)
print("\nCountry stats:")
print(country_stats)
##print number of empty tweets
number_empty_tweets = df["tweet"].str.len() == 0
print("\nNumber of empty tweets:")
print(number_empty_tweets.sum())


State stats:
state
California                  33737
New York                    25571
Florida                     18228
Texas                       18187
District of Columbia         9755
Illinois                     7652
Pennsylvania                 7379
New Jersey                   5441
Massachusetts                5032
Ohio                         4639
Colorado                     4164
North Carolina               4021
Arizona                      3711
Michigan                     3673
Georgia                      3497
Oregon                       3314
Washington                   3089
Virginia                     2924
Tennessee                    2730
Maryland                     2724
Nevada                       2675
Missouri                     2472
Minnesota                    2159
Wisconsin                    1918
Indiana                      1682
Kentucky                     1615
South Carolina               1318
Louisiana                    1291
Connecticut                 

In [6]:
##new method  GPT code
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point


df_all = pd.concat([trump, biden], ignore_index=True)
df_all = df_all.drop_duplicates()
# Convert latitude and longitude to a GeoDataFrame with Points
geometry = [Point(xy) for xy in zip(df_all['long'], df_all['lat'])]
gdf_points = gpd.GeoDataFrame(df_all, geometry=geometry, crs="EPSG:4326")  # Set CRS to WGS84

# Load the boundary shapefile (e.g., countries or states boundaries)
# Ensure that the shapefile CRS matches the points CRS (EPSG:4326)
shapefile_path = "map/ne_10m_admin_1_states_provinces.shp"
gdf_boundaries = gpd.read_file(shapefile_path)

# Spatial join: Assigns each point to a boundary region (state/country)
gdf_result = gpd.sjoin(gdf_points, gdf_boundaries, how="left", predicate="within")

# Extract desired information, such as 'admin' for country and 'name' for state
gdf_result['country_calculated'] = gdf_result['admin']   # Column name may vary based on shapefile
gdf_result['state_calculated'] = gdf_result['name']      # Column name may vary based on shapefile

df_all["calc_country"] = gdf_result['country_calculated']
df_all["calc_state"] = gdf_result['state_calculated']


In [9]:
df_all["tweet"] = df_all["tweet"].str.lower()
df_all = df_all[df_all["country"].isin(["United States of America"])].dropna()
##Lower case all names
frequent_names_trump = [name.lower() for name in frequent_names_trump]
frequent_names_biden = [name.lower() for name in frequent_names_biden]

# Create regex patterns from the lists of names
pattern_trump = "|".join(frequent_names_trump)
pattern_biden = "|".join(frequent_names_biden)

# Create boolean masks where tweets contain any of the frequent names
mask_trump = df_all["tweet"].str.contains(pattern_trump, case=False, na=False)
mask_biden = df_all["tweet"].str.contains(pattern_biden, case=False, na=False)

# Combine the masks to filter for tweets containing Trump names but not Biden names or vice versa
filtered_tweets = df_all[(mask_trump & ~mask_biden) | (mask_biden & ~mask_trump)]


# Count the number of filtered tweets
count_filtered = filtered_tweets.shape[0]

# Total number of tweets for percentage calculation
total_tweets = len(df_all["tweet"])


print(
    f"Number of tweets containing Trump names but not Biden names, or vice versa: {count_filtered}"
)
print(f"Percentage of filtered tweets: {count_filtered / total_tweets * 100:.2f}%")


filtered_tweets_trump = df_all[mask_trump & ~mask_biden]
filtered_tweets_biden = df_all[mask_biden & ~mask_trump]
count_filtered_trump = filtered_tweets_trump.shape[0]
count_filtered_biden = filtered_tweets_biden.shape[0]
print(
    f"\nNumber of tweets containing Trump names but not Biden names: {count_filtered_trump}"
)
print(
    f"Percentage of filtered tweets Trump: {count_filtered_trump / total_tweets * 100:.2f}%"
)
print(
    f"\nNumber of tweets containing Biden names but not Trump names: {count_filtered_biden}"
)
print(
    f"Percentage of filtered tweets Biden: {count_filtered_biden / total_tweets * 100:.2f}%"
)

Number of tweets containing Trump names but not Biden names, or vice versa: 103356
Percentage of filtered tweets: 58.73%

Number of tweets containing Trump names but not Biden names: 58286
Percentage of filtered tweets Trump: 33.12%

Number of tweets containing Biden names but not Trump names: 45070
Percentage of filtered tweets Biden: 25.61%
