# Docs

This Jupyter notebook extracts Huawei-related tweets for four countries (AUS, UK, CAN and USA), with the goal of identifying which countries had the most similar conversations in the six months prior to their respective bans. The process is as follows:

1. It cleans data over multiple iterations of identifying irrelevant terms
2. Extracts tweets by country using a set method (identifying top five countries and iterating until done)
3. Uses a country-specific time window to select the six months of tweets prior to each country's ban
4. Processes the topic model for each of the countries individually with a unique seed to define how many topics every country will be restricted to (30)
5. Recalculates topic models with the same seed
6. Creates a DF with similarity scores based on each topic model's embeddings
7. Creates this DF for each country
8. And, finally, identifies which countries' similarity scores are highest, ultimately identifying which countries had the most similar conversations.

In [None]:
# Print the version of the Python interpreter running this notebook.
from platform import python_version
print(python_version())

In [None]:
# Import relevant packages. 
import pandas as pd
from bertopic import BERTopic
from ipywidgets import FloatProgress
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import pickle
from dateutil.relativedelta import relativedelta
import datetime
from datetime import date
import numpy as np
import os
import glob
from umap import UMAP
from nltk.corpus import stopwords
stop = stopwords.words('english')
from sklearn.metrics.pairwise import cosine_similarity
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer

# Notebook display settings: show up to 500 rows and every column, never
# truncate cell text, and render floats with four decimal places.
pd.options.display.max_rows = 500
pd.options.display.float_format = '{:.4f}'.format
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None

In [None]:
# Import data.
# The CSV is read from the current working directory. A bare relative file
# name resolves the same way on every operating system, whereas the former
# Windows-style ".\" prefix is treated as part of the file name on Linux/macOS.
combined = pd.read_csv("huawei-v2.csv")

In [None]:
# Number of rows loaded (displayed as the cell output).
len(combined)

In [None]:
# Quality check: keep one row per tweet_id, printing the row count before and after.
print(len(combined))
combined = combined.drop_duplicates(subset=['tweet_id'])
print(len(combined))

# Drop tweets with irrelevant terms, printing the row count before and after.
# NOTE(review): the terms are matched as case-insensitive substrings, so short
# ones such as "win", "gb", "ios" and "mate" also match inside longer words
# (e.g. "window", "rugby", "scenarios", "ultimately") -- confirm this is intended.
# The trailing .copy() detaches the filtered frame from the one it was sliced
# from, so the column assignments made afterwards act on a real DataFrame and
# do not raise pandas' SettingWithCopyWarning.
print(len(combined))
combined = combined[~combined['cleaned_text'].str.contains("giveaway|foldable|p30|mate|p20|charging|apple|ios|iphone|samsung|galaxy|win|smartphone|smartwatch|gsma|android|tablet|nova|cloud|p40|camera|review|router|battery|wallet|dlink|gb|modem|notifications", na=False, case=False)].copy()
print(len(combined))

# Remove stop words.
# combined['cleaned_text'] = combined['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Create month timestamp: the first seven characters of created_at
# (presumably "YYYY-MM" -- assumes created_at holds ISO-formatted strings).
combined["month"] = combined['created_at'].str[:7]

# Cleaning function. Add as required.
def cleaner(pre):
    """Tidy the ``cleaned_text`` column of ``pre`` in place.

    The substitutions are applied in order: HTML ampersands become "and",
    newlines become spaces, the standalone retweet marker "RT" is dropped,
    URLs and bare ``t.co/...`` links are removed, and leftover fragments of
    escaped markup ("=andgt;", "andgt;") and of URL schemes are stripped.

    Every rule states explicitly whether it is a regular expression, because
    the default of ``Series.str.replace(regex=...)`` differs between pandas
    versions; with the newer default the URL pattern would otherwise be
    treated as literal text and match nothing.

    Args:
        pre: DataFrame with a string column named ``cleaned_text``. The
            column is overwritten on this same object.

    Returns:
        None. The frame is modified in place.
    """
    # (pattern, replacement, options), applied top to bottom.
    rules = (
        # Escaped and bare ampersands.
        ('&amp;', 'and', {'regex': False}),
        ('&', 'and', {'regex': False}),
        ('\n', ' ', {'regex': False}),
        # Only the standalone marker, so words such as "SMART" stay intact.
        (r'\bRT\b', '', {'regex': True}),
        # Full URLs, plus scheme-less t.co links. The dot is escaped and the
        # link must start at a word boundary so ordinary words ("tycoon",
        # "reddit.com") are not eaten.
        (r'http\S+|\bt\.co/\S+', '', {'regex': True, 'case': False}),
        # Leftover scheme fragments and the remains of "&gt;" after the
        # ampersand rule above. These patterns hold no regex metacharacters;
        # regex mode is used only so that case=False is honoured.
        ('s://', '', {'regex': True, 'case': False}),
        ('=andgt;', ' ', {'regex': True, 'case': False}),
        ('andgt;', ' ', {'regex': True, 'case': False}),
        # A bare scheme word with nothing attached. "https" must be tried
        # together with "http", otherwise a stray "s" is left behind.
        (r'https?', '', {'regex': True}),
    )

    text = pre['cleaned_text']
    for pattern, replacement, options in rules:
        text = text.str.replace(pattern, replacement, **options)
    pre['cleaned_text'] = text

# Apply the text clean-up to the combined frame (rewrites its cleaned_text column).
cleaner(combined)

In [None]:
# Extract tweets from locations.
# A tweet belongs to a country when its author's profile location contains any
# of that country's place names (case-insensitive substring match; rows with a
# missing location never match).
# NOTE(review): the matches are not exclusive -- a location such as
# "London, Ontario" satisfies both the UK and the Canada pattern, so the same
# tweet can end up in more than one frame.
def _tweets_located_in(place_names):
    """Return the rows of ``combined`` whose profile_loc matches ``place_names``."""
    located = combined['profile_loc'].str.contains(place_names, na=False, case=False)
    return combined[located]


aus = _tweets_located_in("australia|sydney|canberra|melbourne|brisbane")
uk = _tweets_located_in("united kingdom|england|london")
nz = _tweets_located_in("new zealand|auckland|wellington|christchurch")
usa = _tweets_located_in("usa|united states|america|washington|california|new york|seattle")
can = _tweets_located_in("canada|ontario|toronto|british columbia|ottawa")

# Row count per country, in the order AUS, UK, NZ, USA, CAN.
for _frame in (aus, uk, nz, usa, can):
    print(len(_frame))

In [None]:
# Fixed seed so the UMAP dimensionality reduction gives the same result on
# every run, keeping the topic models reproducible and consistent with others.
# NOTE(review): only random_state is set, so every other UMAP setting is UMAP's
# own default -- confirm this matches the configuration the analysis intends.
RANDOM_STATE = 42
umap_model = UMAP(random_state=RANDOM_STATE)

# Count vectorizer with English stop words removed, so that stop words are
# dropped from the topic keywords after clustering.
vectorizer_model = CountVectorizer(stop_words="english")

In [None]:
# # Restrict to six months before Aus's ban (2018-01-11) and the day of Canada's ban (2022-05-14) as requested by Jon.

# nz["created_at"] = pd.to_datetime(nz["created_at"])
# nz['date'] = nz['created_at'].dt.date
# nz = nz.sort_values("date")
# start = date.fromisoformat('2018-01-10')
# end = date.fromisoformat('2022-05-15')
# print(len(nz))
# nz = nz[(nz['date'] >= start) & (nz['date'] <= end)]
# print(len(nz))

# # Run topic model.
# nz_data = nz.cleaned_text.to_list()
# nz_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics=30, umap_model=umap_model, top_n_words=20, vectorizer_model=vectorizer_model)
# nz_topics, nz_probs = nz_model.fit_transform(nz_data)

# # Run code to create CSV with topics. 
# nz = concat_topics('NZ', nz, nz_model)
# nz = nz.drop(columns=['Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y', 'Unnamed: 0_x.1', 'Unnamed: 0_y.1', 'edit_history_tweet_ids_y', 'month'], errors='ignore')
# nz.to_csv("nz-tweets-with-country-and-topics.csv")

In [None]:
# Function to select subset of data for each country based on specific date.
def select_subset(date_of_ban, country):
    """Return the tweets from the six months up to ``date_of_ban``, binned by month.

    The window runs from six calendar months before ``date_of_ban`` up to and
    including ``date_of_ban``. It is cut into six one-month bins whose edges
    are counted back from ``date_of_ban``, so the bins do not depend on which
    days happen to have tweets, the last bin always ends on ``date_of_ban``,
    and an empty window yields an empty frame instead of an error.

    Prints the number of rows received and the parsed ban date.

    Args:
        date_of_ban: Last day of the window, as an ISO ``YYYY-MM-DD`` string.
        country: DataFrame with a ``created_at`` column that
            ``pd.to_datetime`` can parse. It is not modified.

    Returns:
        A new DataFrame with only the rows inside the window, ``created_at``
        converted to datetimes, and three added columns:
        ``date`` (calendar date of ``created_at``), ``m_cat`` ('m1'..'m6',
        'm1' being the earliest month) and ``month_cat`` (a placeholder date
        string per bin).
    """
    window_months = 6

    # Format date.
    ban_day = date.fromisoformat(date_of_ban)

    # Work on a copy so the caller's frame (often a slice of a larger frame)
    # is neither mutated nor written to through a view.
    country = country.copy()
    country["created_at"] = pd.to_datetime(country["created_at"])
    country['date'] = country['created_at'].dt.date
    print(len(country))

    # Bin edges as plain dates: edges[0] is six months before the ban day and
    # edges[-1] is the ban day itself.
    ban_ts = pd.Timestamp(ban_day)
    edges = [
        (ban_ts - pd.DateOffset(months=window_months - step)).date()
        for step in range(window_months)
    ]
    edges.append(ban_day)

    # Only includes 6 months before ban in country.
    in_window = (country['date'] >= edges[0]) & (country['date'] <= edges[-1])
    country = country[in_window].copy()
    print(ban_day)

    # One condition per month. A tweet dated exactly on an inner edge belongs
    # to the earlier month, so only the first bin includes its lower edge.
    dates = country['date']
    conditions = [(dates >= edges[0]) & (dates <= edges[1])]
    conditions += [
        (dates > lower) & (dates <= upper)
        for lower, upper in zip(edges[1:-1], edges[2:])
    ]

    # Based sequentially on the conditions above.
    choices_cat = ['m1', 'm2', 'm3', 'm4', 'm5', 'm6']

    # Placeholder dates so BERTopic's topics-over-time accepts the bins; they
    # are labels only, not the real calendar months.
    choices_month = ['2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-05-01', '2020-06-01']

    # Categorise.
    country['m_cat'] = np.select(conditions, choices_cat, default=None)
    country['month_cat'] = np.select(conditions, choices_month, default=None)
    return country

In [None]:
# Window end per country: the day before that country's ban.
BAN_EVE = {
    'aus': '2018-07-10',
    'usa': '2019-05-14',
    'uk': '2020-07-13',
    'can': '2022-05-13',
}

aus = select_subset(BAN_EVE['aus'], aus)
usa = select_subset(BAN_EVE['usa'], usa)
uk = select_subset(BAN_EVE['uk'], uk)
can = select_subset(BAN_EVE['can'], can)

In [None]:
# Row count per country, in the order AUS, UK, USA, CAN.
for _frame in (aus, uk, usa, can):
    print(len(_frame))

In [None]:
# Run BERTopic for each country.
def _fit_country_topics(tweets):
    """Fit a BERTopic model on one country's tweets.

    Each call builds its own UMAP and CountVectorizer. BERTopic fits the
    objects it is given in place, so handing one shared instance to several
    models would leave every model pointing at components fitted on whichever
    country ran last.

    The n-gram range is set on the CountVectorizer itself because BERTopic
    ignores its own ``n_gram_range`` argument when a custom vectorizer is
    supplied.

    Args:
        tweets: DataFrame with a ``cleaned_text`` column.

    Returns:
        Tuple ``(documents, model, topics, probabilities)``: the list of
        texts that was fitted, the fitted model, and the two values returned
        by ``fit_transform``.
    """
    ngram_range = (1, 3)
    documents = tweets.cleaned_text.to_list()
    model = BERTopic(
        verbose=True,
        n_gram_range=ngram_range,
        min_topic_size=10,
        nr_topics=30,
        # Seeded so repeated runs give the same reduction.
        umap_model=UMAP(random_state=42),
        top_n_words=20,
        # Drops English stop words from the topic keywords.
        vectorizer_model=CountVectorizer(stop_words="english", ngram_range=ngram_range),
    )
    topics, probabilities = model.fit_transform(documents)
    return documents, model, topics, probabilities


# Aus
aus_data, aus_model, aus_topics, aus_probs = _fit_country_topics(aus)

# Can
can_data, can_model, can_topics, can_probs = _fit_country_topics(can)

# USA
usa_data, usa_model, usa_topics, usa_probs = _fit_country_topics(usa)

# UK
uk_data, uk_model, uk_topics, uk_probs = _fit_country_topics(uk)

In [None]:
# # Topics over time.
# # aus_timestamps = aus.month_cat.to_list()
# # aus_topics_over_time = aus_model.topics_over_time(aus_data, aus_topics, aus_timestamps, datetime_format="%Y-%m-%d")
# # aus_model.visualize_topics_over_time(aus_topics_over_time, top_n_topics=20)

# # Note that when using CountVectorizer, some documents/topics will be empty over time, causing the aus_model viz to throw an error. 
# aus_timestamps = aus.month_cat.to_list()
# aus_model.topics_over_time(aus_data, aus_timestamps)
# aus_model.visualize_topics_over_time(aus_topics_over_time)

In [None]:
def concat_topics(name, country, model):
    """Attach topic assignments and topic descriptions to a country's tweets.

    Args:
        name: Country label written to the ``country_cat`` column, e.g. 'AUS'.
        country: DataFrame of tweets, in the same row order as the documents
            the model was fitted on. It is not modified.
        model: Fitted topic model exposing ``topics_`` (one topic number per
            row), ``get_topics()`` (mapping of topic number to keywords) and
            ``get_topic_info()`` (DataFrame with a ``Topic`` column).

    Returns:
        A new DataFrame with the original columns plus ``topic``,
        ``country_cat``, the columns of ``get_topic_info()`` and ``keywords``.
        Rows keep their order; the index is renumbered by the merge.
    """
    # Add the topic number (e.g. -1) and the country label on a new frame
    # instead of writing into the caller's frame, which may be a slice.
    labelled = country.assign(topic=model.topics_, country_cat=name)

    # One row per topic: its number and the keywords describing it.
    keywords = pd.DataFrame(list(model.get_topics().items()), columns=['topic', 'keywords'])

    # One row per topic: the model's own summary, keyed like the frames above.
    topic_info = model.get_topic_info().rename(columns={'Topic': 'topic'})

    # Left merges keep every tweet even if a topic has no description row.
    labelled = pd.merge(labelled, topic_info, how='left', on='topic')
    labelled = pd.merge(labelled, keywords, how='left', on='topic')
    return labelled

In [None]:
# Attach each country's topic numbers, topic summaries and keywords to its tweets.
aus = concat_topics(name='AUS', country=aus, model=aus_model)
usa = concat_topics(name='USA', country=usa, model=usa_model)
can = concat_topics(name='CAN', country=can, model=can_model)
uk = concat_topics(name='UK', country=uk, model=uk_model)

In [None]:
# Columns to remove from every country frame; errors='ignore' skips any that
# a given frame does not have.
_LEFTOVER_COLUMNS = [
    'Unnamed: 0.1',
    'Unnamed: 0_x',
    'Unnamed: 0_y',
    'Unnamed: 0_x.1',
    'Unnamed: 0_y.1',
    'edit_history_tweet_ids_y',
    'month',
]

aus = aus.drop(columns=_LEFTOVER_COLUMNS, errors='ignore')
can = can.drop(columns=_LEFTOVER_COLUMNS, errors='ignore')
uk = uk.drop(columns=_LEFTOVER_COLUMNS, errors='ignore')
usa = usa.drop(columns=_LEFTOVER_COLUMNS, errors='ignore')

In [None]:
# Stack the four country frames (AUS, CAN, UK, USA) and show the total row count.
_country_frames = [aus, can, uk, usa]
combined = pd.concat(_country_frames)
len(combined)

In [None]:
# Write the combined frame to CSV in the working directory.
# NOTE(review): the DataFrame index is written as an unnamed first column,
# which reads back as "Unnamed: 0" -- confirm whether index=False is wanted.
combined.to_csv("tweets-with-country-and-topics.csv")