# Docs

This jupyter notebook is used to extract Huawei-related tweets for four different countries, AUS, UK, CAN and USA, with the goal of identifying which countries had the most similar conversations six months prior to their respective bans. Process below:

1. It cleans data over multiple iterations of identifying irrelevant terms
2. Extracts tweets by country using a set method (identifying top five countries and iterating until done)
3. Uses unique windows of times to identify six months of tweets prior to the ban
4. Processes the topic model for each of the countries individually with a unique seed to define how many topics every country will be restricted to (30)
5. Recalculates topic models with the same seed
6. Creates a DF with similarity scores based on each topic models embeddings 
7. Creates this DF for each country
7. And, finally, identifies which countries similarity scores are highest, utlimately identifying which countries had the most similar conversations. 

In [25]:
from platform import python_version
print(python_version())

3.7.11


In [28]:
# Import relevant packages. 
import pandas as pd
from bertopic import BERTopic
from ipywidgets import FloatProgress
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import pickle
from dateutil.relativedelta import relativedelta
import datetime
from datetime import date
import numpy as np
import os
import glob
from umap import UMAP
from nltk.corpus import stopwords
stop = stopwords.words('english')
from sklearn.metrics.pairwise import cosine_similarity
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer

# Set relevant options.
pd.set_option('display.max_rows', 500)
pd.options.display.float_format = '{:.4f}'.format
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [29]:
# Import data.
combined = pd.read_csv(r".\huawei-v2.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [30]:
len(combined)

5117742

In [31]:
# Quality check.
print(len(combined))
combined = combined.drop_duplicates(subset=['tweet_id'])
print(len(combined))

# Drop tweets with irrelevant terms.
print(len(combined))
combined = combined[~combined['cleaned_text'].str.contains("giveaway|foldable|p30|mate|p20|charging|apple|ios|iphone|samsung|galaxy|win|smartphone|smartwatch|gsma|android|tablet|nova|cloud|p40|camera|review|router|battery|wallet|dlink|gb|modem|notifications", na=False, case=False)]
print(len(combined))

# Remove stop words.
# combined['cleaned_text'] = combined['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Create month timestamp.
combined["month"] = combined['created_at'].str[:7]

# Cleaning function. Add as required.
def cleaner (pre):
    pre['cleaned_text'] = pre['cleaned_text'].str.replace('&amp;','and')
    pre['cleaned_text'] = pre['cleaned_text'].str.replace('&','and')
    pre['cleaned_text'] = pre['cleaned_text'].str.replace('\n',' ')
    pre['cleaned_text'] = pre['cleaned_text'].str.replace('RT','')
    pre['cleaned_text'] = pre['cleaned_text'].str.replace('http\S+|t.co\S+', '', case=False)
    pre['cleaned_text'] = pre['cleaned_text'].str.replace('s://', '', case=False)
    pre['cleaned_text'] = pre['cleaned_text'].str.replace('=andgt;', ' ', case=False)
    pre['cleaned_text'] = pre['cleaned_text'].str.replace('andgt;', ' ', case=False)
    pre['cleaned_text'] = pre['cleaned_text'].str.replace('http','')
    pre['cleaned_text'] = pre['cleaned_text'].str.replace('https','')

cleaner(combined)

5117742
5117742
5117742
2985155


In [64]:
# Extract tweets from locations.
aus = combined[combined['profile_loc'].str.contains("australia|sydney|canberra|melbourne|brisbane", na=False, case=False)]
uk = combined[combined['profile_loc'].str.contains("united kingdom|england|london", na=False, case=False)]
nz = combined[combined['profile_loc'].str.contains("new zealand|auckland|wellington|christchurch", na=False, case=False)]
usa = combined[combined['profile_loc'].str.contains("usa|united states|america|washington|california|new york|seattle", na=False, case=False)]
can = combined[combined['profile_loc'].str.contains("canada|ontario|toronto|british columbia|ottawa", na=False, case=False)]

print(len(aus))
print(len(uk))
print(len(nz))
print(len(usa))
print(len(can))

36034
149668
7301
210865
108971


In [65]:
# Restrict to six months before Aus's ban (2018-01-11) and the day of Canada's ban (2022-05-14) as requested by Jon.
nz["created_at"] = pd.to_datetime(nz["created_at"])
nz['date'] = nz['created_at'].dt.date
nz = nz.sort_values("date")
start = date.fromisoformat('2018-01-10')
end = date.fromisoformat('2022-05-15')
print(len(nz))
nz = nz[(nz['date'] >= start) & (nz['date'] <= end)]
print(len(nz))

# To make things reproducible and consistent with others.
umap_model = UMAP(random_state=42)

# To remove stop words after clustering.
vectorizer_model = CountVectorizer(stop_words="english")

# Run topic model.
nz_data = nz.cleaned_text.to_list()
nz_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics=30, umap_model=umap_model, top_n_words=20, vectorizer_model=vectorizer_model)
nz_topics, nz_probs = nz_model.fit_transform(nz_data)

# Run code to create CSV with topics. 
nz = concat_topics('NZ', nz, nz_model)
nz = nz.drop(columns=['Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y', 'Unnamed: 0_x.1', 'Unnamed: 0_y.1', 'edit_history_tweet_ids_y', 'month'], errors='ignore')
nz.to_csv("nz-tweets-with-country-and-topics.csv")

7301
7199


Batches:   0%|          | 0/225 [00:00<?, ?it/s]

2022-11-18 12:42:08,250 - BERTopic - Transformed documents to Embeddings
2022-11-18 12:42:27,154 - BERTopic - Reduced dimensionality
2022-11-18 12:42:27,561 - BERTopic - Clustered reduced embeddings
2022-11-18 12:42:35,686 - BERTopic - Reduced number of topics from 118 to 31


In [782]:
# Edit date of ban and country.

d = date.fromisoformat('2018-07-11') #aus
# d = date.fromisoformat('2019-05-15') #usa
# d = date.fromisoformat('2020-07-14') #uk
# d = date.fromisoformat('2022-05-14') #can

country = aus

# Used for one month timesteps.
month = relativedelta(months=1)

# Formats date.
country["created_at"] = pd.to_datetime(country["created_at"])
country['date'] = country['created_at'].dt.date
print(len(country))

# Only includes 6 months before ban in country.
country = country[(country['date'] >= d-month*6) & (country['date'] <= d)]
print(d-month*6)
print(d)
print(len(country))

# Setting multiple conditions with one month intervals. 
conditions = [
    (country.date >= country.date.min()) & (country.date <= country.date.min() + month),
    (country.date >= country.date.min() + month) & (country.date <= country.date.min() + month*2),
    (country.date >= country.date.min() + month*2) & (country.date <= country.date.min() + month*3),
    (country.date >= country.date.min() + month*3) & (country.date <= country.date.min() + month*4),
    (country.date >= country.date.min() + month*4) & (country.date <= country.date.min() + month*5),
    (country.date >= country.date.min() + month*5) & (country.date <= country.date.min() + month*6)]

# Based sequentially on the choices above. 
choices_cat = ['m1','m2','m3','m4','m5','m6']

# Using month so BertTopic runs, although incorrect.
choices_month = ['2020-01-01','2020-02-01','2020-03-01','2020-04-01','2020-05-01','2020-06-01']

# Categorise.
country['m_cat'] = np.select(conditions, choices_cat, default=None)
country['month_cat'] = np.select(conditions, choices_month, default=None)
# country["month_cat"] = pd.to_numeric(country["month_cat"])

# Edit country name. 
aus = country
print(len(country))
country.head()

2598
2018-01-11
2018-07-11
2598
2598


Unnamed: 0.1,Unnamed: 0,created_at,source,entities,author_id,referenced_tweets,possibly_sensitive,conversation_id,tweet_id,reply_settings,in_reply_to_user_id,attachments,text,geo_id,retweets,replies,likes,quotes,tweet_loc_short,place_type,tweet_loc_long,country,full_text,tweet_loc,verified,name,profile_desc,username,protected,profile_created,profile_loc,url,withheld,followers,following,total_tweets,cleaned_text,date,m_cat,month_cat,topic,country_cat,Count,Name,keywords,test
0,4014315,2018-07-11 23:32:21+00:00,Twitter Web Client,"{'urls': [{'start': 67, 'end': 90, 'url': 'https://t.co/k3a4hBcgg3', 'expanded_url': 'https://www.canberratimes.com.au/politics/western-australia/former-wa-minister-free-huawei-phones-kept-coming-up-in-chinese-20180711-p4zqxq.html', 'display_url': 'canberratimes.com.au/politics/weste…'}], 'annotations': [{'start': 25, 'end': 37, 'probability': 0.437, 'type': 'Product', 'normalized_text': 'Huawei phones'}], 'mentions': [{'start': 95, 'end': 109, 'username': 'canberratimes', 'id': '17125730'}]}",55113432,,False,1017189884711264256,1017189884711264256,everyone,,,Former WA minister: Free Huawei phones 'kept coming up in Chinese' https://t.co/k3a4hBcgg3 via @canberratimes,,0.0,0.0,0.0,0.0,,,,,,,False,lynlinking,"Sharing Political News & Current Affairs\nArticles by Independent Authors, Blogs, \nIndependent Newspapers & others \nSharing articles by #WgarNews \n I follow Back",lynlinking,False,2009-07-09 02:27:53+00:00,Australia,,,32183.0,35291.0,378999.0,Former WA minister: Free Huawei phones 'kep up in Chinese' via @canberratimes,2018-07-11,m6,2020-06-01,4,AUS,98,4_data_facebook_user_shared,"[(data, 0.14619563497907975), (facebook, 0.12118788462092657), (user, 0.09347864633439568), (shared, 0.06658371455174689), (access, 0.06369123086882364), (zuckerberg, 0.05514332288010932), (mark, 0.05514332288010932), (gave, 0.04887174525279423), (chinese, 0.04850362111108616), (device, 0.04368261948577905), (facebooks, 0.04082674011860171), (australian, 0.04050222416759447), (users, 0.03938354762877259), (threat, 0.0385248089790342), (prescientinfo, 0.03542518238106668), (committee, 0.033395840276195306), (makers, 0.03299446060346893), (says, 0.031732796796069565), (privacy, 0.02834014590485334), (social, 0.02518607605403095)]",2020-06-01
1,4014316,2018-07-11 23:30:39+00:00,Twitter Web Client,"{'urls': [{'start': 63, 'end': 86, 'url': 'https://t.co/2S6Z6BtnBk', 'expanded_url': 'https://cnet.co/2m7SaGa', 'display_url': 'cnet.co/2m7SaGa'}], 'annotations': [{'start': 0, 'end': 8, 'probability': 0.9487000000000001, 'type': 'Place', 'normalized_text': 'Australia'}, {'start': 17, 'end': 22, 'probability': 0.8402000000000001, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 29, 'end': 30, 'probability': 0.7378, 'type': 'Organization', 'normalized_text': '5G'}], 'mentions': [{'start': 91, 'end': 96, 'username': 'CNET', 'id': '30261067'}]}",206445861,,False,1017189455319330816,1017189455319330816,everyone,,,Australia to ban Huawei from 5G rollout amid security concerns https://t.co/2S6Z6BtnBk via @CNET,,0.0,0.0,0.0,0.0,,,,,,,False,Eduardo Almeida,,geduardoalmeida,False,2010-10-23 00:11:19+00:00,Sydney,,,56.0,114.0,147.0,Australia to ban Huawei from 5G rollout amid security concerns via @CNET,2018-07-11,m6,2020-06-01,11,AUS,42,11_ban_looms_5g_slashes,"[(ban, 0.2552812140624132), (looms, 0.20077168365243575), (5g, 0.15392018205861477), (slashes, 0.13534348431998378), (warns, 0.12817540228977792), (research, 0.11360681258206272), (local, 0.11096158260487514), (exclusion, 0.10398418855390186), (executive, 0.08934061949426127), (economy, 0.08781869109666174), (offensive, 0.07657767385222394), (weather, 0.07521012508982133), (risks, 0.07162793535112166), (possible, 0.07086059423027143), (goes, 0.06991385579442431), (rollouts, 0.06496970937959921), (australia, 0.06442829747284846), (googleausedhuawei, 0.051286348156877784), (computerworld, 0.051286348156877784), (pushes, 0.051051782568149295)]",2020-06-01
2,4014517,2018-07-11 20:00:13+00:00,Tumblr,"{'urls': [{'start': 108, 'end': 131, 'url': 'https://t.co/aWVLI3rKhH', 'expanded_url': 'https://tmblr.co/Zu1l4a2ZjqZnM', 'display_url': 'tmblr.co/Zu1l4a2ZjqZnM'}], 'annotations': [{'start': 71, 'end': 74, 'probability': 0.41140000000000004, 'type': 'Product', 'normalized_text': 'Sony'}, {'start': 77, 'end': 82, 'probability': 0.5118, 'type': 'Product', 'normalized_text': 'Huawei'}]}",920266768303755264,,False,1017136499777818624,1017136499777818624,everyone,,,"Why Ignoring Mobile Phone Repairs Adelaide Will Cheap Cost - These are Sony, Huawei as well as LG. These... https://t.co/aWVLI3rKhH",,0.0,0.0,0.0,0.0,,,,,,,False,Mobile Repair Shop,,M_RepairShop,False,2017-10-17 12:34:30+00:00,"344A Magill Road, SA Australia",https://t.co/wVeaCCaJcm,,79.0,878.0,664.0,"Why Ignoring Mobile Phone Repairs Adelaide Will Cheap Cost - These are Sony, Huawei as well as LG. These...",2018-07-11,m6,2020-06-01,15,AUS,37,15_trusted_desperately_convince_trying,"[(trusted, 0.10890219512055276), (desperately, 0.08347595156055189), (convince, 0.08347595156055189), (trying, 0.07044727734773171), (australian, 0.05872970214819679), (politicians, 0.04803771194921652), (chinas, 0.04485837322699218), (bribery, 0.04423737773335662), (australia, 0.03901026017663523), (telco, 0.038872857878964065), (prominent, 0.03770724941074603), (adelaide, 0.0343679052523468), (huawei, 0.0296880621331647), (intensifies, 0.02815109235705388), (lobbies, 0.026907957531368258), (friction, 0.026907957531368258), (chinaaustralia, 0.026907957531368258), (fabricate, 0.026907957531368258), (repairing, 0.026907957531368258), (scoopit, 0.026907957531368258)]",2020-06-01
3,4014757,2018-07-11 16:43:52+00:00,Fan Page App,"{'urls': [{'start': 256, 'end': 279, 'url': 'https://t.co/glt6o2a98B', 'expanded_url': 'http://uk.businessinsider.com/r-australia-prepares-to-ban-huawei-from-5g-project-over-security-fears-2018-7', 'display_url': 'uk.businessinsider.com/r-australia-pr…'}], 'annotations': [{'start': 0, 'end': 8, 'probability': 0.8901, 'type': 'Place', 'normalized_text': 'Australia'}, {'start': 26, 'end': 31, 'probability': 0.8117000000000001, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 193, 'end': 196, 'probability': 0.9218000000000001, 'type': 'Place', 'normalized_text': 'U.S.'}, {'start': 225, 'end': 230, 'probability': 0.39540000000000003, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 235, 'end': 243, 'probability': 0.9992000000000001, 'type': 'Place', 'normalized_text': 'Australia'}]}",231651901,,False,1017087085344444416,1017087085344444416,everyone,,,"Australia prepares to ban Huawei from 5G project over security fears\nIts business serving small, rural telecom operators is now at risk after new attacks on the company in recent weeks by some U.S. lawmakers. The move to ban Huawei in Australia comes a... https://t.co/glt6o2a98B",,0.0,0.0,0.0,0.0,,,,,,,False,Adam Mulcahy,Founder e-Cruitment ♦ Fully Managed Offshore Outsourcing ♦ Outsourcing Partner ♦ Virtual Staff Expert ♦ Mr Outsource,adam_mulcahy,False,2010-12-29 03:17:04+00:00,Sydney,http://t.co/vu3JZP8Lyk,,614.0,1978.0,431.0,"Australia prepares to ban Huawei from 5G project over security fears Its business serving small, rural telecom operators is now at risk after new attacks on the company in recent weeks by some U.S. lawmakers. The move to ban Huawei in Australia comes a...",2018-07-11,m6,2020-06-01,-1,AUS,983,-1_huawei_china_chinese_5g,"[(huawei, 0.024465978490767162), (china, 0.016171303305510334), (chinese, 0.016117524135508345), (5g, 0.01481923681594938), (auspol, 0.012050279782231773), (chinas, 0.011577122043916645), (network, 0.011474267920155318), (huaweis, 0.011032983832263652), (government, 0.010895449880417905), (nyt, 0.01038360299036452), (security, 0.01009115153750317), (john, 0.009768137464887474), (phone, 0.00960987664979409), (lord, 0.009555468034517947), (new, 0.009165001524514306), (company, 0.008805082078123897), (market, 0.008781147438451575), (australia, 0.008607778014381437), (australian, 0.008365908200613762), (tech, 0.008043039333984243)]",2020-06-01
4,4014879,2018-07-11 15:40:55+00:00,IFTTT,"{'urls': [{'start': 67, 'end': 90, 'url': 'https://t.co/fjAS2RJEPd', 'expanded_url': 'http://bit.ly/2NHwCMY', 'display_url': 'bit.ly/2NHwCMY'}, {'start': 91, 'end': 114, 'url': 'https://t.co/4IwtUVCrZe', 'expanded_url': 'http://bit.ly/ONymEF', 'display_url': 'bit.ly/ONymEF'}], 'annotations': [{'start': 31, 'end': 36, 'probability': 0.5446, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 57, 'end': 65, 'probability': 0.9913000000000001, 'type': 'Place', 'normalized_text': 'Australia'}], 'hashtags': [{'start': 0, 'end': 8, 'tag': 'weather'}]}",1036843507,,False,1017071242007007232,1017071242007007232,everyone,,,#weather news: Hammer blow for Huawei as 5G ban looms in Australia https://t.co/fjAS2RJEPd https://t.co/4IwtUVCrZe,,0.0,0.0,0.0,0.0,,,,,,,False,Long Beach Weather,"Automated weather tweets from an amateur weather station in Long Beach, NSW Australia. Not for official use, never rely on internet weather, look outside!",LongBeachNSW,False,2012-12-26 10:37:57+00:00,"Long Beach, NSW, Australia",https://t.co/gMx7lHCA4A,,677.0,1472.0,105790.0,#weather news: Hammer blow for Huawei as 5G ban looms in Australia,2018-07-11,m6,2020-06-01,11,AUS,42,11_ban_looms_5g_slashes,"[(ban, 0.2552812140624132), (looms, 0.20077168365243575), (5g, 0.15392018205861477), (slashes, 0.13534348431998378), (warns, 0.12817540228977792), (research, 0.11360681258206272), (local, 0.11096158260487514), (exclusion, 0.10398418855390186), (executive, 0.08934061949426127), (economy, 0.08781869109666174), (offensive, 0.07657767385222394), (weather, 0.07521012508982133), (risks, 0.07162793535112166), (possible, 0.07086059423027143), (goes, 0.06991385579442431), (rollouts, 0.06496970937959921), (australia, 0.06442829747284846), (googleausedhuawei, 0.051286348156877784), (computerworld, 0.051286348156877784), (pushes, 0.051051782568149295)]",2020-06-01


In [618]:
print(len(aus))
print(len(uk))
print(len(usa))
print(len(can))

2598
38828
56358
6062


In [34]:
# To make things reproducible
umap_model = UMAP(random_state=42)

# To remove stop words after clustering
vectorizer_model = CountVectorizer(stop_words="english")

# Topics over time.
# aus_timestamps = aus.month_cat.to_list()
# aus_topics_over_time = aus_model.topics_over_time(aus_data, aus_topics, aus_timestamps, datetime_format="%Y-%m-%d")
# aus_model.visualize_topics_over_time(aus_topics_over_time, top_n_topics=20)

# # Aus
# aus_data = aus.cleaned_text.to_list()
# aus_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics=30, umap_model=umap_model, top_n_words=20, vectorizer_model=vectorizer_model)
# aus_topics, aus_probs = aus_model.fit_transform(aus_data)

# # Can
# can_data = can.cleaned_text.to_list()
# can_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics=30, umap_model=umap_model, top_n_words=20, vectorizer_model=vectorizer_model)
# can_topics, can_probs = can_model.fit_transform(can_data)

# # USA
# usa_data = usa.cleaned_text.to_list()
# usa_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics=30, umap_model=umap_model, top_n_words=20, vectorizer_model=vectorizer_model)
# usa_topics, usa_probs = usa_model.fit_transform(usa_data)

# # UK
# uk_data = uk.cleaned_text.to_list()
# uk_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics=30, umap_model=umap_model, top_n_words=20, vectorizer_model=vectorizer_model)
# uk_topics, uk_probs = uk_model.fit_transform(uk_data)

In [788]:
# Note that when using CountVectorizer, some documents/topics will be empty over time, causing the aus_model viz to throw an error. 
aus_timestamps = aus.month_cat.to_list()
aus_model.topics_over_time(aus_data, aus_timestamps)
aus_model.visualize_topics_over_time(aus_topics_over_time)

In [45]:
def concat_topics(name, country, model):
    # For each country, creates column with topic numbers, i.e., -1.
    country['topic'] = model.topics_
    
    # For each country, creates column with country, i.e., AU.
    country['country_cat'] = name
    
    # For each country, gets a list of keywords describing each topic. 
    get_topics = pd.DataFrame(model.get_topics().items())
    get_topics = get_topics.rename(columns = {0:'topic', 1:'keywords'})
    
    # For each country, gets topic definition by keywords.
    topic_info = model.get_topic_info()
    topic_info = topic_info.rename(columns = {'Topic':'topic'})
    
    # Merges into one DF. 
    country = pd.merge(country, topic_info, how='left', on='topic')
    country = pd.merge(country, get_topics, how='left', on='topic')
    return country

In [620]:
aus = concat_topics('AUS', aus, aus_model)
usa = concat_topics('USA', usa, usa_model)
can = concat_topics('CAN', can, can_model)
uk = concat_topics('UK', uk, uk_model)

In [48]:
aus = aus.drop(columns=['Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y', 'Unnamed: 0_x.1', 'Unnamed: 0_y.1', 'edit_history_tweet_ids_y', 'month'], errors='ignore')
can = can.drop(columns=['Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y', 'Unnamed: 0_x.1', 'Unnamed: 0_y.1', 'edit_history_tweet_ids_y', 'month'], errors='ignore')
uk = uk.drop(columns=['Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y', 'Unnamed: 0_x.1', 'Unnamed: 0_y.1', 'edit_history_tweet_ids_y', 'month'], errors='ignore')
usa = usa.drop(columns=['Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y', 'Unnamed: 0_x.1', 'Unnamed: 0_y.1', 'edit_history_tweet_ids_y', 'month'], errors='ignore')

In [622]:
combined = pd.concat([aus,can,uk,usa])
len(combined)

103846

In [624]:
combined.to_csv("tweets-with-country-and-topics.csv")

In [30]:
# CHANGE THESE. 3 months before and after ban. 
name = "aus"
model = "1"
sample = aus
# sample = combined_by_year.sample(n=10000)

# Creating lists. Can be regenerated from DF.
timestamps = sample.month_cat.to_list()
tweets = sample.cleaned_text.to_list()

# Seed for consistency.
umap_model = UMAP(random_state=30)

# Generating topic model and probabilities.
topic_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics="auto", umap_model=umap_model, top_n_words=20)
topics, probs = topic_model.fit_transform(tweets)

# Topics over time.
# topics_over_time = topic_model.topics_over_time(tweets, topics, timestamps)


topic_model.save(name + "-model-" + model)
topics_over_time.to_csv(name + "-time-" + model + ".csv")
topics = topics
probs = probs

# Saving topics.
with open(name + "-topics-" + model + ".txt", "wb") as fp: 
    pickle.dump(topics, fp)

# Saving probs.
with open(name + "-probs-" + model + ".txt", "wb") as fp:
    pickle.dump(probs, fp)

# Saving sample.
sample.to_csv(name + "-df-" + model + ".csv")

# Saving topic viz.
# topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20).write_html(name + "-viz-" + model + ".html")
# topic_model.get_topic_info().to_csv(name + "-info-" + model + ".csv")

# Saving heirarchy.
hierarchical_topics = topic_model.hierarchical_topics(tweets)
topic_model.visualize_hierarchical_documents(tweets, hierarchical_topics).write_html(name + "-hier-docs-" + model + ".html")
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).write_html(name + "-hier-" + model + ".html")

# Saving rep docs.
# rep_docs = topic_model.get_representative_docs(topic=None)
# rep_docs = pd.DataFrame.from_dict(rep_docs)
# rep_docs = rep_docs.transpose()
# rep_docs.to_csv(name + "-rep-" + model + ".csv")

# Top keywords for each topic. 
rep_keywords = pd.DataFrame(topic_model.get_topics())
rep_keywords.to_csv(name + "-key-" + model + ".csv")

# Visualising topics.
# topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

Batches:   0%|          | 0/82 [00:00<?, ?it/s]

2022-10-13 13:15:52,391 - BERTopic - Transformed documents to Embeddings
2022-10-13 13:16:05,011 - BERTopic - Reduced dimensionality
2022-10-13 13:16:05,179 - BERTopic - Clustered reduced embeddings
2022-10-13 13:16:14,404 - BERTopic - Reduced number of topics from 69 to 57
100%|██████████████████████████████████████████| 55/55 [00:00<00:00, 74.49it/s]


In [31]:
# CHANGE THESE. 3 months before and after ban. 
name = "can"
model = "1"
sample = can
# sample = combined_by_year.sample(n=10000)

# Creating lists. Can be regenerated from DF.
timestamps = sample.month_cat.to_list()
tweets = sample.cleaned_text.to_list()

# Seed for consistency. CHANGE THIS.
umap_model = UMAP(random_state=31)

# Generating topic model and probabilities.
topic_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics="auto", umap_model=umap_model, top_n_words=20)
topics, probs = topic_model.fit_transform(tweets)

# Topics over time.
# topics_over_time = topic_model.topics_over_time(tweets, topics, timestamps)

topic_model.save(name + "-model-" + model)
topics_over_time.to_csv(name + "-time-" + model + ".csv")
topics = topics
probs = probs

# Saving topics.
with open(name + "-topics-" + model + ".txt", "wb") as fp: 
    pickle.dump(topics, fp)

# Saving probs.
with open(name + "-probs-" + model + ".txt", "wb") as fp:
    pickle.dump(probs, fp)

# Saving sample.
sample.to_csv(name + "-df-" + model + ".csv")

# Saving topic viz.
# topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20).write_html(name + "-viz-" + model + ".html")
# topic_model.get_topic_info().to_csv(name + "-info-" + model + ".csv")

# Saving heirarchy.
hierarchical_topics = topic_model.hierarchical_topics(tweets)
topic_model.visualize_hierarchical_documents(tweets, hierarchical_topics).write_html(name + "-hier-docs-" + model + ".html")
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).write_html(name + "-hier-" + model + ".html")

# Saving rep docs.
# rep_docs = topic_model.get_representative_docs(topic=None)
# rep_docs = pd.DataFrame.from_dict(rep_docs)
# rep_docs = rep_docs.transpose()
# rep_docs.to_csv(name + "-rep-" + model + ".csv")

# Top keywords for each topic. 
rep_keywords = pd.DataFrame(topic_model.get_topics())
rep_keywords.to_csv(name + "-key-" + model + ".csv")

# Visualising topics.
# topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

Batches:   0%|          | 0/190 [00:00<?, ?it/s]

2022-10-13 13:18:40,911 - BERTopic - Transformed documents to Embeddings
2022-10-13 13:19:26,422 - BERTopic - Reduced dimensionality
2022-10-13 13:19:26,734 - BERTopic - Clustered reduced embeddings
2022-10-13 13:19:39,566 - BERTopic - Reduced number of topics from 96 to 57

Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.

100%|██████████████████████████████████████████| 55/55 [00:00<00:00, 61.76it/s]


In [32]:
# CHANGE THESE. 3 months before and after ban. 
name = "uk"
model = "1"
sample = uk
# sample = combined_by_year.sample(n=10000)

# Creating lists. Can be regenerated from DF.
timestamps = sample.month_cat.to_list()
tweets = sample.cleaned_text.to_list()

# Seed for consistency. CHANGE THIS.
umap_model = UMAP(random_state=32)

# Generating topic model and probabilities.
topic_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics="auto", umap_model=umap_model, top_n_words=20)
topics, probs = topic_model.fit_transform(tweets)

# Topics over time.
# topics_over_time = topic_model.topics_over_time(tweets, topics, timestamps)

topic_model.save(name + "-model-" + model)
topics_over_time.to_csv(name + "-time-" + model + ".csv")
topics = topics
probs = probs

# Saving topics.
with open(name + "-topics-" + model + ".txt", "wb") as fp: 
    pickle.dump(topics, fp)

# Saving probs.
with open(name + "-probs-" + model + ".txt", "wb") as fp:
    pickle.dump(probs, fp)

# Saving sample.
sample.to_csv(name + "-df-" + model + ".csv")

# Saving topic viz.
# topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20).write_html(name + "-viz-" + model + ".html")
# topic_model.get_topic_info().to_csv(name + "-info-" + model + ".csv")

# Saving heirarchy.
hierarchical_topics = topic_model.hierarchical_topics(tweets)
topic_model.visualize_hierarchical_documents(tweets, hierarchical_topics).write_html(name + "-hier-docs-" + model + ".html")
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).write_html(name + "-hier-" + model + ".html")

# Saving rep docs.
# rep_docs = topic_model.get_representative_docs(topic=None)
# rep_docs = pd.DataFrame.from_dict(rep_docs)
# rep_docs = rep_docs.transpose()
# rep_docs.to_csv(name + "-rep-" + model + ".csv")

# Top keywords for each topic. 
rep_keywords = pd.DataFrame(topic_model.get_topics())
rep_keywords.to_csv(name + "-key-" + model + ".csv")

# Visualising topics.
# topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

Batches:   0%|          | 0/1214 [00:00<?, ?it/s]

2022-10-13 13:31:02,693 - BERTopic - Transformed documents to Embeddings
2022-10-13 13:31:52,085 - BERTopic - Reduced dimensionality
2022-10-13 13:31:55,646 - BERTopic - Clustered reduced embeddings
2022-10-13 13:33:07,773 - BERTopic - Reduced number of topics from 555 to 80

Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.

100%|██████████████████████████████████████████| 78/78 [00:02<00:00, 26.69it/s]


In [33]:
# CHANGE THESE. 3 months before and after ban. 
name = "usa"
model = "1"
sample = usa
# sample = combined_by_year.sample(n=10000)

# Creating lists. Can be regenerated from DF.
timestamps = sample.month_cat.to_list()
tweets = sample.cleaned_text.to_list()

# Seed for consistency. CHANGE THIS.
umap_model = UMAP(random_state=32)

# Generating topic model and probabilities.
topic_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics="auto", umap_model=umap_model, top_n_words=20)
topics, probs = topic_model.fit_transform(tweets)

# Topics over time.
# topics_over_time = topic_model.topics_over_time(tweets, topics, timestamps)

topic_model.save(name + "-model-" + model)
topics_over_time.to_csv(name + "-time-" + model + ".csv")
topics = topics
probs = probs

# Saving topics.
with open(name + "-topics-" + model + ".txt", "wb") as fp: 
    pickle.dump(topics, fp)

# Saving probs.
with open(name + "-probs-" + model + ".txt", "wb") as fp:
    pickle.dump(probs, fp)

# Saving sample.
sample.to_csv(name + "-df-" + model + ".csv")

# Saving topic viz.
# topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20).write_html(name + "-viz-" + model + ".html")
# topic_model.get_topic_info().to_csv(name + "-info-" + model + ".csv")

# Saving heirarchy.
hierarchical_topics = topic_model.hierarchical_topics(tweets)
topic_model.visualize_hierarchical_documents(tweets, hierarchical_topics).write_html(name + "-hier-docs-" + model + ".html")
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).write_html(name + "-hier-" + model + ".html")

# Saving rep docs.
# rep_docs = topic_model.get_representative_docs(topic=None)
# rep_docs = pd.DataFrame.from_dict(rep_docs)
# rep_docs = rep_docs.transpose()
# rep_docs.to_csv(name + "-rep-" + model + ".csv")

# Top keywords for each topic. 
rep_keywords = pd.DataFrame(topic_model.get_topics())
rep_keywords.to_csv(name + "-key-" + model + ".csv")

# Visualising topics.
# topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

Batches:   0%|          | 0/1762 [00:00<?, ?it/s]

2022-10-13 13:53:02,930 - BERTopic - Transformed documents to Embeddings
2022-10-13 13:54:24,556 - BERTopic - Reduced dimensionality
2022-10-13 13:54:30,575 - BERTopic - Clustered reduced embeddings
2022-10-13 13:57:19,435 - BERTopic - Reduced number of topics from 1363 to 1067

Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.

100%|██████████████████████████████████████| 1065/1065 [00:42<00:00, 24.86it/s]


In [404]:
# To make things reproducible
umap_model = UMAP(random_state=42)

# To remove stop words after clustering
vectorizer_model = CountVectorizer(stop_words="english")

# Topics over time.
# aus_timestamps = aus.month_cat.to_list()
# aus_topics_over_time = aus_model.topics_over_time(aus_data, aus_topics, aus_timestamps, datetime_format="%Y-%m-%d")
# aus_model.visualize_topics_over_time(aus_topics_over_time, top_n_topics=20)

# USA
usa_data = usa.cleaned_text.to_list()
usa_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics=30, umap_model=umap_model, top_n_words=20, vectorizer_model=vectorizer_model)
usa_topics, usa_probs = usa_model.fit_transform(usa_data)

# UK
uk_data = uk.cleaned_text.to_list()
uk_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics=30, umap_model=umap_model, top_n_words=20, vectorizer_model=vectorizer_model)
uk_topics, uk_probs = uk_model.fit_transform(uk_data)

# Aus
aus_data = aus.cleaned_text.to_list()
aus_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics=30, umap_model=umap_model, top_n_words=20, vectorizer_model=vectorizer_model)
aus_topics, aus_probs = aus_model.fit_transform(aus_data)

# Can
can_data = can.cleaned_text.to_list()
can_model = BERTopic(verbose=True, n_gram_range=(1, 3), min_topic_size=10, nr_topics=30, umap_model=umap_model, top_n_words=20, vectorizer_model=vectorizer_model)
can_topics, can_probs = can_model.fit_transform(can_data)

Batches:   0%|          | 0/1762 [00:00<?, ?it/s]

2022-10-21 16:38:26,764 - BERTopic - Transformed documents to Embeddings
2022-10-21 16:39:47,785 - BERTopic - Reduced dimensionality
2022-10-21 16:39:54,129 - BERTopic - Clustered reduced embeddings
2022-10-21 16:41:24,005 - BERTopic - Reduced number of topics from 1349 to 31


Batches:   0%|          | 0/1214 [00:00<?, ?it/s]

2022-10-21 16:50:30,551 - BERTopic - Transformed documents to Embeddings
2022-10-21 16:51:24,863 - BERTopic - Reduced dimensionality
2022-10-21 16:51:28,692 - BERTopic - Clustered reduced embeddings
2022-10-21 16:52:13,035 - BERTopic - Reduced number of topics from 566 to 31


Batches:   0%|          | 0/82 [00:00<?, ?it/s]

2022-10-21 16:52:44,350 - BERTopic - Transformed documents to Embeddings
2022-10-21 16:52:57,318 - BERTopic - Reduced dimensionality
2022-10-21 16:52:57,459 - BERTopic - Clustered reduced embeddings
2022-10-21 16:53:02,334 - BERTopic - Reduced number of topics from 66 to 31


Batches:   0%|          | 0/190 [00:00<?, ?it/s]

2022-10-21 16:54:36,631 - BERTopic - Transformed documents to Embeddings
2022-10-21 16:54:53,615 - BERTopic - Reduced dimensionality
2022-10-21 16:54:53,928 - BERTopic - Clustered reduced embeddings
2022-10-21 16:55:00,440 - BERTopic - Reduced number of topics from 90 to 31


In [405]:
# aus_hier = aus_model.hierarchical_topics(aus_data)
# aus_model.visualize_hierarchical_documents(aus_data, aus_hier)

# can_hier = can_model.hierarchical_topics(can_data)
# can_model.visualize_hierarchical_documents(can_data, can_hier)

# uk_hier = uk_model.hierarchical_topics(uk_data)
# uk_model.visualize_hierarchical_documents(uk_data, uk_hier)

# usa_hier = usa_model.hierarchical_topics(usa_data)
# usa_model.visualize_hierarchical_documents(usa_data, usa_hier)

In [406]:
# Calculate similarity matrix for each country.
def calc_matrix(orig_model, orig_name, comp_model, comp_name):
    
    # Create initial DF.
    df = pd.DataFrame(columns = ["Original Country", "Original Topic", "Original Keywords", "Comparator Country", "Comparator Topic", "Comparator Keywords", "Similarity Score"])
    
    # Calculate similarity.
    sim_matrix = cosine_similarity(orig_model.topic_embeddings_, comp_model.topic_embeddings_)
    
    # Drops first row of 2D array, -1.
    sim_matrix = np.delete(sim_matrix, 0, 0)

    # Drops first col of 2D array, -1.
    sim_matrix = np.delete(sim_matrix, 0, 1)
    
    for i in range(0,30):
        # Original country. 
        orig = orig_name

        # Topic to search.
        orig_topic = i

        # Function to identify most similar keywords.
        most_similar_topic = np.argmax(sim_matrix[orig_topic])

        # Output original topics keywords. EDIT.
        orig_keywords = orig_model.get_topic(orig_topic)
        
        # Comparator country. 
        comp = comp_name

        # Output most similar topic from comparison.
        comp_topic = most_similar_topic

        # Output comparator's topic's keywords. EDIT.
        comp_keywords = comp_model.get_topic(most_similar_topic)

        # Output similarity.
        similarity = sim_matrix[orig_topic].max()

        # Output. 
        df.loc[i] = orig, orig_topic, orig_keywords, comp, comp_topic, comp_keywords, similarity
        
    return df

In [408]:
# Calculate for Aus.
ausuk = calc_matrix(aus_model, "AUS", uk_model, "UK")
aususa = calc_matrix(aus_model, "AUS", usa_model, "USA")
auscan = calc_matrix(aus_model, "AUS", can_model, "CAN")

aus_combined = pd.concat([auscan, ausuk, aususa])
print(aus_combined.groupby('Original Topic')['Similarity Score'].max())
aus_groups = aus_combined.sort_values('Similarity Score').drop_duplicates(['Original Topic'], keep='last').sort_values('Original Topic')
aus_groups['Comparator Country'].value_counts()

Original Topic
0    0.8665
1    0.8655
2    0.9071
3    0.9304
4    0.8248
5    0.8784
6    0.8763
7    0.7469
8    0.8508
9    0.8177
10   0.8338
11   0.8776
12   0.8416
13   0.8305
14   0.8262
15   0.8371
16   0.8959
17   0.8843
18   0.8252
19   0.8268
20   0.8426
21   0.8543
22   0.8685
23   0.8413
24   0.8378
25   0.9152
26   0.8630
27   0.8500
28   0.8419
29   0.8405
Name: Similarity Score, dtype: float64


UK     11
CAN    10
USA     9
Name: Comparator Country, dtype: int64

In [409]:
usauk = calc_matrix(usa_model, "USA", uk_model, "UK")
usacan = calc_matrix(usa_model, "USA", can_model, "CAN")
usaaus = calc_matrix(usa_model, "USA", aus_model, "AUS")

usa_combined = pd.concat([usaaus, usacan, usauk])
print(usa_combined.groupby('Original Topic')['Similarity Score'].max())
usa_groups = usa_combined.sort_values('Similarity Score').drop_duplicates(['Original Topic'], keep='last').sort_values('Original Topic')
usa_groups['Comparator Country'].value_counts()

Original Topic
0    0.8959
1    0.9440
2    0.8598
3    0.9244
4    0.9428
5    0.8506
6    0.8505
7    0.9110
8    0.8237
9    0.8697
10   0.8286
11   0.9180
12   0.9070
13   0.8546
14   0.8701
15   0.8214
16   0.8355
17   0.8166
18   0.8679
19   0.9014
20   0.8198
21   0.8646
22   0.8429
23   0.8839
24   0.8534
25   0.9233
26   0.8440
27   0.8361
28   0.8949
29   0.9018
Name: Similarity Score, dtype: float64


CAN    12
AUS    11
UK      7
Name: Comparator Country, dtype: int64

In [410]:
ukaus = calc_matrix(uk_model, "UK", aus_model, "AUS")
ukcan = calc_matrix(uk_model, "UK", can_model, "CAN")
ukusa = calc_matrix(uk_model, "UK", usa_model, "USA")

uk_combined = pd.concat([ukaus, ukcan, ukusa])
print(uk_combined.groupby('Original Topic')['Similarity Score'].max())
uk_groups = uk_combined.sort_values('Similarity Score').drop_duplicates(['Original Topic'], keep='last').sort_values('Original Topic')
uk_groups['Comparator Country'].value_counts()

Original Topic
0    0.9304
1    0.8299
2    0.8678
3    0.8354
4    0.9029
5    0.7670
6    0.9004
7    0.8063
8    0.9295
9    0.8419
10   0.8995
11   0.8590
12   0.7192
13   0.8081
14   0.8212
15   0.8204
16   0.7524
17   0.8701
18   0.8425
19   0.8415
20   0.8610
21   0.8659
22   0.8506
23   0.8684
24   0.9440
25   0.9152
26   0.8176
27   0.9233
28   0.8610
29   0.7840
Name: Similarity Score, dtype: float64


CAN    15
AUS    10
USA     5
Name: Comparator Country, dtype: int64

In [411]:
canaus = calc_matrix(can_model, "CAN", aus_model, "AUS")
canuk = calc_matrix(can_model, "CAN", uk_model, "UK")
canusa = calc_matrix(can_model, "CAN", usa_model, "USA")

can_combined = pd.concat([canaus, canuk, canusa])
print(can_combined.groupby('Original Topic')['Similarity Score'].max())
can_groups = can_combined.sort_values('Similarity Score').drop_duplicates(['Original Topic'], keep='last').sort_values('Original Topic')
can_groups['Comparator Country'].value_counts()

Original Topic
0    0.9180
1    0.9070
2    0.8432
3    0.9156
4    0.8149
5    0.8251
6    0.8920
7    0.8784
8    0.8595
9    0.8519
10   0.8052
11   0.8330
12   0.9428
13   0.7849
14   0.8188
15   0.8692
16   0.8388
17   0.8392
18   0.9029
19   0.8598
20   0.8697
21   0.8679
22   0.8257
23   0.8330
24   0.9004
25   0.8843
26   0.8646
27   0.8704
28   0.8764
29   0.8017
Name: Similarity Score, dtype: float64


USA    17
UK      7
AUS     6
Name: Comparator Country, dtype: int64

In [403]:
aus_groups = aus_groups.sort_values('Similarity Score', ascending = False)
can_groups = can_groups.sort_values('Similarity Score', ascending = False)
usa_groups = usa_groups.sort_values('Similarity Score', ascending = False)
uk_groups = uk_groups.sort_values('Similarity Score', ascending = False)

pd.concat([aus_groups, can_groups, usa_groups, uk_groups]).to_csv("Top 30 Topic Matches per Country.csv")

In [365]:
usa_combined = pd.concat([usaaus, usacan, usauk])
print(usa_combined.groupby('Original Topic')['Similarity Score'].max())
usa_groups = usa_combined.sort_values('Similarity Score').drop_duplicates(['Original Topic'], keep='last').sort_values('Original Topic')
usa_groups['Comparator Country'].value_counts()

Original Topic
0    0.8459
1    0.9010
2    0.7896
3    0.9643
4    0.8071
5    0.6596
6    0.7699
7    0.6995
8    0.8417
9    0.8238
10   0.7369
11   0.7147
12   0.7034
13   0.7871
14   0.7942
15   0.7890
16   0.7453
17   0.6749
18   0.7813
19   0.7818
20   0.7573
21   0.6189
22   0.7036
23   0.7357
24   0.8851
25   0.7191
26   0.9341
27   0.6680
28   0.7689
29   0.8964
Name: Similarity Score, dtype: float64


UK     17
AUS     7
CAN     6
Name: Comparator Country, dtype: int64

In [366]:
uk_combined = pd.concat([ukaus, ukcan, ukusa])
print(uk_combined.groupby('Original Topic')['Similarity Score'].max())
uk_groups = uk_combined.sort_values('Similarity Score').drop_duplicates(['Original Topic'], keep='last').sort_values('Original Topic')
uk_groups['Comparator Country'].value_counts()

Original Topic
0    0.7764
1    0.9402
2    0.9341
3    0.9786
4    0.9832
5    0.9010
6    0.9643
7    0.8645
8    0.8047
9    0.8631
10   0.7247
11   0.7839
12   0.8259
13   0.8507
14   0.9151
15   0.7754
16   0.7031
17   0.9749
18   0.7996
19   0.8417
20   0.6827
21   0.7332
22   0.8047
23   0.9258
24   0.7671
25   0.8964
26   0.9016
27   0.7290
28   0.8385
29   0.7418
Name: Similarity Score, dtype: float64


CAN    12
USA    10
AUS     8
Name: Comparator Country, dtype: int64

In [367]:
can_combined = pd.concat([canaus, canuk, canusa])
print(can_combined.groupby('Original Topic')['Similarity Score'].max())
can_groups = can_combined.sort_values('Similarity Score').drop_duplicates(['Original Topic'], keep='last').sort_values('Original Topic')
can_groups['Comparator Country'].value_counts()

Original Topic
0    0.9786
1    0.9432
2    0.9362
3    0.8822
4    0.9832
5    0.8372
6    0.7094
7    0.8119
8    0.7157
9    0.9104
10   0.7638
11   0.6517
12   0.8545
13   0.8934
14   0.9344
15   0.8694
16   0.8089
17   0.7904
18   0.8794
19   0.7556
20   0.9749
21   0.6962
22   0.7332
23   0.6885
24   0.8634
25   0.7194
26   0.8715
27   0.6929
28   0.8509
29   0.5946
Name: Similarity Score, dtype: float64


UK     18
AUS    10
USA     2
Name: Comparator Country, dtype: int64

Original Topic
0    0.8067
1    0.9554
2    0.9437
3    0.9342
4    0.6542
5    0.7513
6    0.7741
7    0.8725
8    0.9126
9    0.7217
10   0.8215
11   0.7102
12   0.8033
13   0.6122
14   0.6950
15   0.6682
16   0.6547
17   0.7493
18   0.5908
19   0.7335
20   0.7015
21   0.6797
22   0.8273
23   0.6164
24   0.7300
25   0.6769
26   0.6838
27   0.6896
28   0.7203
29   0.7397
Name: Similarity Score, dtype: float64


CAN    15
UK     10
USA     5
Name: Comparator Country, dtype: int64

In [39]:
# Look at top values, add to location.
# can['profile_loc'].value_counts()

In [98]:
# # Run BERTopic. Adjust nr_topics to see how coherence changes with different levels of pre-defined topics. 
# #20 topics performs optimially. You can see a dataframe with the same 20 topics and topic definitions as the article.

# topic_model = BERTopic(verbose=True, n_gram_range=(1, 3), nr_topics=20, min_topic_size=10)
# topics, probs = topic_model.fit_transform(docs)

# # Preprocess documents.
# documents = pd.DataFrame({"Document": docs,
#                           "ID": range(len(docs)),
#                           "Topic": topics})
# documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
# cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# # Extract vectorizer and analyzer from BERTopic.
# vectorizer = topic_model.vectorizer_model
# analyzer = vectorizer.build_analyzer()

# # Extract features for Topic Coherence evaluation.
# words = vectorizer.get_feature_names()
# tokens = [analyzer(doc) for doc in cleaned_docs]
# dictionary = corpora.Dictionary(tokens)
# corpus = [dictionary.doc2bow(token) for token in tokens]
# topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
#                for topic in range(len(set(topics))-1)]

# # Find coherence.
# coherence_model = CoherenceModel(topics=topic_words, 
#                                  texts=tokens, 
#                                  corpus=corpus,
#                                  dictionary=dictionary, 
#                                  coherence='c_v')

# # Extract coherence.
# coherence = coherence_model.get_coherence()

# print(len(topic_words))
# print(coherence)
# topic_model.get_topic_info()

Batches:   0%|          | 0/488 [00:00<?, ?it/s]

2021-11-03 10:47:15,302 - BERTopic - Transformed documents to Embeddings
2021-11-03 10:47:29,693 - BERTopic - Reduced dimensionality with UMAP
2021-11-03 10:47:31,199 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-11-03 10:48:26,897 - BERTopic - Reduced number of topics from 384 to 21


20
0.3438359176627711


Unnamed: 0,Topic,Count,Name
0,-1,4416,-1_you_this_maori_david
1,0,1084,0_health_need_race_for
2,1,920,1_racist_racism_you_racial
3,2,663,2_seymours wiki_david seymours wiki_whoever updated david_updated david seymours
4,3,616,3_had no issue_no issue with_no issue_seymour
5,4,611,4_david_david seymour_dick_fuck
6,5,588,5_hate_fiery woke hate_fiery_fiery woke
7,6,581,6_cussing him_him out hes_me cussing him_cussing him out
8,7,566,7_seymour_david seymour_seymours_shared david
9,8,564,8_maori_literally stealing health_care from maori_literally stealing


In [105]:
# This extracts 3 representative tweets from each topic for manual analysis. This helped start the process of thematic analysis.
representative_docs = topic_model.get_representative_docs(topic=None)
rep = pd.DataFrame.from_dict(representative_docs)
rep.head()
rep.to_csv("Final 20 Representative Topics.csv")

In [None]:
# For in-depth thematic analysis as required for the article. topic_docs contains all tweets related to each topic. 
topic_docs = {topic: [] for topic in set(topics)}
for topic, doc in zip(topics, docs):
    topic_docs[topic].append(doc)

In [16]:
df = pd.read_csv(r"D:\Work\huawei\tweets-with-country-and-topics.csv")

In [17]:
df.head(50)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,created_at,source,entities,author_id,referenced_tweets,possibly_sensitive,conversation_id,tweet_id,reply_settings,in_reply_to_user_id,attachments,text,geo_id,retweets,replies,likes,quotes,tweet_loc_short,place_type,tweet_loc_long,country,full_text,tweet_loc,verified,name,profile_desc,username,protected,profile_created,profile_loc,url,withheld,followers,following,total_tweets,cleaned_text,date,m_cat,month_cat,topic,country_cat,Count,Name,keywords
0,0,4014315,2018-07-11 23:32:21+00:00,Twitter Web Client,"{'urls': [{'start': 67, 'end': 90, 'url': 'https://t.co/k3a4hBcgg3', 'expanded_url': 'https://www.canberratimes.com.au/politics/western-australia/former-wa-minister-free-huawei-phones-kept-coming-up-in-chinese-20180711-p4zqxq.html', 'display_url': 'canberratimes.com.au/politics/weste…'}], 'annotations': [{'start': 25, 'end': 37, 'probability': 0.437, 'type': 'Product', 'normalized_text': 'Huawei phones'}], 'mentions': [{'start': 95, 'end': 109, 'username': 'canberratimes', 'id': '17125730'}]}",55113432,,False,1017189884711264256,1017189884711264256,everyone,,,Former WA minister: Free Huawei phones 'kept coming up in Chinese' https://t.co/k3a4hBcgg3 via @canberratimes,,0.0,0.0,0.0,0.0,,,,,,,False,lynlinking,"Sharing Political News & Current Affairs\nArticles by Independent Authors, Blogs, \nIndependent Newspapers & others \nSharing articles by #WgarNews \n I follow Back",lynlinking,False,2009-07-09 02:27:53+00:00,Australia,,,32183.0,35291.0,378999.0,Former WA minister: Free Huawei phones 'kep up in Chinese' via @canberratimes,2018-07-11,m6,2020-06-01,4,AUS,98,4_data_facebook_user_shared,"[('data', 0.14619563497907975), ('facebook', 0.12118788462092657), ('user', 0.09347864633439568), ('shared', 0.06658371455174689), ('access', 0.06369123086882364), ('zuckerberg', 0.05514332288010932), ('mark', 0.05514332288010932), ('gave', 0.04887174525279423), ('chinese', 0.04850362111108616), ('device', 0.04368261948577905), ('facebooks', 0.04082674011860171), ('australian', 0.04050222416759447), ('users', 0.03938354762877259), ('threat', 0.0385248089790342), ('prescientinfo', 0.03542518238106668), ('committee', 0.033395840276195306), ('makers', 0.03299446060346893), ('says', 0.031732796796069565), ('privacy', 0.02834014590485334), ('social', 0.02518607605403095)]"
1,1,4014316,2018-07-11 23:30:39+00:00,Twitter Web Client,"{'urls': [{'start': 63, 'end': 86, 'url': 'https://t.co/2S6Z6BtnBk', 'expanded_url': 'https://cnet.co/2m7SaGa', 'display_url': 'cnet.co/2m7SaGa'}], 'annotations': [{'start': 0, 'end': 8, 'probability': 0.9487000000000001, 'type': 'Place', 'normalized_text': 'Australia'}, {'start': 17, 'end': 22, 'probability': 0.8402000000000001, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 29, 'end': 30, 'probability': 0.7378, 'type': 'Organization', 'normalized_text': '5G'}], 'mentions': [{'start': 91, 'end': 96, 'username': 'CNET', 'id': '30261067'}]}",206445861,,False,1017189455319330816,1017189455319330816,everyone,,,Australia to ban Huawei from 5G rollout amid security concerns https://t.co/2S6Z6BtnBk via @CNET,,0.0,0.0,0.0,0.0,,,,,,,False,Eduardo Almeida,,geduardoalmeida,False,2010-10-23 00:11:19+00:00,Sydney,,,56.0,114.0,147.0,Australia to ban Huawei from 5G rollout amid security concerns via @CNET,2018-07-11,m6,2020-06-01,11,AUS,42,11_ban_looms_5g_slashes,"[('ban', 0.2552812140624132), ('looms', 0.20077168365243575), ('5g', 0.15392018205861477), ('slashes', 0.13534348431998378), ('warns', 0.12817540228977792), ('research', 0.11360681258206272), ('local', 0.11096158260487514), ('exclusion', 0.10398418855390186), ('executive', 0.08934061949426127), ('economy', 0.08781869109666174), ('offensive', 0.07657767385222394), ('weather', 0.07521012508982133), ('risks', 0.07162793535112166), ('possible', 0.07086059423027143), ('goes', 0.06991385579442431), ('rollouts', 0.06496970937959921), ('australia', 0.06442829747284846), ('googleausedhuawei', 0.051286348156877784), ('computerworld', 0.051286348156877784), ('pushes', 0.051051782568149295)]"
2,2,4014517,2018-07-11 20:00:13+00:00,Tumblr,"{'urls': [{'start': 108, 'end': 131, 'url': 'https://t.co/aWVLI3rKhH', 'expanded_url': 'https://tmblr.co/Zu1l4a2ZjqZnM', 'display_url': 'tmblr.co/Zu1l4a2ZjqZnM'}], 'annotations': [{'start': 71, 'end': 74, 'probability': 0.41140000000000004, 'type': 'Product', 'normalized_text': 'Sony'}, {'start': 77, 'end': 82, 'probability': 0.5118, 'type': 'Product', 'normalized_text': 'Huawei'}]}",920266768303755264,,False,1017136499777818624,1017136499777818624,everyone,,,"Why Ignoring Mobile Phone Repairs Adelaide Will Cheap Cost - These are Sony, Huawei as well as LG. These... https://t.co/aWVLI3rKhH",,0.0,0.0,0.0,0.0,,,,,,,False,Mobile Repair Shop,,M_RepairShop,False,2017-10-17 12:34:30+00:00,"344A Magill Road, SA Australia",https://t.co/wVeaCCaJcm,,79.0,878.0,664.0,"Why Ignoring Mobile Phone Repairs Adelaide Will Cheap Cost - These are Sony, Huawei as well as LG. These...",2018-07-11,m6,2020-06-01,15,AUS,37,15_trusted_desperately_convince_trying,"[('trusted', 0.10890219512055276), ('desperately', 0.08347595156055189), ('convince', 0.08347595156055189), ('trying', 0.07044727734773171), ('australian', 0.05872970214819679), ('politicians', 0.04803771194921652), ('chinas', 0.04485837322699218), ('bribery', 0.04423737773335662), ('australia', 0.03901026017663523), ('telco', 0.038872857878964065), ('prominent', 0.03770724941074603), ('adelaide', 0.0343679052523468), ('huawei', 0.0296880621331647), ('intensifies', 0.02815109235705388), ('lobbies', 0.026907957531368258), ('friction', 0.026907957531368258), ('chinaaustralia', 0.026907957531368258), ('fabricate', 0.026907957531368258), ('repairing', 0.026907957531368258), ('scoopit', 0.026907957531368258)]"
3,3,4014757,2018-07-11 16:43:52+00:00,Fan Page App,"{'urls': [{'start': 256, 'end': 279, 'url': 'https://t.co/glt6o2a98B', 'expanded_url': 'http://uk.businessinsider.com/r-australia-prepares-to-ban-huawei-from-5g-project-over-security-fears-2018-7', 'display_url': 'uk.businessinsider.com/r-australia-pr…'}], 'annotations': [{'start': 0, 'end': 8, 'probability': 0.8901, 'type': 'Place', 'normalized_text': 'Australia'}, {'start': 26, 'end': 31, 'probability': 0.8117000000000001, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 193, 'end': 196, 'probability': 0.9218000000000001, 'type': 'Place', 'normalized_text': 'U.S.'}, {'start': 225, 'end': 230, 'probability': 0.39540000000000003, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 235, 'end': 243, 'probability': 0.9992000000000001, 'type': 'Place', 'normalized_text': 'Australia'}]}",231651901,,False,1017087085344444416,1017087085344444416,everyone,,,"Australia prepares to ban Huawei from 5G project over security fears\nIts business serving small, rural telecom operators is now at risk after new attacks on the company in recent weeks by some U.S. lawmakers. The move to ban Huawei in Australia comes a... https://t.co/glt6o2a98B",,0.0,0.0,0.0,0.0,,,,,,,False,Adam Mulcahy,Founder e-Cruitment ♦ Fully Managed Offshore Outsourcing ♦ Outsourcing Partner ♦ Virtual Staff Expert ♦ Mr Outsource,adam_mulcahy,False,2010-12-29 03:17:04+00:00,Sydney,http://t.co/vu3JZP8Lyk,,614.0,1978.0,431.0,"Australia prepares to ban Huawei from 5G project over security fears Its business serving small, rural telecom operators is now at risk after new attacks on the company in recent weeks by some U.S. lawmakers. The move to ban Huawei in Australia comes a...",2018-07-11,m6,2020-06-01,-1,AUS,983,-1_huawei_china_chinese_5g,"[('huawei', 0.024465978490767162), ('china', 0.016171303305510334), ('chinese', 0.016117524135508345), ('5g', 0.01481923681594938), ('auspol', 0.012050279782231773), ('chinas', 0.011577122043916645), ('network', 0.011474267920155318), ('huaweis', 0.011032983832263652), ('government', 0.010895449880417905), ('nyt', 0.01038360299036452), ('security', 0.01009115153750317), ('john', 0.009768137464887474), ('phone', 0.00960987664979409), ('lord', 0.009555468034517947), ('new', 0.009165001524514306), ('company', 0.008805082078123897), ('market', 0.008781147438451575), ('australia', 0.008607778014381437), ('australian', 0.008365908200613762), ('tech', 0.008043039333984243)]"
4,4,4014879,2018-07-11 15:40:55+00:00,IFTTT,"{'urls': [{'start': 67, 'end': 90, 'url': 'https://t.co/fjAS2RJEPd', 'expanded_url': 'http://bit.ly/2NHwCMY', 'display_url': 'bit.ly/2NHwCMY'}, {'start': 91, 'end': 114, 'url': 'https://t.co/4IwtUVCrZe', 'expanded_url': 'http://bit.ly/ONymEF', 'display_url': 'bit.ly/ONymEF'}], 'annotations': [{'start': 31, 'end': 36, 'probability': 0.5446, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 57, 'end': 65, 'probability': 0.9913000000000001, 'type': 'Place', 'normalized_text': 'Australia'}], 'hashtags': [{'start': 0, 'end': 8, 'tag': 'weather'}]}",1036843507,,False,1017071242007007232,1017071242007007232,everyone,,,#weather news: Hammer blow for Huawei as 5G ban looms in Australia https://t.co/fjAS2RJEPd https://t.co/4IwtUVCrZe,,0.0,0.0,0.0,0.0,,,,,,,False,Long Beach Weather,"Automated weather tweets from an amateur weather station in Long Beach, NSW Australia. Not for official use, never rely on internet weather, look outside!",LongBeachNSW,False,2012-12-26 10:37:57+00:00,"Long Beach, NSW, Australia",https://t.co/gMx7lHCA4A,,677.0,1472.0,105790.0,#weather news: Hammer blow for Huawei as 5G ban looms in Australia,2018-07-11,m6,2020-06-01,11,AUS,42,11_ban_looms_5g_slashes,"[('ban', 0.2552812140624132), ('looms', 0.20077168365243575), ('5g', 0.15392018205861477), ('slashes', 0.13534348431998378), ('warns', 0.12817540228977792), ('research', 0.11360681258206272), ('local', 0.11096158260487514), ('exclusion', 0.10398418855390186), ('executive', 0.08934061949426127), ('economy', 0.08781869109666174), ('offensive', 0.07657767385222394), ('weather', 0.07521012508982133), ('risks', 0.07162793535112166), ('possible', 0.07086059423027143), ('goes', 0.06991385579442431), ('rollouts', 0.06496970937959921), ('australia', 0.06442829747284846), ('googleausedhuawei', 0.051286348156877784), ('computerworld', 0.051286348156877784), ('pushes', 0.051051782568149295)]"
5,5,4015026,2018-07-11 14:24:03+00:00,dlvr.it,"{'urls': [{'start': 87, 'end': 110, 'url': 'https://t.co/2W3pPnHbb3', 'expanded_url': 'http://dlvr.it/QbCDHK', 'display_url': 'dlvr.it/QbCDHK'}], 'annotations': [{'start': 0, 'end': 5, 'probability': 0.6413, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 26, 'end': 34, 'probability': 0.9312, 'type': 'Place', 'normalized_text': 'Australia'}]}",349313088,,False,1017051898468159488,1017051898468159488,everyone,,,Huawei is a test case for Australia in balancing the risks and rewards of Chinese tech https://t.co/2W3pPnHbb3,,0.0,0.0,0.0,0.0,,,,,,,False,T Lawrence,social media geek.,geek_au,False,2011-08-05 22:25:05+00:00,australia,,,872.0,753.0,163258.0,Huawei is a test case for Australia in balancing the risks and rewards of Chinese tech,2018-07-11,m6,2020-06-01,19,AUS,32,19_rewards_balancing_test_risks,"[('rewards', 0.21412362315203853), ('balancing', 0.21412362315203853), ('test', 0.1940585917901192), ('risks', 0.19047872004730415), ('case', 0.17821730947680656), ('tech', 0.11572668599110354), ('australia', 0.0917134767350178), ('chinese', 0.07809073624996878), ('conversationedu', 0.040728806015485314), ('vw', 0.031823107997619456), ('infrastructure', 0.0303076059558053), ('huawei', 0.03006635876976458), ('sell', 0.025771976206632897), ('national', 0.024668104068234862), ('china', 0.023371836878153356), ('make', 0.019993555381941273), ('chinesetech', 0.01894888235260024), ('challenges', 0.01894888235260024), ('cfo', 0.01894888235260024), ('proactive', 0.01894888235260024)]"
6,6,4015285,2018-07-11 12:19:51+00:00,Twitter for Android,"{'annotations': [{'start': 52, 'end': 57, 'probability': 0.9207000000000001, 'type': 'Product', 'normalized_text': 'Huawei'}], 'mentions': [{'start': 0, 'end': 11, 'username': 'phuckislam', 'id': '207942295'}]}",938327557933481984,"[{'type': 'replied_to', 'id': '1017018522717704192'}]",False,1017018522717704192,1017020643664003072,everyone,207942295.0,,@phuckislam The irony that I liked this tweet on my Huawei phone is not lost on me. 🙄,,0.0,0.0,1.0,0.0,,,,,,,False,Mucki,"small government, free speech, kelpies, bacon, trump, @shaneshoemark",mucki777,False,2017-12-06 08:41:37+00:00,Sydney,,,676.0,936.0,1331.0,@phuckislam The irony that I liked this tweet on my Huawei phone is not lost on me. 🙄,2018-07-11,m6,2020-06-01,2,AUS,119,2_huaweimobile_raiderscanberra_huawei_therustedpixel,"[('huaweimobile', 0.04405091399652186), ('raiderscanberra', 0.04145558283729417), ('huawei', 0.04116866854507943), ('therustedpixel', 0.03907661502170961), ('belon1986', 0.029979645791389287), ('ivabiggun71', 0.029979645791389287), ('fishywanda', 0.02811695793111795), ('just', 0.021724519897670987), ('like', 0.02110821537168777), ('jeremyjmitchell', 0.018509358733309278), ('harrytuckerr', 0.018509358733309278), ('got', 0.017835212408170528), ('thats', 0.017546179430095833), ('phone', 0.017233486022118077), ('creative', 0.01687017475867077), ('love', 0.01687017475867077), ('phones', 0.016281996242958088), ('im', 0.016018514180553403), ('pretty', 0.015792802332535123), ('work', 0.014771819105709301)]"
7,7,4015428,2018-07-11 11:26:12+00:00,Prescieŋt Info,"{'hashtags': [{'start': 0, 'end': 7, 'tag': 'Huawei'}, {'start': 29, 'end': 32, 'tag': 'US'}, {'start': 52, 'end': 66, 'tag': 'PrescientInfo'}], 'urls': [{'start': 67, 'end': 90, 'url': 'https://t.co/YdMem4OVR7', 'expanded_url': 'https://prescient.info/xgxSBXm3/', 'display_url': 'prescient.info/xgxSBXm3/'}]}",3190279062,,False,1017007144519045120,1017007144519045120,everyone,,,#Huawei says does not expect #US sanctions: press - #PrescientInfo https://t.co/YdMem4OVR7,,0.0,0.0,0.0,0.0,,,,,,,False,Quantum Electric Monk,"Deep in the heart of Australia, in the middle of a brutal desert, there lies a hidden treasure. Many people have looked for it and just as many have failed.",QuantumEMonk,False,2015-05-10 02:37:56+00:00,Australia,https://t.co/n9DPz1Fj1g,,17081.0,5656.0,178302.0,#Huawei says does not expect #US sanctions: press - #PrescientInfo,2018-07-11,m6,2020-06-01,-1,AUS,983,-1_huawei_china_chinese_5g,"[('huawei', 0.024465978490767162), ('china', 0.016171303305510334), ('chinese', 0.016117524135508345), ('5g', 0.01481923681594938), ('auspol', 0.012050279782231773), ('chinas', 0.011577122043916645), ('network', 0.011474267920155318), ('huaweis', 0.011032983832263652), ('government', 0.010895449880417905), ('nyt', 0.01038360299036452), ('security', 0.01009115153750317), ('john', 0.009768137464887474), ('phone', 0.00960987664979409), ('lord', 0.009555468034517947), ('new', 0.009165001524514306), ('company', 0.008805082078123897), ('market', 0.008781147438451575), ('australia', 0.008607778014381437), ('australian', 0.008365908200613762), ('tech', 0.008043039333984243)]"
8,8,4015443,2018-07-11 11:21:00+00:00,Zoho Social,"{'hashtags': [{'start': 93, 'end': 100, 'tag': 'Huawei'}, {'start': 101, 'end': 111, 'tag': 'PowerBank'}, {'start': 112, 'end': 118, 'tag': 'AP09Q'}, {'start': 119, 'end': 128, 'tag': '10000mAh'}, {'start': 129, 'end': 145, 'tag': 'GeardoAustralia'}], 'annotations': [{'start': 18, 'end': 23, 'probability': 0.3795, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 37, 'end': 90, 'probability': 0.3125, 'type': 'Product', 'normalized_text': 'HUAWEI POWER BANK AP09Q 10000MAH From Geardo Australia'}], 'urls': [{'start': 188, 'end': 211, 'url': 'https://t.co/PSfPD4oPCP', 'expanded_url': 'https://zurl.co/6jnwL', 'display_url': 'zurl.co/6jnwL'}, {'start': 212, 'end': 235, 'url': 'https://t.co/ghT4aNQ5Jz', 'expanded_url': 'https://twitter.com/Geardo_au/status/1017005834339958784/photo/1', 'display_url': 'pic.twitter.com/ghT4aNQ5Jz', 'media_key': '3_1017005832393908225'}]}",844063546199719936,,False,1017005834339958784,1017005834339958784,everyone,,{'media_keys': ['3_1017005832393908225']},Latest Release By Huawei . Grab Your HUAWEI POWER BANK AP09Q 10000MAH From Geardo Australia. #Huawei #PowerBank #AP09Q #10000mAh #GeardoAustralia\nFor limited Time On Sale $26.65\nVisit Now https://t.co/PSfPD4oPCP https://t.co/ghT4aNQ5Jz,,0.0,0.0,0.0,0.0,,,,,,,False,Geardo,Latest Gadgets and accessories providers in Australia. Genuine Quality Products.,Geardo_au,False,2017-03-21 05:50:06+00:00,"Melbourne, Victoria",,,18.0,0.0,429.0,Latest Release By Huawei . Grab Your HUAWEI POWER BANK AP09Q 10000MAH From Geardo Australia. #Huawei #PowerBank #AP09Q #10000mAh #GeardoAustralia For limited Time On Sale $26.65 Visit Now,2018-07-11,m6,2020-06-01,-1,AUS,983,-1_huawei_china_chinese_5g,"[('huawei', 0.024465978490767162), ('china', 0.016171303305510334), ('chinese', 0.016117524135508345), ('5g', 0.01481923681594938), ('auspol', 0.012050279782231773), ('chinas', 0.011577122043916645), ('network', 0.011474267920155318), ('huaweis', 0.011032983832263652), ('government', 0.010895449880417905), ('nyt', 0.01038360299036452), ('security', 0.01009115153750317), ('john', 0.009768137464887474), ('phone', 0.00960987664979409), ('lord', 0.009555468034517947), ('new', 0.009165001524514306), ('company', 0.008805082078123897), ('market', 0.008781147438451575), ('australia', 0.008607778014381437), ('australian', 0.008365908200613762), ('tech', 0.008043039333984243)]"
9,9,4015484,2018-07-11 11:07:03+00:00,dlvr.it,"{'annotations': [{'start': 19, 'end': 24, 'probability': 0.7669, 'type': 'Organization', 'normalized_text': 'Huawei'}], 'urls': [{'start': 35, 'end': 58, 'url': 'https://t.co/Ij9NvL5JzO', 'expanded_url': 'http://dlvr.it/QbBBfK', 'display_url': 'dlvr.it/QbBBfK'}]}",580030935,,False,1017002325292941312,1017002325292941312,everyone,,,State govt defends Huawei contract https://t.co/Ij9NvL5JzO,,0.0,0.0,0.0,0.0,,,,,,,False,Ruby Martin,Sharing information concerning small business and entrepreneurship throughout Australia,AUTSmallBiz,False,2012-05-14 14:59:24+00:00,Sydney,,,3454.0,3409.0,161903.0,State govt defends Huawei contract,2018-07-11,m6,2020-06-01,-1,AUS,983,-1_huawei_china_chinese_5g,"[('huawei', 0.024465978490767162), ('china', 0.016171303305510334), ('chinese', 0.016117524135508345), ('5g', 0.01481923681594938), ('auspol', 0.012050279782231773), ('chinas', 0.011577122043916645), ('network', 0.011474267920155318), ('huaweis', 0.011032983832263652), ('government', 0.010895449880417905), ('nyt', 0.01038360299036452), ('security', 0.01009115153750317), ('john', 0.009768137464887474), ('phone', 0.00960987664979409), ('lord', 0.009555468034517947), ('new', 0.009165001524514306), ('company', 0.008805082078123897), ('market', 0.008781147438451575), ('australia', 0.008607778014381437), ('australian', 0.008365908200613762), ('tech', 0.008043039333984243)]"


In [21]:
df['in_reply_to_user_id'] = df['in_reply_to_user_id'].astype('Int64')
df['in_reply_to_user_id'] = df['in_reply_to_user_id'].astype(str)
df['author_id'] = df['author_id'].astype(str)
df['tweet_id'] = df['tweet_id'].astype(str)
df['conversation_id'] = df['conversation_id'].astype(str)

In [22]:
df.head(50)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,created_at,source,entities,author_id,referenced_tweets,possibly_sensitive,conversation_id,tweet_id,reply_settings,in_reply_to_user_id,attachments,text,geo_id,retweets,replies,likes,quotes,tweet_loc_short,place_type,tweet_loc_long,country,full_text,tweet_loc,verified,name,profile_desc,username,protected,profile_created,profile_loc,url,withheld,followers,following,total_tweets,cleaned_text,date,m_cat,month_cat,topic,country_cat,Count,Name,keywords
0,0,4014315,2018-07-11 23:32:21+00:00,Twitter Web Client,"{'urls': [{'start': 67, 'end': 90, 'url': 'https://t.co/k3a4hBcgg3', 'expanded_url': 'https://www.canberratimes.com.au/politics/western-australia/former-wa-minister-free-huawei-phones-kept-coming-up-in-chinese-20180711-p4zqxq.html', 'display_url': 'canberratimes.com.au/politics/weste…'}], 'annotations': [{'start': 25, 'end': 37, 'probability': 0.437, 'type': 'Product', 'normalized_text': 'Huawei phones'}], 'mentions': [{'start': 95, 'end': 109, 'username': 'canberratimes', 'id': '17125730'}]}",55113432,,False,1017189884711264256,1017189884711264256,everyone,,,Former WA minister: Free Huawei phones 'kept coming up in Chinese' https://t.co/k3a4hBcgg3 via @canberratimes,,0.0,0.0,0.0,0.0,,,,,,,False,lynlinking,"Sharing Political News & Current Affairs\nArticles by Independent Authors, Blogs, \nIndependent Newspapers & others \nSharing articles by #WgarNews \n I follow Back",lynlinking,False,2009-07-09 02:27:53+00:00,Australia,,,32183.0,35291.0,378999.0,Former WA minister: Free Huawei phones 'kep up in Chinese' via @canberratimes,2018-07-11,m6,2020-06-01,4,AUS,98,4_data_facebook_user_shared,"[('data', 0.14619563497907975), ('facebook', 0.12118788462092657), ('user', 0.09347864633439568), ('shared', 0.06658371455174689), ('access', 0.06369123086882364), ('zuckerberg', 0.05514332288010932), ('mark', 0.05514332288010932), ('gave', 0.04887174525279423), ('chinese', 0.04850362111108616), ('device', 0.04368261948577905), ('facebooks', 0.04082674011860171), ('australian', 0.04050222416759447), ('users', 0.03938354762877259), ('threat', 0.0385248089790342), ('prescientinfo', 0.03542518238106668), ('committee', 0.033395840276195306), ('makers', 0.03299446060346893), ('says', 0.031732796796069565), ('privacy', 0.02834014590485334), ('social', 0.02518607605403095)]"
1,1,4014316,2018-07-11 23:30:39+00:00,Twitter Web Client,"{'urls': [{'start': 63, 'end': 86, 'url': 'https://t.co/2S6Z6BtnBk', 'expanded_url': 'https://cnet.co/2m7SaGa', 'display_url': 'cnet.co/2m7SaGa'}], 'annotations': [{'start': 0, 'end': 8, 'probability': 0.9487000000000001, 'type': 'Place', 'normalized_text': 'Australia'}, {'start': 17, 'end': 22, 'probability': 0.8402000000000001, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 29, 'end': 30, 'probability': 0.7378, 'type': 'Organization', 'normalized_text': '5G'}], 'mentions': [{'start': 91, 'end': 96, 'username': 'CNET', 'id': '30261067'}]}",206445861,,False,1017189455319330816,1017189455319330816,everyone,,,Australia to ban Huawei from 5G rollout amid security concerns https://t.co/2S6Z6BtnBk via @CNET,,0.0,0.0,0.0,0.0,,,,,,,False,Eduardo Almeida,,geduardoalmeida,False,2010-10-23 00:11:19+00:00,Sydney,,,56.0,114.0,147.0,Australia to ban Huawei from 5G rollout amid security concerns via @CNET,2018-07-11,m6,2020-06-01,11,AUS,42,11_ban_looms_5g_slashes,"[('ban', 0.2552812140624132), ('looms', 0.20077168365243575), ('5g', 0.15392018205861477), ('slashes', 0.13534348431998378), ('warns', 0.12817540228977792), ('research', 0.11360681258206272), ('local', 0.11096158260487514), ('exclusion', 0.10398418855390186), ('executive', 0.08934061949426127), ('economy', 0.08781869109666174), ('offensive', 0.07657767385222394), ('weather', 0.07521012508982133), ('risks', 0.07162793535112166), ('possible', 0.07086059423027143), ('goes', 0.06991385579442431), ('rollouts', 0.06496970937959921), ('australia', 0.06442829747284846), ('googleausedhuawei', 0.051286348156877784), ('computerworld', 0.051286348156877784), ('pushes', 0.051051782568149295)]"
2,2,4014517,2018-07-11 20:00:13+00:00,Tumblr,"{'urls': [{'start': 108, 'end': 131, 'url': 'https://t.co/aWVLI3rKhH', 'expanded_url': 'https://tmblr.co/Zu1l4a2ZjqZnM', 'display_url': 'tmblr.co/Zu1l4a2ZjqZnM'}], 'annotations': [{'start': 71, 'end': 74, 'probability': 0.41140000000000004, 'type': 'Product', 'normalized_text': 'Sony'}, {'start': 77, 'end': 82, 'probability': 0.5118, 'type': 'Product', 'normalized_text': 'Huawei'}]}",920266768303755264,,False,1017136499777818624,1017136499777818624,everyone,,,"Why Ignoring Mobile Phone Repairs Adelaide Will Cheap Cost - These are Sony, Huawei as well as LG. These... https://t.co/aWVLI3rKhH",,0.0,0.0,0.0,0.0,,,,,,,False,Mobile Repair Shop,,M_RepairShop,False,2017-10-17 12:34:30+00:00,"344A Magill Road, SA Australia",https://t.co/wVeaCCaJcm,,79.0,878.0,664.0,"Why Ignoring Mobile Phone Repairs Adelaide Will Cheap Cost - These are Sony, Huawei as well as LG. These...",2018-07-11,m6,2020-06-01,15,AUS,37,15_trusted_desperately_convince_trying,"[('trusted', 0.10890219512055276), ('desperately', 0.08347595156055189), ('convince', 0.08347595156055189), ('trying', 0.07044727734773171), ('australian', 0.05872970214819679), ('politicians', 0.04803771194921652), ('chinas', 0.04485837322699218), ('bribery', 0.04423737773335662), ('australia', 0.03901026017663523), ('telco', 0.038872857878964065), ('prominent', 0.03770724941074603), ('adelaide', 0.0343679052523468), ('huawei', 0.0296880621331647), ('intensifies', 0.02815109235705388), ('lobbies', 0.026907957531368258), ('friction', 0.026907957531368258), ('chinaaustralia', 0.026907957531368258), ('fabricate', 0.026907957531368258), ('repairing', 0.026907957531368258), ('scoopit', 0.026907957531368258)]"
3,3,4014757,2018-07-11 16:43:52+00:00,Fan Page App,"{'urls': [{'start': 256, 'end': 279, 'url': 'https://t.co/glt6o2a98B', 'expanded_url': 'http://uk.businessinsider.com/r-australia-prepares-to-ban-huawei-from-5g-project-over-security-fears-2018-7', 'display_url': 'uk.businessinsider.com/r-australia-pr…'}], 'annotations': [{'start': 0, 'end': 8, 'probability': 0.8901, 'type': 'Place', 'normalized_text': 'Australia'}, {'start': 26, 'end': 31, 'probability': 0.8117000000000001, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 193, 'end': 196, 'probability': 0.9218000000000001, 'type': 'Place', 'normalized_text': 'U.S.'}, {'start': 225, 'end': 230, 'probability': 0.39540000000000003, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 235, 'end': 243, 'probability': 0.9992000000000001, 'type': 'Place', 'normalized_text': 'Australia'}]}",231651901,,False,1017087085344444416,1017087085344444416,everyone,,,"Australia prepares to ban Huawei from 5G project over security fears\nIts business serving small, rural telecom operators is now at risk after new attacks on the company in recent weeks by some U.S. lawmakers. The move to ban Huawei in Australia comes a... https://t.co/glt6o2a98B",,0.0,0.0,0.0,0.0,,,,,,,False,Adam Mulcahy,Founder e-Cruitment ♦ Fully Managed Offshore Outsourcing ♦ Outsourcing Partner ♦ Virtual Staff Expert ♦ Mr Outsource,adam_mulcahy,False,2010-12-29 03:17:04+00:00,Sydney,http://t.co/vu3JZP8Lyk,,614.0,1978.0,431.0,"Australia prepares to ban Huawei from 5G project over security fears Its business serving small, rural telecom operators is now at risk after new attacks on the company in recent weeks by some U.S. lawmakers. The move to ban Huawei in Australia comes a...",2018-07-11,m6,2020-06-01,-1,AUS,983,-1_huawei_china_chinese_5g,"[('huawei', 0.024465978490767162), ('china', 0.016171303305510334), ('chinese', 0.016117524135508345), ('5g', 0.01481923681594938), ('auspol', 0.012050279782231773), ('chinas', 0.011577122043916645), ('network', 0.011474267920155318), ('huaweis', 0.011032983832263652), ('government', 0.010895449880417905), ('nyt', 0.01038360299036452), ('security', 0.01009115153750317), ('john', 0.009768137464887474), ('phone', 0.00960987664979409), ('lord', 0.009555468034517947), ('new', 0.009165001524514306), ('company', 0.008805082078123897), ('market', 0.008781147438451575), ('australia', 0.008607778014381437), ('australian', 0.008365908200613762), ('tech', 0.008043039333984243)]"
4,4,4014879,2018-07-11 15:40:55+00:00,IFTTT,"{'urls': [{'start': 67, 'end': 90, 'url': 'https://t.co/fjAS2RJEPd', 'expanded_url': 'http://bit.ly/2NHwCMY', 'display_url': 'bit.ly/2NHwCMY'}, {'start': 91, 'end': 114, 'url': 'https://t.co/4IwtUVCrZe', 'expanded_url': 'http://bit.ly/ONymEF', 'display_url': 'bit.ly/ONymEF'}], 'annotations': [{'start': 31, 'end': 36, 'probability': 0.5446, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 57, 'end': 65, 'probability': 0.9913000000000001, 'type': 'Place', 'normalized_text': 'Australia'}], 'hashtags': [{'start': 0, 'end': 8, 'tag': 'weather'}]}",1036843507,,False,1017071242007007232,1017071242007007232,everyone,,,#weather news: Hammer blow for Huawei as 5G ban looms in Australia https://t.co/fjAS2RJEPd https://t.co/4IwtUVCrZe,,0.0,0.0,0.0,0.0,,,,,,,False,Long Beach Weather,"Automated weather tweets from an amateur weather station in Long Beach, NSW Australia. Not for official use, never rely on internet weather, look outside!",LongBeachNSW,False,2012-12-26 10:37:57+00:00,"Long Beach, NSW, Australia",https://t.co/gMx7lHCA4A,,677.0,1472.0,105790.0,#weather news: Hammer blow for Huawei as 5G ban looms in Australia,2018-07-11,m6,2020-06-01,11,AUS,42,11_ban_looms_5g_slashes,"[('ban', 0.2552812140624132), ('looms', 0.20077168365243575), ('5g', 0.15392018205861477), ('slashes', 0.13534348431998378), ('warns', 0.12817540228977792), ('research', 0.11360681258206272), ('local', 0.11096158260487514), ('exclusion', 0.10398418855390186), ('executive', 0.08934061949426127), ('economy', 0.08781869109666174), ('offensive', 0.07657767385222394), ('weather', 0.07521012508982133), ('risks', 0.07162793535112166), ('possible', 0.07086059423027143), ('goes', 0.06991385579442431), ('rollouts', 0.06496970937959921), ('australia', 0.06442829747284846), ('googleausedhuawei', 0.051286348156877784), ('computerworld', 0.051286348156877784), ('pushes', 0.051051782568149295)]"
5,5,4015026,2018-07-11 14:24:03+00:00,dlvr.it,"{'urls': [{'start': 87, 'end': 110, 'url': 'https://t.co/2W3pPnHbb3', 'expanded_url': 'http://dlvr.it/QbCDHK', 'display_url': 'dlvr.it/QbCDHK'}], 'annotations': [{'start': 0, 'end': 5, 'probability': 0.6413, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 26, 'end': 34, 'probability': 0.9312, 'type': 'Place', 'normalized_text': 'Australia'}]}",349313088,,False,1017051898468159488,1017051898468159488,everyone,,,Huawei is a test case for Australia in balancing the risks and rewards of Chinese tech https://t.co/2W3pPnHbb3,,0.0,0.0,0.0,0.0,,,,,,,False,T Lawrence,social media geek.,geek_au,False,2011-08-05 22:25:05+00:00,australia,,,872.0,753.0,163258.0,Huawei is a test case for Australia in balancing the risks and rewards of Chinese tech,2018-07-11,m6,2020-06-01,19,AUS,32,19_rewards_balancing_test_risks,"[('rewards', 0.21412362315203853), ('balancing', 0.21412362315203853), ('test', 0.1940585917901192), ('risks', 0.19047872004730415), ('case', 0.17821730947680656), ('tech', 0.11572668599110354), ('australia', 0.0917134767350178), ('chinese', 0.07809073624996878), ('conversationedu', 0.040728806015485314), ('vw', 0.031823107997619456), ('infrastructure', 0.0303076059558053), ('huawei', 0.03006635876976458), ('sell', 0.025771976206632897), ('national', 0.024668104068234862), ('china', 0.023371836878153356), ('make', 0.019993555381941273), ('chinesetech', 0.01894888235260024), ('challenges', 0.01894888235260024), ('cfo', 0.01894888235260024), ('proactive', 0.01894888235260024)]"
6,6,4015285,2018-07-11 12:19:51+00:00,Twitter for Android,"{'annotations': [{'start': 52, 'end': 57, 'probability': 0.9207000000000001, 'type': 'Product', 'normalized_text': 'Huawei'}], 'mentions': [{'start': 0, 'end': 11, 'username': 'phuckislam', 'id': '207942295'}]}",938327557933481984,"[{'type': 'replied_to', 'id': '1017018522717704192'}]",False,1017018522717704192,1017020643664003072,everyone,207942295.0,,@phuckislam The irony that I liked this tweet on my Huawei phone is not lost on me. 🙄,,0.0,0.0,1.0,0.0,,,,,,,False,Mucki,"small government, free speech, kelpies, bacon, trump, @shaneshoemark",mucki777,False,2017-12-06 08:41:37+00:00,Sydney,,,676.0,936.0,1331.0,@phuckislam The irony that I liked this tweet on my Huawei phone is not lost on me. 🙄,2018-07-11,m6,2020-06-01,2,AUS,119,2_huaweimobile_raiderscanberra_huawei_therustedpixel,"[('huaweimobile', 0.04405091399652186), ('raiderscanberra', 0.04145558283729417), ('huawei', 0.04116866854507943), ('therustedpixel', 0.03907661502170961), ('belon1986', 0.029979645791389287), ('ivabiggun71', 0.029979645791389287), ('fishywanda', 0.02811695793111795), ('just', 0.021724519897670987), ('like', 0.02110821537168777), ('jeremyjmitchell', 0.018509358733309278), ('harrytuckerr', 0.018509358733309278), ('got', 0.017835212408170528), ('thats', 0.017546179430095833), ('phone', 0.017233486022118077), ('creative', 0.01687017475867077), ('love', 0.01687017475867077), ('phones', 0.016281996242958088), ('im', 0.016018514180553403), ('pretty', 0.015792802332535123), ('work', 0.014771819105709301)]"
7,7,4015428,2018-07-11 11:26:12+00:00,Prescieŋt Info,"{'hashtags': [{'start': 0, 'end': 7, 'tag': 'Huawei'}, {'start': 29, 'end': 32, 'tag': 'US'}, {'start': 52, 'end': 66, 'tag': 'PrescientInfo'}], 'urls': [{'start': 67, 'end': 90, 'url': 'https://t.co/YdMem4OVR7', 'expanded_url': 'https://prescient.info/xgxSBXm3/', 'display_url': 'prescient.info/xgxSBXm3/'}]}",3190279062,,False,1017007144519045120,1017007144519045120,everyone,,,#Huawei says does not expect #US sanctions: press - #PrescientInfo https://t.co/YdMem4OVR7,,0.0,0.0,0.0,0.0,,,,,,,False,Quantum Electric Monk,"Deep in the heart of Australia, in the middle of a brutal desert, there lies a hidden treasure. Many people have looked for it and just as many have failed.",QuantumEMonk,False,2015-05-10 02:37:56+00:00,Australia,https://t.co/n9DPz1Fj1g,,17081.0,5656.0,178302.0,#Huawei says does not expect #US sanctions: press - #PrescientInfo,2018-07-11,m6,2020-06-01,-1,AUS,983,-1_huawei_china_chinese_5g,"[('huawei', 0.024465978490767162), ('china', 0.016171303305510334), ('chinese', 0.016117524135508345), ('5g', 0.01481923681594938), ('auspol', 0.012050279782231773), ('chinas', 0.011577122043916645), ('network', 0.011474267920155318), ('huaweis', 0.011032983832263652), ('government', 0.010895449880417905), ('nyt', 0.01038360299036452), ('security', 0.01009115153750317), ('john', 0.009768137464887474), ('phone', 0.00960987664979409), ('lord', 0.009555468034517947), ('new', 0.009165001524514306), ('company', 0.008805082078123897), ('market', 0.008781147438451575), ('australia', 0.008607778014381437), ('australian', 0.008365908200613762), ('tech', 0.008043039333984243)]"
8,8,4015443,2018-07-11 11:21:00+00:00,Zoho Social,"{'hashtags': [{'start': 93, 'end': 100, 'tag': 'Huawei'}, {'start': 101, 'end': 111, 'tag': 'PowerBank'}, {'start': 112, 'end': 118, 'tag': 'AP09Q'}, {'start': 119, 'end': 128, 'tag': '10000mAh'}, {'start': 129, 'end': 145, 'tag': 'GeardoAustralia'}], 'annotations': [{'start': 18, 'end': 23, 'probability': 0.3795, 'type': 'Organization', 'normalized_text': 'Huawei'}, {'start': 37, 'end': 90, 'probability': 0.3125, 'type': 'Product', 'normalized_text': 'HUAWEI POWER BANK AP09Q 10000MAH From Geardo Australia'}], 'urls': [{'start': 188, 'end': 211, 'url': 'https://t.co/PSfPD4oPCP', 'expanded_url': 'https://zurl.co/6jnwL', 'display_url': 'zurl.co/6jnwL'}, {'start': 212, 'end': 235, 'url': 'https://t.co/ghT4aNQ5Jz', 'expanded_url': 'https://twitter.com/Geardo_au/status/1017005834339958784/photo/1', 'display_url': 'pic.twitter.com/ghT4aNQ5Jz', 'media_key': '3_1017005832393908225'}]}",844063546199719936,,False,1017005834339958784,1017005834339958784,everyone,,{'media_keys': ['3_1017005832393908225']},Latest Release By Huawei . Grab Your HUAWEI POWER BANK AP09Q 10000MAH From Geardo Australia. #Huawei #PowerBank #AP09Q #10000mAh #GeardoAustralia\nFor limited Time On Sale $26.65\nVisit Now https://t.co/PSfPD4oPCP https://t.co/ghT4aNQ5Jz,,0.0,0.0,0.0,0.0,,,,,,,False,Geardo,Latest Gadgets and accessories providers in Australia. Genuine Quality Products.,Geardo_au,False,2017-03-21 05:50:06+00:00,"Melbourne, Victoria",,,18.0,0.0,429.0,Latest Release By Huawei . Grab Your HUAWEI POWER BANK AP09Q 10000MAH From Geardo Australia. #Huawei #PowerBank #AP09Q #10000mAh #GeardoAustralia For limited Time On Sale $26.65 Visit Now,2018-07-11,m6,2020-06-01,-1,AUS,983,-1_huawei_china_chinese_5g,"[('huawei', 0.024465978490767162), ('china', 0.016171303305510334), ('chinese', 0.016117524135508345), ('5g', 0.01481923681594938), ('auspol', 0.012050279782231773), ('chinas', 0.011577122043916645), ('network', 0.011474267920155318), ('huaweis', 0.011032983832263652), ('government', 0.010895449880417905), ('nyt', 0.01038360299036452), ('security', 0.01009115153750317), ('john', 0.009768137464887474), ('phone', 0.00960987664979409), ('lord', 0.009555468034517947), ('new', 0.009165001524514306), ('company', 0.008805082078123897), ('market', 0.008781147438451575), ('australia', 0.008607778014381437), ('australian', 0.008365908200613762), ('tech', 0.008043039333984243)]"
9,9,4015484,2018-07-11 11:07:03+00:00,dlvr.it,"{'annotations': [{'start': 19, 'end': 24, 'probability': 0.7669, 'type': 'Organization', 'normalized_text': 'Huawei'}], 'urls': [{'start': 35, 'end': 58, 'url': 'https://t.co/Ij9NvL5JzO', 'expanded_url': 'http://dlvr.it/QbBBfK', 'display_url': 'dlvr.it/QbBBfK'}]}",580030935,,False,1017002325292941312,1017002325292941312,everyone,,,State govt defends Huawei contract https://t.co/Ij9NvL5JzO,,0.0,0.0,0.0,0.0,,,,,,,False,Ruby Martin,Sharing information concerning small business and entrepreneurship throughout Australia,AUTSmallBiz,False,2012-05-14 14:59:24+00:00,Sydney,,,3454.0,3409.0,161903.0,State govt defends Huawei contract,2018-07-11,m6,2020-06-01,-1,AUS,983,-1_huawei_china_chinese_5g,"[('huawei', 0.024465978490767162), ('china', 0.016171303305510334), ('chinese', 0.016117524135508345), ('5g', 0.01481923681594938), ('auspol', 0.012050279782231773), ('chinas', 0.011577122043916645), ('network', 0.011474267920155318), ('huaweis', 0.011032983832263652), ('government', 0.010895449880417905), ('nyt', 0.01038360299036452), ('security', 0.01009115153750317), ('john', 0.009768137464887474), ('phone', 0.00960987664979409), ('lord', 0.009555468034517947), ('new', 0.009165001524514306), ('company', 0.008805082078123897), ('market', 0.008781147438451575), ('australia', 0.008607778014381437), ('australian', 0.008365908200613762), ('tech', 0.008043039333984243)]"


In [7]:
len(df)

103846

In [23]:
df.dtypes

Unnamed: 0               int64
Unnamed: 0.1             int64
created_at              object
source                  object
entities                object
author_id               object
referenced_tweets       object
possibly_sensitive        bool
conversation_id         object
tweet_id                object
reply_settings          object
in_reply_to_user_id     object
attachments             object
text                    object
geo_id                  object
retweets               float64
replies                float64
likes                  float64
quotes                 float64
tweet_loc_short         object
place_type              object
tweet_loc_long          object
country                 object
full_text               object
tweet_loc               object
verified                  bool
name                    object
profile_desc            object
username                object
protected                 bool
profile_created         object
profile_loc             object
url     

In [24]:
df.to_excel(r"D:\Work\huawei\tweets-with-country-and-topics-v2.xlsx", engine='xlsxwriter')