In [2]:
#import modules used for performance profiling
import time
import humanize
import psutil

#import database module
from pymongo import MongoClient

#import modules used for network analysis
import networkx as nx
import igraph
import leidenalg

#import modules used for WOC analysis
import numpy as np
from wisdom_of_crowds import Crowd
from wisdom_of_crowds import make_sullivanplot

#import modules used for text analysis
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF

In [3]:
#VARIABLES NEEDED TO CONNECT TO AND QUERY DATABASE

CONNECTION_STRING = "mongodb://JamIs:morticiaetpollito@118.138.244.29:27017/"

class Tweet:
	def __init__(self, tweet):
		self.id = tweet['id']
		self.user = tweet['user']
		self.connected_user = tweet['connected_user']
		self.connection_type = tweet['connection_type']
		self.text = tweet['text']

query = {"$and":
        	[
            	{"datetime": {"$eq": None}},
            	{"lang": 'en'},
            	{"connection_type": {"$exists": "true"}},
            	{"connection_type": {"$ne": None}},
            	{"connected_user": {"$ne": None}}
            ]
        }

In [4]:
#CONNECT TO DATABASE
print("connecting...")
client = MongoClient(CONNECTION_STRING)
tw_coll = client.get_database('Tw_Covid_DB').get_collection('tweets')
tu_coll = client.get_database('Tw_Covid_DB').get_collection('users')
print("connected")

connecting...
connected


In [7]:
#FIND DATE WITH THE MOST TWEETS
dates = {}

for tweet in tw_coll.find():
	#check date of tweet
	messy_date = tweet["created_at"] #"Thu Mar 12 02:01:57 +0000 2020"
	real_date = messy_date[:10] + messy_date[-4:]

	if real_date in dates.keys():
		#increment counter for each date
		dates[real_date] += 1
	else:
		dates[real_date] = 1

best_date = None
for date in dates.keys():
	if best_date is None:
		best_date = date
	if dates[best_date] < dates[date]:
		best_date = date

print(best_date)
print(dates[best_date])

Wed Mar 182020
4753798


In [8]:
#RUN QUERY
query_results = tw_coll.find(query)
db_tweets = []
for t in query_results:
	db_tweets.append(Tweet(t))
print(len(db_tweets))

7616588


In [20]:
#UNUSED CODE FOR CULLING RETURNED TWEETS ACCORDING TO USER CRITERIA

tweets_by_id = {}
final_tweets = {}
start_time = time.time()

try:
	for t in tweets:
		#we do a mapping of tweets to users this way so we can filter users based on ...location? eventually
		#user_tweet_mapping is an interim mapping we use to get the users from the database. At query time we add any additional criteria
		#tweets is a interm list of Tweets objects so we can keep the tweet id, users id, and connected_user id bundled during the batched query
		tweets_by_id[t.id] = t
		if t.user not in user_tweet_mapping:
			user_tweet_mapping[t.user] = []
		if t.connected_user not in user_tweet_mapping:
			user_tweet_mapping[t.connected_user] = []
		user_tweet_mapping[t.user].append(t.id) 
		user_tweet_mapping[t.connected_user].append(t.id)
		
		if len(user_tweet_mapping) > 4096:
			user_list = user_tweet_mapping.keys()
			users = tu_coll.find({"_id":{"$in":list(user_list)}})#list()}}) #get the users
			for user in users:
				# if the user is found by our constrained query, the Tweet object with the bundled tweet id, users id, and connected_user id
				# is added to the final_tweets list. The final_tweets list therefore only has tweets by users who meet our criteria
				for tweet_id in user_tweet_mapping[user['id']]:
					final_tweets[tweet_id] = tweets[tweet_id]
			user_tweet_mapping = {} #reset
except Exception as e:
	print(e)
finally:
	end_time = time.time()
	print("Tweets loaded: {}".format(len(final_tweets)))
	print("Time taken: {}".format(humanize.precisedelta(end_time - start_time, suppress=['days', 'milliseconds', 'microseconds'])))
	print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))

'dict' object has no attribute 'user'
Tweets loaded: 0
Time taken: 0 seconds
Memory used: 11.2 GB


In [36]:
def ig_plot(i_g, layout="auto", filename=None):
	print(f"plotting {layout}:")
	filename = filename or f"ig_{layout}.pdf"
	start_time = time.time()
	if hasattr(i_g, 'membership'):
		vertex_colour = i_g.membership
	else:
		vertex_colour = "black"
	igraph.plot(
		i_g, 
		layout=layout, 
		target=filename, 
		vertex_size=5, 
		edge_arrow_size=0.5, 
		edge_arrow_width = 0.5,
		#vertex_label = i_g.vs["name"],
		vertex_color=vertex_colour,
		palette=igraph.RainbowPalette(),
	)
	end_time = time.time()
	print("plotting complete")
	print("Time taken: {}".format(humanize.precisedelta(end_time - start_time, suppress=['days', 'milliseconds', 'microseconds'])))
	print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))

In [30]:
#CREATE THE NETWORKX GRAPH

nx_g = nx.DiGraph()
print("building networkx graph...")
start_time = time.time()
for tweet in tweets:
	if tweet.user == "299200325":
		print(f"{tweet.user} tweeted {tweet.text}")
	if tweet.connected_user == "299200325":
		print(f"{tweet.connected_user} tweeted {tweet.user}")

	if tweet.user not in nx_g:
		nx_g.add_node(tweet.user)
	
	if tweet.connected_user not in nx_g:
		nx_g.add_node(tweet.connected_user)
	
	if tweet.connected_user != tweet.user:
		if tweet.connected_user not in nx_g[tweet.user]:
			nx_g.add_edge(tweet.user, tweet.connected_user, weight=1, tweets=[tweet.id])
		else:
			nx_g[tweet.user][tweet.connected_user]['weight'] += 1
			nx_g[tweet.user][tweet.connected_user]['tweets'].append(tweet.id)
end_time = time.time()
print("Time taken: {}".format(humanize.precisedelta(end_time - start_time, suppress=['days', 'milliseconds', 'microseconds'])))
print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))
print("Nodes: {}".format(nx_g.number_of_nodes()))
print("Edges: {}".format(nx_g.number_of_edges()))
print("Density: {}".format(nx.density(nx_g)))
print("networkx graph built")

building networkx graph...
Time taken: 2 minutes and 3 seconds
Memory used: 14.1 GB
Nodes: 3617873
Edges: 7220344
Density: 5.516344421444012e-07
networkx graph built


In [29]:
degree_sequence = sorted([(n, d) for n, d in nx_g.degree()], key=lambda x: x[1], reverse=True)
central_nodes = degree_sequence[:5]
central_nodes = [{'name': n, 'degree': d} for n, d in central_nodes]
for node in central_nodes:
		print("Node id: ", node['name'])
		print("Node degree: ", node['degree'])
		print("Node outedges: ", nx_g.out_degree(node['name']))
		print("Node inedges: ", nx_g.in_degree(node['name']))

		for tweet in tweets:
			if tweet.user == node['name']:
				print(tweet.text)

Node id:  470021270
Node degree:  48900
Node outedges:  0
Node inedges:  48900
I want to add there are lines out the door for gun shops?! Excuse me??? https://t.co/MRago1EVHQ
Node id:  165213594
Node degree:  44587
Node outedges:  2
Node inedges:  44585
Lmao what!? https://t.co/ItkQyWD3PO
They closed for 50 days complete lock off. sterilised whole cities, took temperatures of people everywhere upon ent‚Ä¶ https://t.co/9HfIPlPjY1
Node id:  466519303
Node degree:  40151
Node outedges:  3
Node inedges:  40148
Read @NaomiAKlein right now!
There was never anyone cooler than @gloriagaynor BUT she is now even cooler because of the #IWillSurviveChallenge.‚Ä¶ https://t.co/6T5bwVDJGK
RT @JonWiener1: What we need to do now about the coronavirus and the elections:  John Nichols @NicholsUprising explains universal vote by m‚Ä¶
Node id:  25073877
Node degree:  36700
Node outedges:  9
Node inedges:  36691
....Together we are putting into policy a plan to prevent, detect, treat and create a vaccine ag

In [10]:
print("building igraph from networkx graph")
start_time = time.time()
i_g = igraph.Graph.from_networkx(nx_g, vertex_attr_hashable="name")
end_time = time.time()
print("Time taken: {}".format(humanize.precisedelta(end_time - start_time, suppress=['days', 'milliseconds', 'microseconds'])))
print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))
print("Nodes: {}".format(len(i_g.vs)))
print("Edges: {}".format(len(i_g.es)))
print("Transitivity: {}".format(i_g.transitivity_undirected()))
print("igraph from networkx built")

building igraph from networkx graph
Time taken: 20 seconds
Memory used: 10.0 GB
Nodes: 3617873
Edges: 7220344
Transitivity: 0.00010257998573380519
igraph from networkx built


In [32]:
nodes = sorted(i_g.vs, key=lambda vertex: vertex.degree(), reverse=True)
central_nodes = nodes[:5]
for node in central_nodes:
		print("Node id: ", node['name'])
		print("Node degree: ", node.degree())
		print("Node outedges: ", node.outdegree())
		print("Node inedges: ", node.indegree())

		for tweet in tweets:
			if tweet.user == node['name']:
				print(tweet.text)

Node id:  470021270
Node degree:  48900
Node outedges:  0
Node inedges:  48900
I want to add there are lines out the door for gun shops?! Excuse me??? https://t.co/MRago1EVHQ
Node id:  165213594
Node degree:  44587
Node outedges:  2
Node inedges:  44585
Lmao what!? https://t.co/ItkQyWD3PO
They closed for 50 days complete lock off. sterilised whole cities, took temperatures of people everywhere upon ent‚Ä¶ https://t.co/9HfIPlPjY1
Node id:  466519303
Node degree:  40151
Node outedges:  3
Node inedges:  40148
Read @NaomiAKlein right now!
There was never anyone cooler than @gloriagaynor BUT she is now even cooler because of the #IWillSurviveChallenge.‚Ä¶ https://t.co/6T5bwVDJGK
RT @JonWiener1: What we need to do now about the coronavirus and the elections:  John Nichols @NicholsUprising explains universal vote by m‚Ä¶
Node id:  25073877
Node degree:  36700
Node outedges:  9
Node inedges:  36691
....Together we are putting into policy a plan to prevent, detect, treat and create a vaccine ag

In [33]:
print("leidenalg:")
start_time = time.time()
ig_community_graph = leidenalg.find_partition(i_g.connected_components("weak").giant(), leidenalg.ModularityVertexPartition);
print("Graphs: {}".format(len(ig_community_graph.subgraphs())))
end_time = time.time()
print("leidenalg analysis complete")
print("Time taken: {}".format(humanize.precisedelta(end_time - start_time, suppress=['days', 'milliseconds', 'microseconds'])))
print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))


leidenalg:
Graphs: 2069
leidenalg analysis complete
Time taken: 1 minute and 1 second
Memory used: 15.3 GB


In [None]:
ig_plot(ig_community_graph, "fruchterman_reingold", filename="leidencommunities.pdf")

In [34]:
# Get all subgraphs
subgraphs = ig_community_graph.subgraphs()
# Sort subgraphs by size in descending order
sorted_subgraphs = sorted(subgraphs, key=lambda x: len(x.vs), reverse=True)
# Get the largest 4 subgraphs
largest_4_subgraphs = sorted_subgraphs[:4]
for i, subgraph in enumerate(largest_4_subgraphs):
	print(f"Community {i}: {len(subgraph.vs)} nodes")
	print(f"Community {i} as a proportion of total: {len(subgraph.vs)/len(i_g.vs)}")
# Filter subgraphs with less than 10 nodes
small_subgraphs = [sg for sg in subgraphs if len(sg.vs) < 10]
print("Small Graphs: {}".format(len(small_subgraphs)))
# Count the number of small subgraphs
num_small_subgraphs = len(small_subgraphs)

Subgraph 1: 629118 nodes
Subgraph 1 as a proportion of total: 0.1738916761312517 nodes
Subgraph 2: 384460 nodes
Subgraph 2 as a proportion of total: 0.10626685900804146 nodes
Subgraph 3: 234365 nodes
Subgraph 3 as a proportion of total: 0.06477977529891182 nodes
Subgraph 4: 218652 nodes
Subgraph 4 as a proportion of total: 0.060436615657874115 nodes
Small Graphs: 1643


In [35]:
for i, subgraph in enumerate(largest_4_subgraphs):
	#order the edges in a community by weight
	edges = sorted(subgraph.es, key=lambda edge: edge['weight'], reverse=True)
	top_five = edges[:5]

	# Print the top five edges in the community
	print(f"Top five edges in community {i}:")
	for edge in top_five:
		print(f"{edge.source} -> {edge.target}: {edge['weight']} tweets")

Top five edges in community:
51108 -> 51109: 6 tweets
5237 -> 16336: 5 tweets
80457 -> 16336: 4 tweets
104503 -> 104504: 4 tweets
187258 -> 1194: 4 tweets
Top five edges in community:
206038 -> 710: 52 tweets
93367 -> 172: 48 tweets
11304 -> 11303: 44 tweets
9911 -> 72: 43 tweets
4103 -> 72: 42 tweets
Top five edges in community:
558 -> 15: 95 tweets
34120 -> 7662: 62 tweets
21059 -> 13979: 47 tweets
136214 -> 13979: 44 tweets
28831 -> 1944: 40 tweets
Top five edges in community:
3525 -> 3526: 72 tweets
9388 -> 3526: 62 tweets
10704 -> 10705: 52 tweets
3526 -> 3525: 25 tweets
17570 -> 22: 24 tweets


In [36]:
for i, community in enumerate(largest_4_subgraphs):
	print ("Community ", i)
	#Find higher centrality nodes in each subgraph
	nodes = sorted(community.vs, key=lambda vertex: vertex.degree(), reverse=True)
	central_nodes = nodes[:10]
	

    #2 Get tweets from higher centrality nodes in each subgraph
	#get each edge for each author
	for node in central_nodes:
		print("Node id: ", node['name'])
		print("Node outedges: ", node.outdegree())
		print("Node inedges: ", node.indegree())

		for tweet in tweets:
			if tweet.user == node['name']:
				print(tweet.text)

Community  0
Node id:  165213594
Node outedges:  0
Node inedges:  34761
Lmao what!? https://t.co/ItkQyWD3PO
They closed for 50 days complete lock off. sterilised whole cities, took temperatures of people everywhere upon ent‚Ä¶ https://t.co/9HfIPlPjY1
Node id:  299200325
Node outedges:  0
Node inedges:  25495
Node id:  25873558
Node outedges:  2
Node inedges:  23713
Not enough praise has gone to medical professionals worldwide. From those in the labs to those bedside https://t.co/dN6bsfD2jo
RT @mvazquez17: Me sending my dog out for supplies since he can‚Äôt contract COVID-19 https://t.co/4wrTWjqpTv
RT @EchoXrayMusic: Tell them stay closed
RT @JoshuaYJackson: Doctors from China &amp; Cuba are in Europe, Latin America &amp; Africa helping countries fight Coronavirus, putting themselves‚Ä¶
Node id:  309465874
Node outedges:  1
Node inedges:  21140
RT @the_mavs_fan: America could what? Say it with me!!

N. E. V. E. R. !!!!!!
Spread the good details just like you spread the negative ones
Nod

In [49]:
print("transferring igraph communities to networkx")
start_time = time.time()
#shift community info from igraph to networkx
#for each node in networkx graph
#look up matching node in igraph
#assign T property of networkx graph node to community value from igraph node
# Get the membership list from the igraph partition
membership = ig_community_graph.membership
# For each node in the networkx graph
for node in nx_g.nodes():
	# Look up the matching node in the igraph graph
	ig_node_index = i_g.vs.find('_nx_name'==node).index
	# Assign the 'T' property of the networkx node to the community value from the igraph node
	nx_g.nodes[node]['T'] = membership[ig_node_index]
#delete all nodes that don't have a membership value - these nodes weren't in the igraph largest connected component
print(len([node for node in nx_g if 'T' not in nx_g.nodes[node]]))
print(len([node for node in nx_g if 'T' in nx_g.nodes[node]]))
nx_g.remove_nodes_from([node for node in nx_g if 'T' not in nx_g.nodes[node]])
print("Time taken: {}".format(humanize.precisedelta(end_time - start_time, suppress=['days', 'milliseconds', 'microseconds'])))
print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))
print("igraph communities transferred to networkx")

transferring igraph communities to networkx
0
3617873
Time taken: 57 seconds
Memory used: 12.5 GB
igraph communities transferred to networkx


In [None]:
#do wisdom of the crowds analysis
print("wisdom of the crowds:")
start_time = time.time()
c = Crowd(nx_g)
s_set = []
d_set = []
for node in c.node_set:
	s_set.append(c.S(node))
	d_set.append(c.D(node))
s_set = np.array(s_set)
d_set = np.array(d_set)
œÄ_set = np.multiply(s_set,d_set)
print("Time taken: {}".format(humanize.precisedelta(end_time - start_time, suppress=['days', 'milliseconds', 'microseconds'])))
print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))
print("wisdom of the crowds complete")
make_sullivanplot(œÄ_set,d_set,s_set)
make_sullivanplot(œÄ_set,d_set,s_set,colormap='magma_r',yscale='log')

In [50]:
n_topics = 10
init = "nndsvda"
batch_size = 256

In [1]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Add your custom words
# Note: ‚ÄúWuhanVirus‚Äù not included as it is politically charged
custom_words = ["coronavirus", "2019nCoV", "corona virus", "COVD19", "CoronavirusPandemic", "COVID-19", "CoronaOutbreak", 
				"pneumonia", "pneumonie", "neumonia", "lungenentz√ºndung", "COVID19", "http", "https", "https://", "19", "just", "people", "rt"]

# Extend the default English stop words list with your words
stop_words = ENGLISH_STOP_WORDS.union(custom_words)

print("Vectorising (TF-IDF)...")
TFIDFvectorizer = TfidfVectorizer(
	min_df=0.05,
	stop_words=stop_words
)
start_time = time.time()
TFIDFvectorised_dataset = TFIDFvectorizer.fit_transform([tweet.text for tweet in tweets])
print(f"n_samples/documents: {TFIDFvectorised_dataset.shape[0]}, n_features/words: {TFIDFvectorised_dataset.shape[1]}") #shape is rows of the matrix in that dimension
print(f"Sparsity (number of cells with non-zero values): {TFIDFvectorised_dataset.nnz / np.prod(TFIDFvectorised_dataset.shape):.3f}")
print(f"vectorization done in {humanize.precisedelta(time.time() - start_time, suppress=['days', 'microseconds'])}")
print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))

Vectorising (TF-IDF)...


NameError: name 'TfidfVectorizer' is not defined

In [52]:
print("Kmeans clustering...")

def fit_and_evaluate(km, X, name=None, n_runs=5):
	name = km.__class__.__name__ if name is None else name

	train_times = []
	scores = [] #score = "Silhouette Coefficient"
	for seed in range(n_runs):
		km.set_params(random_state=seed)
		start_time = time.time()
		km.fit(X)
		train_times.append(time.time() - start_time)
		scores.append(
			metrics.silhouette_score(X, km.labels_, sample_size=4000)
		)
	train_times = np.asarray(train_times)

	print(f"clustering done in {humanize.precisedelta(train_times.mean(), suppress=['days', 'microseconds'])} ¬± {humanize.precisedelta(train_times.std(), suppress=['days', 'microseconds'])} ")
	evaluation = {
		"estimator": name,
		"train_time": train_times.mean(),
	}
	evaluation_std = {
		"estimator": name,
		"train_time": train_times.std(),
	}
	mean_score, std_score = np.mean(scores), np.std(scores)
	print(f"Silhouette Coefficient: {mean_score:.3f} ¬± {std_score:.3f}")
	# evaluations.append(evaluation)
	# evaluations_std.append(evaluation_std)

start_time = time.time()
for seed in range(5):
	kmeans = KMeans(
		n_clusters=n_topics,
		max_iter=100,
		n_init=1,
		random_state=seed,
	).fit(TFIDFvectorised_dataset)
	cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
	print(f"Number of elements assigned to each cluster: {cluster_sizes}")

kmeans = KMeans(
	n_clusters=n_topics,
	max_iter=100,
	n_init=10,
)

fit_and_evaluate(kmeans, TFIDFvectorised_dataset, name="KMeans on tf-idf vectors")
print(f"kmeans clustering done in {humanize.precisedelta(time.time() - start_time, suppress=['days', 'microseconds'])}")
print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))

Kmeans clustering...
Number of elements assigned to each cluster: [ 624549 1629895 1929417  490726  699247  812991  497375  446619  148491
  337278]
Number of elements assigned to each cluster: [ 105428 1929417  342789  476218 1308763  624745  497380  435971  366553
 1529324]
Number of elements assigned to each cluster: [1629895 1929417  699247  490726  497375  446619  624549  337278  812991
  148491]
Number of elements assigned to each cluster: [ 435774 1629921 1929417 1304531  342789  497375  624549  366267  337278
  148687]
Number of elements assigned to each cluster: [1313588 1929417 1629921   54266  342789  476214  497380  365438  435972
  571603]


In [None]:
def plot_top_words(model, feature_names, n_top_words, title):
	fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
	axes = axes.flatten()
	for topic_idx, topic in enumerate(model.components_):
		top_features_ind = topic.argsort()[-n_top_words:]
		top_features = feature_names[top_features_ind]
		weights = topic[top_features_ind]

		ax = axes[topic_idx]
		ax.barh(top_features, weights, height=0.7)
		ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
		ax.tick_params(axis="both", which="major", labelsize=20)
		for i in "top right left".split():
			ax.spines[i].set_visible(False)
		fig.suptitle(title, fontsize=40)

	plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
	plt.show()


print("Fitting the NMF model (Frobenius norm) with tf-idf features")
start_time = time.time()
FrobNMF = NMF(
	n_components=n_topics,
	random_state=1,
	init=init,
	beta_loss="frobenius",
	solver="mu",
	alpha_W=0.00005,
	alpha_H=0.00005,
	l1_ratio=1,
).fit(TFIDFvectorised_dataset)
print(f"Done in {humanize.precisedelta(time.time() - start_time, suppress=['days', 'microseconds'])}")
print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))

print("\n", "Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features")
KLNMF = NMF(
	n_components=n_topics,
	random_state=1,
	init=init,
	beta_loss="kullback-leibler",
	solver="mu",
	max_iter=1000,
	alpha_W=0.00005,
	alpha_H=0.00005,
	l1_ratio=0.5,
).fit(TFIDFvectorised_dataset)
print(f"Done in {humanize.precisedelta(time.time() - start_time, suppress=['days', 'microseconds'])}")
print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))

print("\n", "Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf")
FrobMBNMF = MiniBatchNMF(
	n_components=n_topics,
	random_state=1,
	batch_size=batch_size,
	init=init,
	beta_loss="frobenius",
	alpha_W=0.00005,
	alpha_H=0.00005,
	l1_ratio=0.5,
).fit(TFIDFvectorised_dataset)
print(f"Done in {humanize.precisedelta(time.time() - start_time, suppress=['days', 'microseconds'])}")
print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))

print("\n", "Fitting the MiniBatchNMF model (generalized Kullback-Leibler divergence) with tf-idf")
KLMBNMF = MiniBatchNMF(
	n_components=n_topics,
	random_state=1,
	batch_size=batch_size,
	init=init,
	beta_loss="kullback-leibler",
	alpha_W=0.00005,
	alpha_H=0.00005,
	l1_ratio=0.5,
).fit(TFIDFvectorised_dataset)
print(f"Done in {humanize.precisedelta(time.time() - start_time, suppress=['days', 'microseconds'])}")
print("Memory used: {}".format(humanize.naturalsize(psutil.Process().memory_info().rss)))

tfidf_feature_names = TFIDFvectorizer.get_feature_names_out()

print("Plotting...")
n_top_words = 20
plot_top_words(
	FrobNMF,
	tfidf_feature_names,
	n_top_words,
	"Topics in NMF model (Frobenius norm)",
)

plot_top_words(
	KLNMF,
	tfidf_feature_names,
	n_top_words,
	"Topics in NMF model (generalized Kullback-Leibler divergence)",
)

plot_top_words(
	FrobMBNMF,
	tfidf_feature_names,
	n_top_words,
	"Topics in MiniBatchNMF model (Frobenius norm)",
)

plot_top_words(
	KLMBNMF,
	tfidf_feature_names,
	n_top_words,
	"Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)",
)