-
Notifications
You must be signed in to change notification settings - Fork 0
/
wordcloud.py
80 lines (55 loc) · 2.54 KB
/
wordcloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
##### Word cloud ####
import re
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Build `filtered_text`, the cleaned string the word cloud is generated from.
#
# Steps: pull the summaries out of the DataFrame, tokenize them, drop NLTK's
# English stopwords, then drop a custom list of uninformative words.
# Requires the NLTK tokenizer and stopword data to be installed
# (see nltk.download).
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  # previously missing -> NameError

# Extract the news articles column from the DataFrame.
# dropna() keeps ' '.join below from raising TypeError on missing (NaN) cells.
news_articles = df['Summary_clean'].dropna().tolist()

# Concatenate the news articles into a single string
text = ' '.join(news_articles)

# Tokenize the text into individual words
word_tokens = word_tokenize(text)

# Get the list of stopwords
stop_words = set(stopwords.words("english"))

# Remove stopwords from the list of words
filtered_words = [word for word in word_tokens if word.lower() not in stop_words]

# Extra words to drop from the cloud.
# 's', 'according' and 'the' come from the earlier per-article filters, which
# compared whole summaries against a single word and so never removed them.
# 'becae', 'acced' and 'hoe' are no longer listed: they were leftovers of
# deleting the substring 'us' from "because", "accused" and "house".
words_to_remove = [
    'may', 'due', 'bangladesh', 'said', 'among', 'one', 'us', 'will', 'went', 'now',
    'later', 'event', 'even', 'every', 'told', 'many', 's', 'according', 'the',
]
unwanted_words = set(words_to_remove)

# Drop unwanted words token by token. Matching whole tokens (instead of
# str.replace on the joined text) leaves words that merely contain one of
# them, e.g. "because" or "eventually", intact.
filtered_text = ' '.join(
    token for token in filtered_words if token.lower() not in unwanted_words
)
# Render the word cloud image from `filtered_text` and save it as a PNG.
# NOTE(review): if this script is itself saved as wordcloud.py it may shadow
# the `wordcloud` package on import -- confirm the filename.

# First pass: let WordCloud compute the word frequencies for the full text.
wordcloud = WordCloud(background_color='white')
wordcloud.generate(filtered_text)

# Keep only the first 75 entries of the computed frequency table
# (presumably ordered most-frequent first -- verify against WordCloud.words_).
ranked_items = list(wordcloud.words_.items())
top_words = {term: weight for term, weight in ranked_items[:75]}

# Second pass: lay out just those words on a larger canvas.
# Colormap options: https://www.kaggle.com/code/niteshhalai/wordcloud-colormap
wordcloud_top = WordCloud(
    background_color='white',
    width=1000,
    height=600,
    colormap="brg",
)
wordcloud_top.fit_words(top_words)

# High-DPI figure showing only the image, with the axes hidden.
plt.figure(dpi=1200)
plt.imshow(wordcloud_top, interpolation='bilinear')
plt.axis('off')

# Save the word cloud as a high-resolution image.
# PDF alternative: plt.savefig('wordcloud.pdf', format='pdf', dpi=1200)
plt.savefig('wordcloud.png', format='png', dpi=1200)
# For interactive viewing, call plt.show() here.