Persian text bugs were fixed #7

Merged
merged 4 commits into from
Dec 14, 2021

Changes from 2 commits
7 changes: 7 additions & 0 deletions README.md
@@ -12,3 +12,10 @@ Then run:
python src/chat_statistics/stats.py
```
to generate a word cloud of json data in `DATA_DIR`

## Adding Font:
Use the Vazir font, available from the following repository, to better display Persian words alongside English words:

https://github.com/rastikerdar/vazir-font/releases

Add **`Vazir.ttf`** to the **`src/data`** directory.
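
A quick way to confirm the font is in place before running the script; a minimal sketch, assuming the repository root is the working directory:

```
from pathlib import Path

# Assumed location for the downloaded font; adjust if your data directory differs.
font_path = Path("src/data/Vazir.ttf")
assert font_path.exists(), "Download Vazir.ttf from the release page above first."
```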
51 changes: 40 additions & 11 deletions src/chat_statistics/stats.py
@@ -1,8 +1,10 @@
import re
import json
siniorone marked this conversation as resolved.
from collections import Counter, defaultdict
from pathlib import Path
from typing import Union

import demoji
import arabic_reshaper
Collaborator

Imports are not sorted.

Suggested change
import demoji
import arabic_reshaper
import arabic_reshaper
import demoji

Collaborator Author
@siniorone Dec 14, 2021

Imports were sorted in the new commits.

from bidi.algorithm import get_display
from hazm import Normalizer, sent_tokenize, word_tokenize
@@ -89,6 +91,27 @@ def get_top_users(self, top_n: int = 10) -> dict:

return dict(Counter(users).most_common(top_n))

def remove_stopwords(self, text):
"""Removes stop-words from the text.

:param text: Text that may contain stop-words.
"""
tokens = word_tokenize(self.normalizer.normalize(text))
tokens = list(filter(lambda item: item not in self.stop_words, tokens))
return ' '.join(tokens)

def de_emojify(self, text):
"""Removes emojis and some special characters from the text.

:param text: Text that contains emoji
"""
regrex_pattern = re.compile(pattern = "["
"\u2069"
"\u2066"
"]+", flags = re.UNICODE)
Collaborator

What is this? Did you simply mean "[\u2069\u2066]+"? What are the spaces for?

Collaborator Author

Previously, the function was intended to handle more characters.
After the set of suspicious characters was reduced, the function's structure was not updated.
Thanks a lot.
Your suggestion has been added to the new commits.
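
For context, a minimal sketch of the simplified pattern as suggested, assuming the only targets are the invisible directional-isolate marks U+2066 (LEFT-TO-RIGHT ISOLATE) and U+2069 (POP DIRECTIONAL ISOLATE):

```
import re

# Matches runs of the invisible directional-isolate marks that can wrap
# portions of exported chat text.
isolate_pattern = re.compile("[\u2066\u2069]+")

sample = "\u2066https://example.com\u2069 سلام"
print(isolate_pattern.sub("", sample))  # -> "https://example.com سلام"
```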

text = regrex_pattern.sub(r'', text)
Collaborator

No need for the r prefix here.

Suggested change
text = regrex_pattern.sub(r'', text)
text = regrex_pattern.sub('', text)

Collaborator Author

The r prefix was deleted in the new commits.

return demoji.replace(text, " ")

def generate_word_cloud(
self,
output_dir: Union[str, Path],
@@ -102,22 +125,28 @@ def generate_word_cloud(
"""
logger.info("Loading text content...")
text_content = ''
for msg in self.chat_data['messages']:
if type(msg['text']) is str:
tokens = word_tokenize(msg['text'])
tokens = list(filter(lambda item: item not in self.stop_words, tokens))
text_content += f" {' '.join(tokens)}"

# normalize, reshape for final word cloud
text_content = self.normalizer.normalize(text_content)
text_content = arabic_reshaper.reshape(text_content)
messages = iter(self.chat_data['messages'])
for message in messages:
msg = message['text']
if isinstance(msg, list):
for sub_msg in msg:
if isinstance(sub_msg, str):
text_content += f" {self.remove_stopwords(sub_msg)}"
elif isinstance(sub_msg, dict) and sub_msg['type'] in {
'text_link', 'bold', 'italic', 'hashtag', 'mention', 'pre'}:
Collaborator

Better indentation:

Suggested change
elif isinstance(sub_msg, dict) and sub_msg['type'] in {
'text_link', 'bold', 'italic', 'hashtag', 'mention', 'pre'}:
elif isinstance(sub_msg, dict) and sub_msg['type'] in {
'text_link', 'bold', 'italic', 'hashtag', 'mention', 'pre'
}:

Collaborator Author

Applied.

text_content += f" {self.remove_stopwords(sub_msg['text'])}"
else:
text_content += f" {self.remove_stopwords(msg)}"

# reshape for final word cloud
text_content = arabic_reshaper.reshape(self.de_emojify(text_content))
text_content = get_display(text_content)

logger.info("Generating word cloud...")
# generate word cloud
wordcloud = WordCloud(
width=1200, height=1200,
font_path=str(DATA_DIR / 'BHoma.ttf'),
font_path=str(DATA_DIR / 'Vazir.ttf'),
background_color=background_color,
max_font_size=250
).generate(text_content)
@@ -131,4 +160,4 @@ def generate_word_cloud(
print(top_users)

chat_stats.generate_word_cloud(output_dir=DATA_DIR)
print('Done!')
print('Done!')
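
Taken together, the new code routes message text through roughly this pipeline before drawing the cloud; a standalone sketch, assuming demoji, arabic-reshaper, python-bidi, and wordcloud are installed and Vazir.ttf sits in src/data as the README change describes:

```
import re

import arabic_reshaper
import demoji
from bidi.algorithm import get_display
from wordcloud import WordCloud

text = "\u2066سلام دنیا\u2069 🙂 hello"

# Mirror de_emojify: strip directional-isolate marks, then replace emojis.
text = re.sub("[\u2066\u2069]+", "", text)
text = demoji.replace(text, " ")

reshaped = arabic_reshaper.reshape(text)  # join Persian letter forms
bidi_text = get_display(reshaped)         # reorder right-to-left text for rendering

WordCloud(font_path="src/data/Vazir.ttf", width=1200, height=1200,
          background_color="white").generate(bidi_text).to_file("wordcloud.png")
```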
186 changes: 184 additions & 2 deletions src/data/stopwords.txt
@@ -1,4 +1,23 @@

۱
۲
۳
۴
۵
۶
۷
۸
۹
۱۰
1
2
3
4
5
6
7
8
9
10
و
در
به
@@ -334,4 +353,167 @@
تو
میشه
اگه
اون
اون
?
??
؟
:
،
[
]
×
x
a
ی
=
!
(
)
.
بدین
کنین
الان
مثلا
می کنه
چی
هستن
کردین
؟؟
چیه
خوبی
»
«
>>
<<
>>>
>>>
ک
میده
احتمالا
سر
می کنم
دیگه
برا
اصلا
میکنم
میکنم
دیدم
بشه
نداره
کردید
کن
میکنه
کردن
ممکنه
باز
حالا
یعنی
بزنید
شدم
میخوام
دارید
بعدش
می تونید
یاد
اینو
می تونه
دارم
دارن
داره
میشن
میاد
کنی
اینه
کنن
نظرم
ببینید
میزنم
اوکی
برام
بدید
همون
می تونه
نمی تونم
پیدا
بده
بهش
نشد
میشم
خودم
گفتم
کنه
میگه
خوبه
لطف
خودش
بدم
داشتم
خب
می تونه
خواهش
دوست
نمی کنه
بقیه
باشید
شه
هستش
می تونه
بدیم
دوتا
نکردم
وجود
نمیشه
مگه
کلی
حتما
بهتره
نمیکنه
چقدر
بیاد
بخیر
دارین
ینی
ازتون
دست
مشکلی
نمی تونم
کلا
کم
واسه
بهم
میدم
کمی
دست
دادم
بهم
ینی
بابا
بازم
"
'
؟!
بریم
مند
تونید
+
چک
میگم
میشه
زدم
کردی
زدن
اونجا
عه
اینم
آخه
باشن
بودم
رفتم
خودت
سللا
پی
لپ
تاپ
...
دادین
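
The diff does not show how this file is read, but self.stop_words is presumably built from it. A hypothetical loading sketch, with the path and normalization step assumed rather than taken from the PR:

```
from pathlib import Path

from hazm import Normalizer, word_tokenize

DATA_DIR = Path("src/data")  # assumed location of stopwords.txt
normalizer = Normalizer()

# One stopword per line; normalize entries so they compare equal to normalized tokens.
stop_words = {
    normalizer.normalize(line.strip())
    for line in (DATA_DIR / "stopwords.txt").read_text(encoding="utf-8").splitlines()
    if line.strip()
}

text = "سلام من الان اینجا هستم"
tokens = [t for t in word_tokenize(normalizer.normalize(text)) if t not in stop_words]
print(" ".join(tokens))
```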