In [1]:
# Mount Google Drive into the Colab filesystem so the notebook can read the
# data files stored there.
from google.colab import drive

# force_remount=True asks Colab to mount again even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)
# Base folder on the mounted drive that the per-language data paths are built
# from (note the trailing slash).
FOLDER_NAME = '/content/drive/My Drive/Rutgers/'

Mounted at /content/drive


In [2]:
!pip install googletrans==3.1.0a0



In [4]:
from gensim.models import Word2Vec
import os
import json
import re
import matplotlib.pyplot as plt
from gensim.parsing.preprocessing import remove_stopwords
import nltk
from googletrans import Translator

# Fetch the 'punkt' tokenizer data needed by the nltk.sent_tokenize /
# nltk.word_tokenize calls in TrainModel; NLTK skips the download when the
# package is already up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [33]:
def TrainModel(text, size=100, window=5, min_count=1, workers=4):
  """Train a Word2Vec model on a raw text string.

  The text is split into sentences with nltk.sent_tokenize and each sentence
  into tokens with nltk.word_tokenize; the resulting list of token lists is
  the training corpus.

  Args:
    text: the whole corpus as one string.
    size, window, min_count, workers: forwarded unchanged to Word2Vec under
      the same keyword names.
      NOTE(review): newer gensim releases appear to call `size`
      `vector_size` -- confirm against the installed version.

  Returns:
    The Word2Vec model constructed from the tokenised sentences.
  """
  tokenized_sentences = []
  for sentence in nltk.sent_tokenize(text):
    tokenized_sentences.append(nltk.word_tokenize(sentence))
  return Word2Vec(
      sentences=tokenized_sentences,
      size=size,
      window=window,
      min_count=min_count,
      workers=workers,
  )

def GetSimilar(model, word, count):
  """Return the `count` entries most similar to `word` according to `model`.

  Args:
    model: object exposing `wv.most_similar(word, topn=...)` (the trained
      Word2Vec model in this notebook).
    word: query token.
    count: number of neighbours requested (passed as `topn`).

  Returns:
    Whatever `most_similar` returns (printed by the callers as
    (word, similarity) pairs), or an empty list when the lookup raises
    KeyError, i.e. the word is not in the model's vocabulary.

  Any other exception propagates: only the "unknown word" case is an expected
  outcome, and swallowing everything would also hide real bugs and make the
  cell impossible to interrupt.
  """
  try:
    return model.wv.most_similar(word, topn=count)
  except KeyError:
    # Out-of-vocabulary query: report "nothing found" instead of failing.
    return []

def ReadText(filename):
  """Collect and clean the messages of every post stored in a folder.

  Every regular file directly inside `filename` is read as UTF-8 JSON lines.
  Each non-blank line must decode to an object whose ['result']['posts'] is
  an iterable of posts. For each post that has a usable 'message', URLs are
  removed, every character that is neither a word character nor whitespace is
  replaced by a space, stop words are removed, and the result is appended to
  the output followed by one space.

  Args:
    filename: path of the directory to scan (despite the name, not a file).

  Returns:
    str: the cleaned messages concatenated, each followed by a single space.

  Raises:
    json.JSONDecodeError / KeyError: if a non-blank line is not valid JSON or
      lacks ['result']['posts'].
  """
  postList = []
  # Sorted so the text (and any model trained on it) does not depend on the
  # arbitrary order in which the OS lists the directory.
  for postF in sorted(os.listdir(filename)):
    path = os.path.join(filename, postF)
    if not os.path.isfile(path):
      # Skip sub-directories, which cannot be opened as data files.
      continue
    # Explicit encoding: the data is multilingual and must not depend on the
    # platform's default locale.
    with open(path, encoding='utf-8') as f:
      for line in f:
        if not line.strip():
          # Blank lines (e.g. a trailing newline) are not JSON documents.
          continue
        postList.extend(json.loads(line)['result']['posts'])

  pieces = []
  for post in postList:
    try:
      res = post['message']
      res = re.sub(r'http\S+', '', res)
      res = re.sub(r'[^\w\s]', ' ', res)
    except (KeyError, TypeError):
      # Post without a 'message', or with a non-text one: nothing to add.
      continue
    pieces.append(remove_stopwords(res) + " ")
  # A single join instead of repeated string concatenation.
  return "".join(pieces)

def translate(word, src):
  """Translate `word` from the language code `src` and return the text.

  No destination is passed, so googletrans' default target language is used
  (the outputs in this notebook show English).
  """
  result = Translator().translate(word, src=src)
  return result.text

In [34]:
# Read the Filipino posts and train the embedding on them. os.path.join keeps
# the path correct whether or not FOLDER_NAME ends with a slash.
philText = ReadText(os.path.join(FOLDER_NAME, "philipino"))
# NOTE: `model` is a shared name that the other language cells reassign, so
# the similarity cell below must be run right after this one.
model = TrainModel(philText)

In [36]:
from googletrans.constants import DEFAULT_USER_AGENT

def tToPhil(word):
  """Translate `word` into Filipino (googletrans code 'tl') and return the text."""
  translation = Translator().translate(word, dest='tl')
  return translation.text

searchWordList = ['american', 'politics', 'trump']

for word in searchWordList:
  # Translate once and reuse the result: it is both the second query and its
  # printed header, and each translation is a network round-trip.
  translatedWord = tToPhil(word)
  # Query the model with the English term first, then with its translation.
  for query in (word, translatedWord):
    sims = GetSimilar(model, query, 10)
    print(query)
    print("-------------------------------------")
    if sims:
      for neighbour, score in sims:
        print("Original - {}; Translated - {}; Similarity - {}".format(neighbour, translate(neighbour, "tl"), score))
    else:
      print("None found")
    print("-------------------------------------\n")

american
-------------------------------------
Original - Leyte; Translated - Leyte; Similarity - 0.40485602617263794
Original - Limited; Translated - Limited; Similarity - 0.4003816246986389
Original - TACO; Translated - TACO; Similarity - 0.3922540545463562
Original - vary; Translated - vary; Similarity - 0.3872128129005432
Original - enterprises; Translated - enterprises; Similarity - 0.3867223560810089
Original - ALTANGHAP; Translated - ALTANGHAP; Similarity - 0.3843846917152405
Original - gratitude; Translated - gratitude; Similarity - 0.38253992795944214
Original - Seduction; Translated - Seduction; Similarity - 0.38137760758399963
Original - wounded; Translated - wounded; Similarity - 0.37183043360710144
Original - NECK; Translated - NECK; Similarity - 0.36616945266723633
-------------------------------------

amerikano
-------------------------------------
None found
-------------------------------------

politics
-------------------------------------
Original - Nagales; Transl

In [38]:
# Read the Chinese posts and train the embedding on them. os.path.join keeps
# the path correct whether or not FOLDER_NAME ends with a slash.
ChineseText = ReadText(os.path.join(FOLDER_NAME, "chinese/chinese"))
# NOTE: `model` is a shared name that the other language cells reassign, so
# the similarity cell below must be run right after this one.
model = TrainModel(ChineseText)

In [39]:
from googletrans.constants import DEFAULT_USER_AGENT

def tToChi(word):
  """Translate `word` into Simplified Chinese (googletrans code 'zh-CN') and return the text."""
  translation = Translator().translate(word, dest='zh-CN')
  return translation.text

searchWordList = ['american', 'politics', 'trump']

for word in searchWordList:
  # Translate once and reuse the result: it is both the second query and its
  # printed header, and each translation is a network round-trip.
  translatedWord = tToChi(word)
  # Query the model with the English term first, then with its translation.
  for query in (word, translatedWord):
    sims = GetSimilar(model, query, 10)
    print(query)
    print("-------------------------------------")
    if sims:
      for neighbour, score in sims:
        print("Original - {}; Translated - {}; Similarity - {}".format(neighbour, translate(neighbour, "zh-CN"), score))
    else:
      print("None found")
    print("-------------------------------------\n")

american
-------------------------------------
Original - 把這份情傳下去; Translated - pass this love on; Similarity - 0.4124833941459656
Original - 每梯次配合僑居地時區差異調配; Translated - Each echelon is adjusted according to the time zone difference of the overseas residence; Similarity - 0.4116383492946625
Original - 已經住院3天點滴治療; Translated - Has been hospitalized for 3 days with drip therapy; Similarity - 0.4075436592102051
Original - 警車; Translated - police car; Similarity - 0.40410616993904114
Original - 波城漂書站及取書點; Translated - Pau drifting station and pick-up point; Similarity - 0.40214645862579346
Original - 個性活潑開朗具有親和力; Translated - Lively and cheerful personality; Similarity - 0.39846327900886536
Original - 臺灣持續貢獻己力; Translated - Taiwan continues to contribute; Similarity - 0.3975507616996765
Original - 澳洲駐台代表; Translated - Australian representative in Taiwan; Similarity - 0.39597001671791077
Original - Limited; Translated - Limited; Similarity - 0.39560574293136597
Original - 打折入的; Translated 

In [40]:
# Read the Vietnamese posts and train the embedding on them. os.path.join
# keeps the path correct whether or not FOLDER_NAME ends with a slash.
VietText = ReadText(os.path.join(FOLDER_NAME, "vietnamese/vietnamese"))
# NOTE: `model` is a shared name that the other language cells reassign, so
# the similarity cell below must be run right after this one.
model = TrainModel(VietText)

In [41]:
from googletrans.constants import DEFAULT_USER_AGENT

def tToViet(word):
  """Translate `word` into Vietnamese (googletrans code 'vi') and return the text."""
  translation = Translator().translate(word, dest='vi')
  return translation.text

searchWordList = ['american', 'politics', 'trump']

for word in searchWordList:
  # Translate once and reuse the result: it is both the second query and its
  # printed header, and each translation is a network round-trip.
  translatedWord = tToViet(word)
  # Query the model with the English term first, then with its translation.
  for query in (word, translatedWord):
    sims = GetSimilar(model, query, 10)
    print(query)
    print("-------------------------------------")
    if sims:
      for neighbour, score in sims:
        print("Original - {}; Translated - {}; Similarity - {}".format(neighbour, translate(neighbour, "vi"), score))
    else:
      print("None found")
    print("-------------------------------------\n")

american
-------------------------------------
Original - 340w; Translated - 340w; Similarity - 0.44454050064086914
Original - 𝙉𝙜𝙪; Translated - 𝙉𝙜𝙪; Similarity - 0.42876189947128296
Original - 𝐚𝐢; Translated - 𝐚𝐢; Similarity - 0.4097299575805664
Original - Limited; Translated - Limited; Similarity - 0.4003816246986389
Original - TACO; Translated - TACO; Similarity - 0.3922540545463562
Original - BACKGRID; Translated - BACKGRID; Similarity - 0.3898788094520569
Original - QL; Translated - QL; Similarity - 0.3882220983505249
Original - vary; Translated - vary; Similarity - 0.3872128129005432
Original - enterprises; Translated - enterprises; Similarity - 0.3867223560810089
Original - Albalone; Translated - Abalone; Similarity - 0.38500845432281494
-------------------------------------

Người Mỹ
-------------------------------------
None found
-------------------------------------

politics
-------------------------------------
Original - ChinnyDCNFab; Translated - ChinnyDCNFab; Similarity

In [42]:
# Read the Indian posts and train the embedding on them. os.path.join keeps
# the path correct whether or not FOLDER_NAME ends with a slash.
IndText = ReadText(os.path.join(FOLDER_NAME, "indian/indian"))
# NOTE: `model` is a shared name that the other language cells reassign, so
# the similarity cell below must be run right after this one.
model = TrainModel(IndText)

In [43]:
from googletrans.constants import DEFAULT_USER_AGENT

def tToHi(word):
  """Translate `word` into Hindi (googletrans code 'hi') and return the text."""
  translation = Translator().translate(word, dest='hi')
  return translation.text

searchWordList = ['american', 'politics', 'trump']

for word in searchWordList:
  # Translate once and reuse the result: it is both the second query and its
  # printed header, and each translation is a network round-trip.
  translatedWord = tToHi(word)
  # Query the model with the English term first, then with its translation.
  for query in (word, translatedWord):
    sims = GetSimilar(model, query, 10)
    print(query)
    print("-------------------------------------")
    if sims:
      for neighbour, score in sims:
        print("Original - {}; Translated - {}; Similarity - {}".format(neighbour, translate(neighbour, "hi"), score))
    else:
      print("None found")
    print("-------------------------------------\n")

american
-------------------------------------
Original - V9941; Translated - V9941; Similarity - 0.442058265209198
Original - sridhart; Translated - sridhart; Similarity - 0.41394031047821045
Original - Limited; Translated - Limited; Similarity - 0.40467172861099243
Original - ڈال; Translated - ال; Similarity - 0.39680707454681396
Original - TACO; Translated - TACO; Similarity - 0.3922540545463562
Original - vary; Translated - vary; Similarity - 0.3872128129005432
Original - gratitude; Translated - gratitude; Similarity - 0.3836163878440857
Original - NationalGeographic; Translated - National Geographic; Similarity - 0.37996453046798706
Original - 62208; Translated - 62208; Similarity - 0.378415048122406
Original - endocrinology; Translated - Endocrinology; Similarity - 0.37310391664505005
-------------------------------------

अमेरिकन
-------------------------------------
None found
-------------------------------------

politics
-------------------------------------
Original - starr