In [23]:
import pandas as pd
import numpy as np
import spacy
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from gensim import corpora, models, matutils
import logging
from corextopic import corextopic as ct
from corextopic import vis_topic as vt
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from corex_funcs import *
from translate import Translator
%matplotlib inline

# Foreign language analysis
In this final notebook, we will look to see if there are differences in sentiment between posters in English and those in foreign languages. We will restrict topic analysis only to those languages that SpaCy supports so we can properly lemmatize words and identify parts of speech and adjective modifiers. Because translation packages often require a paid subscription or have rate limits, we will do topic analysis first, then translate the topics into English.

In [2]:
# df = pd.read_pickle('data/df_parsed')

In [3]:
# df_parsed_non_en = df.loc[df.language != 'en'].copy()

In [4]:
# df_parsed_non_en.to_pickle('data/df_parsed_non_en')

In [5]:
df_parsed_non_en = pd.read_pickle('data/df_parsed_non_en')

In [6]:
def clean(sentence): # take in spacy token, return lemmas and remove numbers, punctuation, and stop words
    return ' '.join([token.lemma_ for token in sentence if token.pos_ != 'NUM' and token.pos_ != 'PUNCT' and not token.is_stop])

In [7]:
df_parsed_non_en['spacy_doc_cleaned'] = df_parsed_non_en.spacy_doc.apply(clean)

What are the different non-English languages in the df?

In [99]:
df_parsed_non_en.language.value_counts()

it    41726
fr    34001
de     1603
da      527
zh      320
lt      173
Name: language, dtype: int64

In [9]:
# Create dict to translate language code to subreddit name
language_dict = {k:v for k, v in zip(df_parsed_non_en.language.value_counts().index, 'italy, france, catalan, germany, denmark, china, lithuania, netherlands'.split(', '))}

For topic modeling, we'll use CorEx because that had the best results with the English posts.

In [50]:
df_parsed_non_en.language.value_counts().index

Index(['it', 'fr', 'de', 'da', 'zh', 'lt'], dtype='object')

In [None]:
results = {}
for language in df_parsed_non_en.language.value_counts().index:
    sub_df = df_parsed_non_en[df_parsed_non_en.language == language].copy()
    results[language] = corex_translate(language, sub_df, num_topics=3)

In [65]:
for language in ['lt']:
    sub_df = df_parsed_non_en[df_parsed_non_en.language == language].copy()
    results[language] = corex_translate(language, sub_df, num_topics=3)

0_lt: pysanky, kirashchuk, wax, using, member, egg, folk, oleh, pysanka, putins
0_en: pysanky, kirashchuk, wax, using, member, egg, Folk, oleh, pysanka, putins


1_lt: slava, ukraine, ukraini, ukrainian, putin, charities, cams, live, linkti, vladimir
1_en: slava, ukraine, ukraini, ukrainian, putin, charities, cams, Live, nod, vladimir


2_lt: amp, xb, kyiv, lviv, supplies, village, unique, kiev, today, nato
2_en: (amf) -, Xb, kyiv, lviv, supplies, Village”, Unique, Kiev, today., NATO




Cleaning seems to have deleted all Chinese words so we will redo Corex on Chinese and Lithuanian (the remaining languages) separately.

In [62]:
df_parsed_non_en.loc[df_parsed_non_en.language=='zh', 'spacy_doc_cleaned'] = df_parsed_non_en.loc[df_parsed_non_en.language=='zh', 'spacy_doc']

In [83]:
df_parsed_non_en.loc[df_parsed_non_en.language == 'zh', 'spacy_doc_cleaned'] = df_parsed_non_en.loc[df_parsed_non_en.language == 'zh', 'spacy_doc'].map(lambda x: ''.join(list(x.text))).copy()

In [84]:
language = 'zh'
sub_df = df_parsed_non_en[df_parsed_non_en.language == language].copy()
results[language] = corex_translate(language, sub_df, num_topics=3)

0_zh: aggression, taiwan, government, pro, does, army, ukrainian, war, chinese, attack
0_en: The exceptional resort to self-defence is contingent on the occurrence of an  "armed attack ", which is rendered in French as  "agression armée ", i.e., armed aggression., Malaysia, Government, pro, `Who does what&apos;, Islamic Army of Aden;, Ukrainian, Go to war, LocaleName, Unmanned attack vehicles


1_zh: make, foreign, china, mistakes, states, united, ukraine, media, feel, position
1_en: Make, foreign operation, We need to build a wall., It doesn't matter if you make mistakes., STATES, United, Ukraine, media, feel, Position


2_zh: strength, evidence, military, american, bad, really, attitude, germany, security, far
2_en: Strength:, <g id="1">evidence</g><g id="2"> integration</g>, Military, American Tower, BAD, Really?, Attitude Demo, Germany, Security, Far Easern Technical University




In [97]:
results = {}
anchors = [['president', 'putin'],['nato'], ['poland']]
for language in df_parsed_non_en.language.value_counts().index:
    sub_df = df_parsed_non_en[df_parsed_non_en.language == language].copy()
    results[language] = corex_anchors_translate(language, anchors, sub_df, num_topics=3)

Translated anchors from en to it: [['presidente', 'putin'], ['nato'], ['polonia']]
0: putin, presidente, oligarca, vladimir, pro, salvini, macron, pazzo, maglietta, criminale
0_en: Putin, President, oligarch, Vladimir, pro, Salvini, MACRON, Crazy, t-shirt, Criminal


1: nato, entrare, ue, membro, espansione, difensivo, intervenire, attaccare, est, alleanza
1_en: born in, I enter, EU, ember, expansion, defensive, I intervene, to attach, basement floor plan, SAVINGS PLANS


2: il, polonia, russo, dell, ucraina, essere, russia, paese, potere, guerra
2_en: on ,, POLAND, Russian, for, Ukraine, have, russia, country, power, war


Percent Topic 0 Documents: 14.49%
Percent Topic 1 Documents: 8.72%
Percent Topic 2 Documents: 32.6%
Translated anchors from en to fr: [['• présidente\xa0:', 'huilo'], ['otan'], ['polonia']]
0: otan, rejoindre, ue, membre, extension, alliance, expansion, unien, rentrer, caritatif
0_en: NATO, join, &#13;, Number, extension, covenant, Expansion, unien, ship, Charity




RuntimeError: generator raised StopIteration

The free translator API limit is very low so it is not feasible to do much translation work for this project without buying a license so we'll stop here.