In [4]:
#Some initial installations, if necessary
!pip install googletrans



All poll titles, question titles, order of questions are metadata generated by U-Report programme managers – not data coming directly from the system
org_language refers to the language on the website, not necessarily the language of the polls
poll_url is unique to every poll, total of 4,399 to date
A subset of these “polls” are notifications and only have a single “question” for which there are no set responses

data_type
set_unset = whether respondents gave a suitable response for this question
response = breakdown of valid response options

data_segment_category

    total = all respondents, whether they have submitted age/gender/location or not
    age = for respondents that submitted age, in age-bands
    gender = for respondents that submitted gender
    location = most granular category for location, on most platforms this is region or district

data_segment_label = disaggregated data labels for each data_segment_category
data_category_label = label of either set/unset (for that data type) or label for the response
- This creates a problem, because set_unset becomes an issue
data_category_count = number of respondents falling into the data_category_label

Thoughts on exploring the data- 
Is it possible to get a global aggregation of responses to the same set of questions? Does the same questions have the same ID? Is there some way of aggregating across them 

In [5]:
#First,the usual imports
import numpy as np
import pandas as pd
import random
filename = '../all_poll_data.2020.05.29/all_poll_data.2020.05.29.csv'
unicef_seed = 2020
from tqdm import tqdm
tqdm.pandas()
import re
from nltk.corpus import stopwords



In [6]:
# Approach 1: Sampling
# https://stackoverflow.com/questions/22258491/read-a-small-random-sample-from-a-big-csv-file-into-a-python-data-frame#22259008
random.seed(a=unicef_seed)

p = 0.01  # 1% of the lines
# keep the header, then take only 1% of lines
# if random from [0,1] interval is greater than 0.01 the row will be skipped
df = pd.read_csv(
         filename,
         header=0, 
         skiprows=lambda i: i>0 and random.random() > p)


In [7]:
%timeit
print(df.shape)

(44679, 26)


In [39]:
df[['question_id', 'question_results_open_ended',
       'question_order', 'data_type', 'data_segment_category',
       'data_segment_label', 'data_category_label']].head()

Unnamed: 0,question_id,question_results_open_ended,question_order,data_type,data_segment_category,data_segment_label,data_category_label,data_category_count
0,12689,False,1,response,location,Entre Ríos,Twitter,0.0
1,12689,False,1,response,location,Córdoba,Twitter,0.0
2,12690,False,2,set_unset,location,Mendoza,set,0.0
3,12691,False,3,response,location,Santa Fe,Entre 1 y 2 horas por día,0.0
4,12691,False,3,response,location,Formosa,Más de 4 horas por día,0.0


So this means the final dataset will have 4,467,600 responses. Right now, our aim is to find strategies to make this work in a subsample. Then we can find Big Data methods for parallelization. 

In [8]:
# Explore Dataset
df.columns

Index(['org_name', 'org_language', 'org_id', 'org_host', 'org_subdomain',
       'org_domain', 'poll_id', 'poll_flow_uuid', 'poll_title', 'poll_org',
       'poll_created_on', 'poll_date', 'poll_category_image_url',
       'poll_category_name', 'poll_url', 'question_ruleset_uuid',
       'question_title', 'question_id', 'question_results_open_ended',
       'question_order', 'data_type', 'data_segment_category',
       'data_segment_label', 'data_category_label', 'data_category_count',
       'data_order'],
      dtype='object')

There is no index or identifier for individual users. That becomes more problematic later, when we try to combine the columns for age, location and gender. We have multiple rows now for what could well have been the same individual. 

In [9]:
for col in df.columns:
    print(df[col].unique())

['Argentina' 'Bangladesh' 'Belize' 'Bolivia' 'Bosnia and Herzegovina'
 'Botswana' 'Brasil' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cameroon'
 'Chile' 'Congo Brazzavile' 'Congo(RDC)' 'Costa Rica' "Côte d'Ivoire"
 'Ecuador' 'El Salvador' 'FSM' 'France' 'Gambia' 'Ghana' 'Guatemala'
 'Guinea' 'Haiti' 'Honduras' 'India' 'Indonesia' 'Ireland' 'Jamaica'
 'Jordan' 'Kiribati' 'Lesotho' 'Liberia' 'Malawi' 'Malaysia' 'Mexico'
 'Moldova' 'Moçambique' 'Myanmar' 'Nigeria' 'On the Move' 'Pacific'
 'Pakistan' 'Papua New Guinea' 'Philippines' 'România'
 'République Centrafricaine' 'Senegal' 'Sierra Leone' 'South Africa'
 'Srbija' 'Tanzania' 'Tchad' 'Trinidad and Tobago' 'Tunisie'
 'U-Report Global' 'U-Report24x7' 'U-report Mali' 'Uganda' 'Uzbekistan'
 'VIETNAM' 'Western Balkans' 'Zimbabwe' 'eSwatini' 'УКРАЇНА'
 'العراق\u200e' 'ประเทศไทย']
['es' 'en' 'bs' 'pt-br' 'bg' 'fr' 'id' 'ar' 'ro' 'pt' 'my' nan 'it'
 'sr-rs@latin' 'uz' 'vi' 'uk' 'th']
[ 8 17 15 27 19 42  1 33 23  5 10 12 32 46 28 26 49  4 40 38  7 2

The easy wins here would be findings poll titles that cover the same topic. 

In [10]:
assert(len(df['poll_org'].unique())  == len(df['org_id'].unique()))

The above assertion came true, which indicates that poll_org and org_id are referring to the same IDs, and each organization has provided a poll at least once. 

In [11]:
# Now let's get the unique questions per poll
questions_by_poll = df.groupby('poll_title')['question_title'].apply(set)
questions_by_poll = pd.DataFrame(questions_by_poll.reset_index())

In [12]:
questions_by_poll['poll-length'] = questions_by_poll['question_title'].apply(len)
questions_by_poll['poll-length'].describe()

count    4039.000000
mean        3.818024
std         2.768573
min         1.000000
25%         2.000000
50%         3.000000
75%         5.000000
max        27.000000
Name: poll-length, dtype: float64

Some polls are more driven by the youth and their concerns, others by what the UN wishes to learn. On average, there are about 3 -4 questions per poll (of course, with outliers).  
There are also fewer

In [13]:
print(len(df['question_title'].unique()))
print(len(df['question_id'].unique()))

14542
13121


In [14]:
df.head()

Unnamed: 0,org_name,org_language,org_id,org_host,org_subdomain,org_domain,poll_id,poll_flow_uuid,poll_title,poll_org,...,question_title,question_id,question_results_open_ended,question_order,data_type,data_segment_category,data_segment_label,data_category_label,data_category_count,data_order
0,Argentina,es,8,ilhasoft,argentina,,1734,5a7577ea-aa1f-4f9c-b031-78ab91695448,¿Cuáles son las redes sociales preferidas por ...,8,...,¿Cuál es la red social que más usás?,12689,False,1,response,location,Entre Ríos,Twitter,0.0,6
1,Argentina,es,8,ilhasoft,argentina,,1734,5a7577ea-aa1f-4f9c-b031-78ab91695448,¿Cuáles son las redes sociales preferidas por ...,8,...,¿Cuál es la red social que más usás?,12689,False,1,response,location,Córdoba,Twitter,0.0,6
2,Argentina,es,8,ilhasoft,argentina,,1734,5a7577ea-aa1f-4f9c-b031-78ab91695448,¿Cuáles son las redes sociales preferidas por ...,8,...,¿Cuánto tiempo por día pasas en la red social ...,12690,False,2,set_unset,location,Mendoza,set,0.0,0
3,Argentina,es,8,ilhasoft,argentina,,1734,5a7577ea-aa1f-4f9c-b031-78ab91695448,¿Cuáles son las redes sociales preferidas por ...,8,...,¿Cuánto tiempo por día pasas entre todas las r...,12691,False,3,response,location,Santa Fe,Entre 1 y 2 horas por día,0.0,2
4,Argentina,es,8,ilhasoft,argentina,,1734,5a7577ea-aa1f-4f9c-b031-78ab91695448,¿Cuáles son las redes sociales preferidas por ...,8,...,¿Cuánto tiempo por día pasas entre todas las r...,12691,False,3,response,location,Formosa,Más de 4 horas por día,0.0,4


The important point to check here is if the IDs repeat for different questions. So let's use groupby

In [15]:
#https://kite.com/python/answers/how-to-count-unique-values-in-a-pandas-dataframe-group-in-python
question_by_id = df.groupby(['question_id']).aggregate({'org_language':'nunique'}).reset_index()

In [16]:
question_by_id

Unnamed: 0,question_id,org_language
0,1,0
1,2,0
2,3,0
3,4,1
4,5,1
5,6,1
6,7,1
7,8,0
8,9,1
9,10,1


In [17]:
question_by_id['org_language'].describe()

count    13121.000000
mean         1.069659
std          0.456853
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          3.000000
Name: org_language, dtype: float64

So on average, a question only gets asked in one language, sometimes less
The latter option implies that certain polls don't have question associated with them. This might be problematic later. 

In [18]:
#Now the aim is to translate all the questions and find the relevant ones
from googletrans import Translator
translator = Translator()

In [19]:
result = translator.translate("ไม่ทราบว่าเพื่อนๆ ของยูรีพอร์ตเตอร์มีประสบการณ์ถูกกลั่นแกล้งบนโลกออนไลน์หรือไม่ และทราบหรือไม่ว่าหากพบเห็นการกลั่นแกล้งบนโลกออนไลน์ควรทำอย่างไร?", 
                     src="th", dest="en")

In [20]:
result.text

"Do not know friends Of Yu-Reporter's experience in cyberbullying? And do you know what to do if you see cyberbullying online?"

Each poll has an ID with data already generated. 
https://thailand.ureport.in/poll/62

In [21]:
speaker_countries = df.groupby('org_language')['org_name'].apply(set)
speaker_countries = pd.DataFrame(speaker_countries.reset_index())

In [22]:
speaker_countries.columns

Index(['org_language', 'org_name'], dtype='object')

In [23]:
speaker_countries['num_country'] = speaker_countries['org_name'].apply(len) 

In [24]:
for country in speaker_countries[speaker_countries['org_language']=='en']['org_name']:
    print(country)

{'U-Report24x7', 'Lesotho', 'Bangladesh', 'Philippines', 'Ireland', 'Pacific', 'eSwatini', 'Papua New Guinea', 'FSM', 'Botswana', 'Tanzania', 'Uganda', 'Sierra Leone', 'India', 'Ghana', 'Pakistan', 'Malaysia', 'U-Report Global', 'Kiribati', 'Liberia', 'South Africa', 'Belize', 'Jamaica', 'Gambia', 'Malawi', 'Trinidad and Tobago'}


So the English speaking organizations span quite a range- including groups such as 'Pacific', and more international entities like 'U-Report Global' and 'U-Report 24*7]
Now we look for the 

In [42]:
non_eng_polls = df[df['org_language'] != 'en'][['poll_title', 'question_title', 
                                                'org_language', 'poll_category_name']]
#It would make sense to remove duplicates

In [26]:
non_eng_polls.shape

(26776, 3)

We have gone from 40,000 or so question 26.7k- which indicates that though the number of countries for non-English speaking countries may be less, they stil involve a considerable proportion of the total questions. How many from Spain? 

In [27]:
non_eng_polls.drop_duplicates(inplace=True)

In [28]:
non_eng_polls.shape

(10780, 3)

#### Deduplication
So we have only 10.7k entries now- which suggests about 2-3 duplicates for questions earlier

In [36]:
#let's check if it's the same number of questions for all languages
poll_by_lang = df.groupby(['org_language'])['poll_title'].count()

In [30]:
poll_by_lang
#So more or less, we have the same number of poll_titles as question_titles
# Therefore, Only one question per poll
# So polls themselves aren't going to be very useful

org_language
ar               104
bg                 7
bs                72
en             17903
es              4360
fr              3512
id              1512
it               456
my              2702
pt               461
pt-br           1088
ro              2046
sr-rs@latin     1100
th              2723
uk              2657
uz               954
vi                32
Name: poll_title, dtype: int64

In [33]:
poll_by_lang_poll = df.groupby(['org_language', 'poll_title'])['question_title'].count()

In [35]:
poll_by_lang_poll.reset_index()

Unnamed: 0,org_language,poll_title,question_title
0,ar,COVID-19 Risk Perception at Community Level Su...,15
1,ar,COVID19 Information centre,23
2,ar,CRC,8
3,ar,Ending Violence Online,12
4,ar,Impact of COVID-19 on Education,5
5,ar,Mira: Movement,2
6,ar,Safe Internet Day Poll,1
7,ar,U-Report For Syria 9 Years,12
8,ar,World Mental Health Day - 10 October,2
9,ar,الصحة النفسية والرفاهية,10


#### Translation
Now we can can use the Translator object to get the vast range of translations

In [40]:
def translate_to_eng(txt, src_lang):
    """
    takes in text in a non-english language
    returns the english translation
    """
    
    try:
        result = translator.translate(txt, 
                     src=src_lang, dest="en")
    #In case the organization's language label doesn't match the question language
    except:
        result = translator.translate(txt, dest="en")
        
    return result.text    

In [55]:
non_eng_polls.iloc[1208]

poll_title                         BreastMilk and No water Campaign
question_title    (For YES answers): Ok, What else should be giv...
org_language                                                     fr
Name: 2998, dtype: object

In [None]:
non_eng_polls['poll_category_translation'] = non_eng_polls.progress_apply(lambda x: translate_to_eng(x.poll_category_name, x.org_language),
                                                       axis=1)


 18%|██████████████                                                               | 4911/26776 [17:47<35:59, 10.13it/s]

In [61]:
#What would be interesting is to see the differences to the same poll questions across different segments
non_eng_polls['poll_translation'] = non_eng_polls.progress_apply(lambda x: translate_to_eng(x.poll_title, x.org_language),
                                                       axis=1)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10780/10780 [47:11<00:00,  3.81it/s]


In [63]:
poll_topics = non_eng_polls['poll_translation'].unique()

In [64]:
print(len(poll_topics))

3126


In [65]:
print(poll_topics)

['What are the preferred social networks for Argentine teens?'
 '¿Cuáles son las redes sociales preferidas por los adolescentes argentinos?'
 'How does Social and Mandatory Preventive Isolation affect households with Disabilities?'
 ...
 'The Convention on the Rights of the Child states that the children have the right to express themselves in their thoughts. And their opinions should be considered as appropriate\r\n\r\nWhat level of opinion do you think you can exercise your rights to participate in?'
 "Don't know friends Of U-readers have experienced cyberbullying? And do you know what if bullying is seen online?"
 'Thailand signed as a member of the Convention on the Rights of the Child on 12 February 1992. Did you know that this Convention? What are the key principles in protecting the basic rights of children?']


In [None]:
#Save this for later use
poll_topics.to_pickle('../non_english_poll_titles.pkl')
# Merge with the English poll titles

The questions need to be organized by some measure of similarity. There are many different ways of doing this but we will start with the overview covered in this comprehensive medium post:
https://medium.com/@adriensieg/text-similarities-da019229c894
and the associated github repository:
https://github.com/adsieg/text_similarity/blob/master/Different%20Embeddings%20%2B%20Cosine%20Similarity%20%2B%20HeatMap%20illustration.ipynb


In [1]:
import spacy
nlp = spacy.load("en_core_web_md")
tokens = nlp("dog cat banana afskfsd")


In [3]:
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

dog dog 1.0
dog cat 0.80168545
dog banana 0.24327648
dog afskfsd 0.0
cat dog 0.80168545
cat cat 1.0
cat banana 0.28154367
cat afskfsd 0.0
banana dog 0.24327648
banana cat 0.28154367
banana banana 1.0
banana afskfsd 0.0
afskfsd dog 0.0
afskfsd cat 0.0
afskfsd banana 0.0
afskfsd afskfsd 1.0


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


In [None]:
gloveFile = "data\\glove.6B.50d.txt"
import numpy as np
def loadGloveModel(gloveFile):
    print ("Loading Glove Model")
    with open(gloveFile, encoding="utf8" ) as f:
        content = f.readlines()
    model = {}
    for line in content:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print ("Done.",len(model)," words loaded!")
    return model

def preprocess(raw_text):

    # keep only words
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split 
    words = letters_only_text.lower().split()

    # remove stopwords
    stopword_set = set(stopwords.words("english"))
    cleaned_words = list(set([w for w in words if w not in stopword_set]))

    return cleaned_words

def cosine_distance_between_two_words(word1, word2):
    import scipy
    return (1- scipy.spatial.distance.cosine(model[word1], model[word2]))

def calculate_heat_matrix_for_two_sentences(s1,s2):
    s1 = preprocess(s1)
    s2 = preprocess(s2)
    result_list = [[cosine_distance_between_two_words(word1, word2) for word2 in s2] for word1 in s1]
    result_df = pd.DataFrame(result_list)
    result_df.columns = s2
    result_df.index = s1
    return result_df

def cosine_distance_wordembedding_method(s1, s2):
    import scipy
    vector_1 = np.mean([model[word] for word in preprocess(s1)],axis=0)
    vector_2 = np.mean([model[word] for word in preprocess(s2)],axis=0)
    cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
    print('Word Embedding method with a cosine distance asses that our two sentences are similar to',round((1-cosine)*100,2),'%')

def heat_map_matrix_between_two_sentences(s1,s2):
    df = calculate_heat_matrix_for_two_sentences(s1,s2)
    import seaborn as sns
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(5,5)) 
    ax_blue = sns.heatmap(df, cmap="YlGnBu")
    # ax_red = sns.heatmap(df)
    print(cosine_distance_wordembedding_method(s1, s2))
    return ax_blue

In [None]:
len(df[df['org_language']=="es"]['question_title'].unique())

In [None]:
# We don't know yet what each of these language odes means

In [None]:
#https://stackoverflow.com/questions/27842613/pandas-groupby-sort-within-groups

In [25]:
# What would be useful is to find the polls that ask the same question in multiple languages
# Then we could check for differences

In [None]:
wide_df = df.pivot()