# Project 2: Text Analysis of UN Speeches
by Matt Ring

# Setup

1. Load Packages

In [161]:
## Packages
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np

2. Import Data

In [140]:
## Data

df = pd.read_csv("data/un_gen_debates_text.csv")

In [141]:
# Display the data
df.sample(10)

Unnamed: 0,session,year,country,country_name,speaker,position,text
3696,50,1995,PER,Peru,Mr. Tudela,Minister for Foreign Affairs,"Mr. President, on this important occasion, all..."
1075,33,1978,SLE,Sierra Leone,Conteh,,"﻿160.\tI should like, on behalf of the Sierra ..."
3102,47,1992,FRA,France,Mr Pumas,,"Allow me to \ncongratulate you. Sir, on your e..."
4253,53,1998,SYC,Seychelles,James Michel,Vice-President,eychelles is pleased to see\nMr. Opertti at th...
171,26,1971,THA,Thailand,Mr. KHOMAN,,"30.\t Mr. President, it augurs well for the tw..."
50,25,1970,PRY,Paraguay,Mr. SOLANO LOPEZ,,226.\t Before I begin my statement in the gen...
3613,50,1995,ECU,Ecuador,Mr. Leoro Franco,Minister for Foreign Affairs,First of all I should like to extend my warmes...
4198,53,1998,LUX,Luxembourg,Jacques Poos,Deputy Prime Minister,"As previous speakers have done, I would like t..."
1759,38,1983,JPN,Japan,Abe,,﻿91.\t On behalf of the Government and people ...
4944,57,2002,MUS,Mauritius,Anerood Jugnauth,Prime Minister,﻿It gives me\nimmense pleasure to address the ...


3. Clean Data

In [142]:
# Remove newline and tab characters
df["text"] = df["text"].replace({"\n":" ",
                                 "\t":" "}, regex = True)

4. Subset Data

Political transitions in the Former Soviet Union (FSU) and other socialist states will be assessed. This was selected as I noticed Yugoslavia still present even in 2015. I had originally intended to look at other political transitions, but considered that too broad. As such, I've selected all modern states which were once or are Marxist-Leninist. States which only reference socialism in their constitution are not included, such as India, Portugal, Algeria, etc...

Some adjustments need to be made and noted here:
1. Germany is not included, as it is difficult to handle reunification here when each state had separate speaches.
2. Former Yugoslavic states will be noted as such. Yugoslavia is recorded until 1991, then representing Serbia from 2001 onwards. The other states include Slovenia, North Macedonia, Bosnia and Herzegovina, Croatia, and Montenegro.
3. Former USSR states will be noted as such. These include Russia, Armenia, Azerbeijan, Belarus, Estonia, Georgia, Kazakhstan, Kyrgyzstan, Latvia, Lithuania, Moldova, Tajikistan, Turkmenistan, Ukraine, and Uzbekistan.
4. Czechloslovakia becomes Chezia in the data around the year of 1992. These countries will be considered the same and continuous.

In [144]:
# Reassign all CSK to CZE (Czechia)
df["country"] = np.where(df["country"] == "CSK", "CZE", df["country"])

In [145]:
# Create a list of former (and present) soviet states
fss = ["RUS", "CHN", "YUG", "POL", "CUB", 
       "AFG", "ALB", "AGO", "ARM", "AZE",
       "BLR", "BEN", "BGR", "KHM", "COG", 
       "CZE", "EST", "ETH", "GRD", 
       "GEO", "HUN", "KAZ", "KGZ", "LVA",
       "LTU", "MDA", "MNG", "MOZ", "ROU", 
       "SOM", "TJK", "TKM", "TUV", "UKR", 
       "UZB", "VNM", "YEM", "LAO", "PRK"]

Now, we'll classify countries over time as socialist or not based on [this](https://en.wikipedia.org/wiki/List_of_socialist_states#Marxist%E2%80%93Leninist_states_2) list. Any not on this list will be research independently. Republics within the USSR will be classified based on information found [here](https://en.wikipedia.org/wiki/Republics_of_the_Soviet_Union).

In [146]:
df_fss = df.loc[df["country"].isin(fss)]

In [147]:
df_fss.sample(10)

Unnamed: 0,session,year,country,country_name,speaker,position,text
2657,44,1989,LAO,Lao People's Democratic Republic,SRITHIRATH,,﻿ Allow me first on behalf of the delegation o...
4098,53,1998,AGO,Angola,João Bernardo Miranda,Minister for Foreign Affairs,Allow me to begin by congratulating Mr. Didier...
6574,66,2011,COG,Congo,Basile Ikouebe,Minister for Foreign Affairs,I am honoured to take the floor from this ros...
7182,69,2014,GEO,Georgia,Irakli Garibashvili,Prime Minister,It is a great honour for me to represent my c...
5593,61,2006,BEN,Benin,Dr. Boni Yayi,President,I am taking part personally in this session o...
7276,69,2014,SOM,Somalia,Hassan Sheikh Mohamud,President,It is a pleasure to join the Assembly today. ...
4599,55,2000,ROU,Romania,Petre Roman,Deputy Prime Minister,"I am delighted to extend to Mr. Harri Holkeri,..."
1950,39,1984,ROU,Romania,Andrei,,﻿I have the greatest pleasure in conveying to ...
302,27,1972,UKR,Ukraine,Mr. Shevel,,"Mr. President, permit me on behalf of the dele..."
7417,70,2015,LTU,Lithuania,Mrs. Dalia Grybauskaitė,President,The seventieth anniversary of the United Natio...


In [152]:
len(df_fss)

1450

# LDA Model

In [222]:
# Add stopwords related to institutions, such as UN councils, country names, etc.
# Also including words found across topics: peace, new, development

additional_stop_words = ["international", "united", "nations", "nation", "national", "countries", "country", "world", "states",
                         "council", "government", "people", "peoples", "republic", "general", "security",
                         "economic", "social", "assembly", "peace"]

In [223]:
# Create a vectorizer
vec = CountVectorizer(stop_words=text.ENGLISH_STOP_WORDS.union(additional_stop_words))

In [224]:
# Create dtm
X = vec.fit_transform(df_fss["text"])

In [225]:
# Create lda
lda = LatentDirichletAllocation(n_components=5)

In [226]:
# Fit lda
doc_topics = lda.fit_transform(X)

In [227]:
print(f"There are {lda.components_.shape[0]} topics and {lda.components_.shape[1]} words")

There are 5 topics and 29498 words


# Interpretting Topics

In [228]:
## Get feature names (vocabulary)
voc = np.array(vec.get_feature_names())

In [229]:
# Set number of top words you want
n_words=10

# Create lambda function to extra top words from voc
imp_words = lambda x: [voc[each] for each in np.argsort(x)[:-n_words-1:-1]]

In [230]:
# Use imp_words to extract words with the highest weights from our lda model
words_in_topic = ([imp_words(x) for x in lda.components_])

In [231]:
# Examine words
words_in_topic

[['human',
  'rights',
  'afghanistan',
  'new',
  'state',
  'today',
  'years',
  'azerbaijan',
  'war',
  'law'],
 ['development',
  'global',
  'cooperation',
  'community',
  'efforts',
  'support',
  'new',
  'terrorism',
  'challenges',
  'sustainable'],
 ['africa',
  'community',
  'development',
  'political',
  'organization',
  'new',
  'african',
  'efforts',
  'human',
  'rights'],
 ['nuclear',
  'new',
  'relations',
  'political',
  'europe',
  'weapons',
  'disarmament',
  'problems',
  'development',
  'efforts'],
 ['struggle',
  'south',
  'independence',
  'war',
  'soviet',
  'situation',
  'support',
  'relations',
  'military',
  'africa']]

# Visualizations

## 1. Defining Each Topic

## 2. Bloc Trends/All Countries Shown (by one or all topics)

## 3. Important/Interesting Countries