In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

# suppress warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
# read the data
df = pd.read_csv('https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/data/un-general-debates/un-general-debates-blueprint.csv.gz')
df.head()

Unnamed: 0,session,year,country,country_name,speaker,position,text
0,25,1970,ALB,Albania,Mr. NAS,,33: May I first convey to our President the co...
1,25,1970,ARG,Argentina,Mr. DE PABLO PARDO,,177.\t : It is a fortunate coincidence that pr...
2,25,1970,AUS,Australia,Mr. McMAHON,,100.\t It is a pleasure for me to extend to y...
3,25,1970,AUT,Austria,Mr. KIRCHSCHLAEGER,,155.\t May I begin by expressing to Ambassado...
4,25,1970,BEL,Belgium,Mr. HARMEL,,"176. No doubt each of us, before coming up to ..."


In [8]:
# show rows for the United States
df[df['country'] == 'USA']

Unnamed: 0,session,year,country,country_name,speaker,position,text
65,25,1970,USA,United States,Mr. YOST,,1.\t It is my privilege to extend to you once ...
179,26,1971,USA,United States,Mr. ROGERS,,"Mr. President, I should like first of all to ..."
304,27,1972,USA,United States,Mr. Rogers,,During the past few years the world has made r...
424,28,1973,USA,United States,Kissinger,,﻿35.\tI should like to add my congratulations ...
554,29,1974,USA,United States,Mr. Kissinger,,"Last year, at the previous session [.2124th me..."
680,30,1975,USA,United States,Mr. Kissinger,,"At the outset, let me say how pleased we are t..."
814,31,1976,USA,United States,Mr. Kissinger,,Let me first congratulate this body for electi...
953,32,1977,USA,United States,Jimmy Carter,President,"﻿1.\t Mr. President, I wish to offer you my co..."
1093,33,1978,USA,United States,Vance,,"﻿103.\tMr. President, I congratulate you on yo..."
1236,34,1979,USA,United States,Vance,,﻿We meet in this General Assembly on the thres...


In [9]:
# show rows for Canada
df[ df["country_name"] == 'Canada' ]

Unnamed: 0,session,year,country,country_name,speaker,position,text
8,25,1970,CAN,Canada,Mr. SHARP,,\nThe General Assembly is fortunate indeed to ...
84,26,1971,CAN,Canada,Mr- Sharp,,"48.\t May I first offer you, Sir, the fiM sup..."
204,27,1972,CAN,Canada,Mr. Sharp,,"Mr. President, the Canadian delegation looks f..."
329,28,1973,CAN,Canada,Sharp,,"﻿1.\tMr. President, I begin by expressing Cana..."
451,29,1974,CAN,Canada,Mr. MacEachen,,I take great pleasure in joining speakers who ...
580,30,1975,CAN,Canada,Mr. MacEACHEN,,"Mr. President, let me express, at the outset o..."
706,31,1976,CAN,Canada,Mr. Jamieson,,"Mr. President, in addressing this Assembly for..."
841,32,1977,CAN,Canada,JAMIESON,,"﻿42.\t Mr. President, I am pleased to be one o..."
981,33,1978,CAN,Canada,Jamieson,,﻿\n\n\n163.\tI should like first of all to con...
1121,34,1979,CAN,Canada,McDonald,,﻿ May I join my colleagues in congratulating t...


In [16]:
# Initialize stopwords
import nltk
nltk.download('stopwords')

stopwords = set(nltk.corpus.stopwords.words('english'))

# Words that identify the country being referred to. Makes classification trivial.
# Removing them prevents the model from simply learning to associate these terms with the target classes
include_stopwords = {'dear', 'regards', 'must', 'would', 'also', 
                     'canada', 'canadian', 'canadians', 'prime', 'minister', 'province', 'provinces'
                     'united', 'states', 'state', 'america', 'american', 'president','secretary'}
exclude_stopwords = {'against'}

stopwords |= include_stopwords
stopwords -= exclude_stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephanegoulet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# build a text processing and classifier pipeline
# to predict the country (USA or Canada) of a speech
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

df2 = df[df['country'].isin(['USA', 'CAN'])]

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df2['text'], df2['country'], test_size=0.2)

# Create a pipeline that first transforms the text data into TF-IDF vectors (term frequency–inverse document frequency, a measure of importance of a word to a document in a collection), 
# then applies SVM
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(stopwords))),
    ('clf', svm.SVC()),
])

# Train the classifier
text_clf.fit(X_train, y_train)

# Predict the test set results
y_pred = text_clf.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred, target_names=['USA', 'CAN']))


              precision    recall  f1-score   support

         USA       0.82      1.00      0.90         9
         CAN       1.00      0.80      0.89        10

    accuracy                           0.89        19
   macro avg       0.91      0.90      0.89        19
weighted avg       0.91      0.89      0.89        19



In [19]:
# This script creates a new column 'sentiment' in the dataframe, 
# which contains the sentiment score of the text. 
# The sentiment score is a float within the range [-1.0, 1.0], 
# where -1.0 denotes a very negative sentiment, 
# 1.0 denotes a very positive sentiment, 
# and values around 0 denote a neutral sentiment.

from textblob import TextBlob

# Define a function to apply sentiment analysis to a text
def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity  # returns a value between -1 and 1

# Create a new column 'sentiment' in the DataFrame
df2['sentiment'] = df2['text'].apply(get_sentiment)

# Display the DataFrame
df2

Unnamed: 0,session,year,country,country_name,speaker,position,text,sentiment
8,25,1970,CAN,Canada,Mr. SHARP,,\nThe General Assembly is fortunate indeed to ...,0.107194
65,25,1970,USA,United States,Mr. YOST,,1.\t It is my privilege to extend to you once ...,0.122693
84,26,1971,CAN,Canada,Mr- Sharp,,"48.\t May I first offer you, Sir, the fiM sup...",0.136800
179,26,1971,USA,United States,Mr. ROGERS,,"Mr. President, I should like first of all to ...",0.121212
204,27,1972,CAN,Canada,Mr. Sharp,,"Mr. President, the Canadian delegation looks f...",0.117961
...,...,...,...,...,...,...,...,...
7107,68,2013,USA,United States,Barack Obama,President,Each year we come together to \nreaffirm the f...,0.060037
7149,69,2014,CAN,Canada,Stephen Harper,Prime Minister,It is both \nan honour and a pleasure for me t...,0.151988
7301,69,2014,USA,United States,Barack Obama,President,We come together at a \ncrossroads between war...,0.086555
7343,70,2015,CAN,Canada,Mr. Daniel Jean,Deputy Minister Foreign Affairs,I am honoured to appear before the Assembly to...,0.107947


In [17]:
# Average sentiment for each country in df2
df2.groupby('country')['sentiment'].mean()

country
CAN    0.112540
USA    0.110635
Name: sentiment, dtype: float64

In [18]:
# Average sentiment for each speaker in df2
# order the results from most positive to most negative

df2.groupby('speaker')['sentiment'].mean().sort_values(ascending=False).head(5)

speaker
Condoleezza Rice    0.164408
Lawrence Cannon     0.152515
Leonard Edwards     0.145419
Mr. MACGUIGAN       0.144525
Jean Chrétien       0.142099
Name: sentiment, dtype: float64

In [13]:
# The average sentiment for each year sorted in descending order based on their average sentiment.
df2.groupby('year')['sentiment'].mean().sort_values(ascending=False)

year
2008    0.143186
1989    0.140575
1991    0.139629
2003    0.137859
1981    0.135435
1971    0.129006
1994    0.128335
1993    0.126281
2006    0.125876
1995    0.125568
1973    0.125375
1997    0.124643
2015    0.123486
2010    0.122846
2005    0.121935
1987    0.121119
2014    0.119271
1998    0.117136
2009    0.116599
2007    0.116335
1970    0.114943
1983    0.114040
1978    0.113078
1976    0.111378
1977    0.111315
1984    0.110240
1996    0.109673
1988    0.109235
1992    0.107583
2004    0.106383
1990    0.104914
1985    0.103662
1972    0.100319
2001    0.099462
2011    0.099054
1974    0.098554
1980    0.097081
2000    0.095660
1979    0.095185
1982    0.093348
1999    0.093205
1975    0.090080
2002    0.084102
1986    0.084058
2012    0.074286
2013    0.071692
Name: sentiment, dtype: float64