In [1]:
import pandas as pd
import numpy as np 

import matplotlib.pyplot as plt

# Basic utilities
import csv
import pandas as pd
import numpy as np

# Transformers
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Binarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.base import BaseEstimator, TransformerMixin

# Models
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Load the source CSV into df.
# NOTE(review): absolute path on the author's machine -- adjust before re-running elsewhere.
csv_path = '/Users/hc693/Downloads/long_dropnan_datecorrected.csv'
df = pd.read_csv(csv_path)

In [3]:
# Peek at the first five rows to check that the columns loaded as expected.
df.head(n=5)

Unnamed: 0,Council,Date,Title,Theme code,Resolution,token,Country,Vote
0,General Assembly,12/8/1946,Treatment of Indians in the Union of South Afr...,7,A/RES/44(I),671248,AFGHANISTAN,Y
1,General Assembly,12/12/1946,Relations of Members of the United Nations wit...,8,A/RES/39(I),671249,AFGHANISTAN,A
2,General Assembly,12/13/1946,Voting procedure in the Security Council : res...,12,A/RES/40(I),671250,AFGHANISTAN,Y
3,General Assembly,12/13/1946,Approval of Trusteeship Agreements [New Guinea...,9,A/RES/63(I)[PARA.1],671297,AFGHANISTAN,X
4,General Assembly,12/13/1946,Approval of Trusteeship Agreements [Western Sa...,9,A/RES/63(I)[PARA.5],671308,AFGHANISTAN,X


In [4]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 896649 entries, 0 to 896648
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Council     896649 non-null  object
 1   Date        896649 non-null  object
 2   Title       896649 non-null  object
 3   Theme code  896649 non-null  int64 
 4   Resolution  896649 non-null  object
 5   token       896649 non-null  int64 
 6   Country     896649 non-null  object
 7   Vote        896649 non-null  object
dtypes: int64(2), object(6)
memory usage: 54.7+ MB


In [5]:
# Convert the Date strings to datetime64 values.
# The file stores dates as month/day/year (e.g. 12/8/1946), so the format is
# spelled out instead of letting pandas guess it: parsing is strict (a value in
# any other layout raises rather than being silently reinterpreted) and pandas
# does not have to infer the layout for a frame of this size.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')

In [6]:
# One-hot encode Council with pd.get_dummies (one indicator column per council)
# and append those indicator columns to the original frame.
dummies = pd.get_dummies(df['Council'])
merged = pd.concat([df, dummies], axis=1)

# Display only: drop() returns a new frame and the result is not assigned,
# so `merged` itself still contains the 'Council' column after this cell.
merged.drop(columns=['Council'])

Unnamed: 0,Date,Title,Theme code,Resolution,token,Country,Vote,General Assembly,Security Council
0,1946-12-08,Treatment of Indians in the Union of South Afr...,7,A/RES/44(I),671248,AFGHANISTAN,Y,1,0
1,1946-12-12,Relations of Members of the United Nations wit...,8,A/RES/39(I),671249,AFGHANISTAN,A,1,0
2,1946-12-13,Voting procedure in the Security Council : res...,12,A/RES/40(I),671250,AFGHANISTAN,Y,1,0
3,1946-12-13,Approval of Trusteeship Agreements [New Guinea...,9,A/RES/63(I)[PARA.1],671297,AFGHANISTAN,X,1,0
4,1946-12-13,Approval of Trusteeship Agreements [Western Sa...,9,A/RES/63(I)[PARA.5],671308,AFGHANISTAN,X,1,0
...,...,...,...,...,...,...,...,...,...
896644,2021-12-24,Situation of human rights in the Syrian Arab R...,5,A/RES/76/228,3952169,ZIMBABWE,N,1,0
896645,2021-12-24,Promoting international cooperation on peacefu...,4,A/RES/76/234,3952167,ZIMBABWE,Y,1,0
896646,2021-12-24,Further practical measures for the prevention ...,4,A/RES/76/230,3952168,ZIMBABWE,Y,1,0
896647,2021-12-24,A global call for concrete action for the elim...,3,A/RES/76/226,3952171,ZIMBABWE,Y,1,0


In [7]:
# Integer-encode Country, following:
# https://stackoverflow.com/questions/61400040/convert-country-name-to-int-in-panda

# NOTE(review): merged_country is a one-cell frame built from the literal
# {'Country'} (not from the data) and nothing in the visible cells reads it;
# it is kept only in case a later cell refers to the name.
merged_country = pd.DataFrame({'Country'})

# factorize returns one integer code per row plus the distinct country names;
# uniques[code] maps a code back to its country name.
codes, uniques = pd.factorize(merged['Country'])

# Assign the code array directly (positional) instead of wrapping it in a
# Series: a Series is aligned on index labels, which would silently produce
# NaN / misplaced codes on any frame whose index is not the default 0..n-1.
merged['Country_code'] = codes
merged

Unnamed: 0,Council,Date,Title,Theme code,Resolution,token,Country,Vote,General Assembly,Security Council,Country_code
0,General Assembly,1946-12-08,Treatment of Indians in the Union of South Afr...,7,A/RES/44(I),671248,AFGHANISTAN,Y,1,0,0
1,General Assembly,1946-12-12,Relations of Members of the United Nations wit...,8,A/RES/39(I),671249,AFGHANISTAN,A,1,0,0
2,General Assembly,1946-12-13,Voting procedure in the Security Council : res...,12,A/RES/40(I),671250,AFGHANISTAN,Y,1,0,0
3,General Assembly,1946-12-13,Approval of Trusteeship Agreements [New Guinea...,9,A/RES/63(I)[PARA.1],671297,AFGHANISTAN,X,1,0,0
4,General Assembly,1946-12-13,Approval of Trusteeship Agreements [Western Sa...,9,A/RES/63(I)[PARA.5],671308,AFGHANISTAN,X,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
896644,General Assembly,2021-12-24,Situation of human rights in the Syrian Arab R...,5,A/RES/76/228,3952169,ZIMBABWE,N,1,0,198
896645,General Assembly,2021-12-24,Promoting international cooperation on peacefu...,4,A/RES/76/234,3952167,ZIMBABWE,Y,1,0,198
896646,General Assembly,2021-12-24,Further practical measures for the prevention ...,4,A/RES/76/230,3952168,ZIMBABWE,Y,1,0,198
896647,General Assembly,2021-12-24,A global call for concrete action for the elim...,3,A/RES/76/226,3952171,ZIMBABWE,Y,1,0,198


In [8]:
# One-hot encode Vote with scikit-learn's OneHotEncoder.
# The new columns are labelled 0..k-1 in the order of encoder.categories_[0]
# (A, N, X, Y for this data), i.e. column 3 is the indicator for a 'Y' vote.
encoder = OneHotEncoder(handle_unknown='ignore')
vote_onehot = encoder.fit_transform(merged[['Vote']]).toarray()

# Give the encoded frame merged's own index: join() matches rows by index
# label, so without this the result is only correct while merged still has
# the default 0..n-1 index (e.g. it would break after filtering or sorting).
encoder_df = pd.DataFrame(vote_onehot, index=merged.index)
final_df = merged.join(encoder_df)

In [9]:
# Display the final frame: the original columns plus the Council indicator
# columns, Country_code, and the one-hot Vote columns (labelled 0-3).
final_df

Unnamed: 0,Council,Date,Title,Theme code,Resolution,token,Country,Vote,General Assembly,Security Council,Country_code,0,1,2,3
0,General Assembly,1946-12-08,Treatment of Indians in the Union of South Afr...,7,A/RES/44(I),671248,AFGHANISTAN,Y,1,0,0,0.0,0.0,0.0,1.0
1,General Assembly,1946-12-12,Relations of Members of the United Nations wit...,8,A/RES/39(I),671249,AFGHANISTAN,A,1,0,0,1.0,0.0,0.0,0.0
2,General Assembly,1946-12-13,Voting procedure in the Security Council : res...,12,A/RES/40(I),671250,AFGHANISTAN,Y,1,0,0,0.0,0.0,0.0,1.0
3,General Assembly,1946-12-13,Approval of Trusteeship Agreements [New Guinea...,9,A/RES/63(I)[PARA.1],671297,AFGHANISTAN,X,1,0,0,0.0,0.0,1.0,0.0
4,General Assembly,1946-12-13,Approval of Trusteeship Agreements [Western Sa...,9,A/RES/63(I)[PARA.5],671308,AFGHANISTAN,X,1,0,0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896644,General Assembly,2021-12-24,Situation of human rights in the Syrian Arab R...,5,A/RES/76/228,3952169,ZIMBABWE,N,1,0,198,0.0,1.0,0.0,0.0
896645,General Assembly,2021-12-24,Promoting international cooperation on peacefu...,4,A/RES/76/234,3952167,ZIMBABWE,Y,1,0,198,0.0,0.0,0.0,1.0
896646,General Assembly,2021-12-24,Further practical measures for the prevention ...,4,A/RES/76/230,3952168,ZIMBABWE,Y,1,0,198,0.0,0.0,0.0,1.0
896647,General Assembly,2021-12-24,A global call for concrete action for the elim...,3,A/RES/76/226,3952171,ZIMBABWE,Y,1,0,198,0.0,0.0,0.0,1.0


In [10]:
# Summarise final_df: column names, non-null counts and dtypes after encoding.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 896649 entries, 0 to 896648
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Council           896649 non-null  object        
 1   Date              896649 non-null  datetime64[ns]
 2   Title             896649 non-null  object        
 3   Theme code        896649 non-null  int64         
 4   Resolution        896649 non-null  object        
 5   token             896649 non-null  int64         
 6   Country           896649 non-null  object        
 7   Vote              896649 non-null  object        
 8   General Assembly  896649 non-null  uint8         
 9   Security Council  896649 non-null  uint8         
 10  Country_code      896649 non-null  int64         
 11  0                 896649 non-null  float64       
 12  1                 896649 non-null  float64       
 13  2                 896649 non-null  float64       
 14  3   