In [17]:
import pandas as pd
import numpy as np
import statistics as sta
import warnings
warnings.filterwarnings('ignore')

In [18]:
demo = pd.read_csv('data/demo.csv')
countriesCode = pd.read_csv('data/countriesCode.csv')

#RENAME COLUMNS
countriesCode.columns = ['Country', 'CountryCode']
demo.rename(columns = {'SQLDATE':'Date','Actor1CountryCode':'CountryCode','SOURCEURL':'SourceURL','avgTone':'AvgTone'}, inplace=True)

#REMOVE SPACE
countriesCode['CountryCode'] = countriesCode['CountryCode'].str.strip()
countriesCode['Country'] = countriesCode['Country'].str.strip()

demo

Unnamed: 0,Date,CountryCode,AvgTone,SourceURL
0,20180102,EUR,-4.347826,http://sevilla.abc.es/opinion/abci-persa-incon...
1,20180102,USA,-3.674121,http://www.lavozdigital.es/opinion/abci-persa-...
2,20180103,USA,-3.955696,http://www.alertadigital.com/2018/01/03/el-per...
3,20180111,USA,-6.851312,http://theothermccain.com/2018/01/10/google-la...
4,20180111,USA,-6.851312,http://theothermccain.com/2018/01/10/google-la...
5,20180120,NLD,-1.226994,https://nos.nl/artikel/2212880-korte-pvv-demon...
6,20180120,NLD,-1.369863,https://www.rtlnieuws.nl/nederland/politiek/ho...
7,20180121,DEU,0.543478,https://www.berliner-zeitung.de/berlin/-women-...
8,20180124,IRN,-0.594228,https://www.reuters.com/article/us-behravesh-i...
9,20180125,IRN,-0.600343,https://in.reuters.com/article/us-behravesh-ir...


In [19]:
#drop data duplicated
demoDropDup = demo.drop_duplicates()
demoDropDup

Unnamed: 0,Date,CountryCode,AvgTone,SourceURL
0,20180102,EUR,-4.347826,http://sevilla.abc.es/opinion/abci-persa-incon...
1,20180102,USA,-3.674121,http://www.lavozdigital.es/opinion/abci-persa-...
2,20180103,USA,-3.955696,http://www.alertadigital.com/2018/01/03/el-per...
3,20180111,USA,-6.851312,http://theothermccain.com/2018/01/10/google-la...
5,20180120,NLD,-1.226994,https://nos.nl/artikel/2212880-korte-pvv-demon...
6,20180120,NLD,-1.369863,https://www.rtlnieuws.nl/nederland/politiek/ho...
7,20180121,DEU,0.543478,https://www.berliner-zeitung.de/berlin/-women-...
8,20180124,IRN,-0.594228,https://www.reuters.com/article/us-behravesh-i...
9,20180125,IRN,-0.600343,https://in.reuters.com/article/us-behravesh-ir...
10,20180126,USA,-0.608696,http://www.nasdaq.com/article/commentary-the-u...


## MERGE DATA WITH COUNTRY CODE

In [20]:
demoData = demoDropDup.merge(countriesCode, on='CountryCode', how='left')
demoData.info()
demoData

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70 entries, 0 to 69
Data columns (total 5 columns):
Date           70 non-null int64
CountryCode    70 non-null object
AvgTone        70 non-null float64
SourceURL      70 non-null object
Country        67 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 3.3+ KB


Unnamed: 0,Date,CountryCode,AvgTone,SourceURL,Country
0,20180102,EUR,-4.347826,http://sevilla.abc.es/opinion/abci-persa-incon...,
1,20180102,USA,-3.674121,http://www.lavozdigital.es/opinion/abci-persa-...,United States
2,20180103,USA,-3.955696,http://www.alertadigital.com/2018/01/03/el-per...,United States
3,20180111,USA,-6.851312,http://theothermccain.com/2018/01/10/google-la...,United States
4,20180120,NLD,-1.226994,https://nos.nl/artikel/2212880-korte-pvv-demon...,Netherlands
5,20180120,NLD,-1.369863,https://www.rtlnieuws.nl/nederland/politiek/ho...,Netherlands
6,20180121,DEU,0.543478,https://www.berliner-zeitung.de/berlin/-women-...,Germany
7,20180124,IRN,-0.594228,https://www.reuters.com/article/us-behravesh-i...,Iran
8,20180125,IRN,-0.600343,https://in.reuters.com/article/us-behravesh-ir...,Iran
9,20180126,USA,-0.608696,http://www.nasdaq.com/article/commentary-the-u...,United States


## MERGE DATA WITH ACTOR CODE

In [21]:
actorCodeData = pd.read_csv('data/actorCode.csv', delimiter = ',', encoding = "ISO-8859-1")
#RENAME COLUMNS
actorCodeData.columns = ['actorCode','Actor']
#REMOVE SPACE
actorCodeData['actorCode'] = actorCodeData['actorCode'].str.strip()
actorCodeData['Actor'] = actorCodeData['Actor'].str.strip()
actorCodeData

Unnamed: 0,actorCode,Actor
0,AFG,Afghanistan
1,ABN,ethnic Albanian
2,ABW,Aruba
3,AFG,Afghanistan
4,AFGGOVTAL,Taliban (d.r.)
5,AFGREBTAL,Taliban (d.r.)
6,AFR,Africa
7,AGO,Angola
8,AGOCAB,Cabinda Enclave
9,AGOREBUNI,National Union for the Total Independence of A...


In [22]:
demoDatan = demoData.merge(actorCodeData, left_on='CountryCode', right_on='actorCode', how='left')
demoDatan

Unnamed: 0,Date,CountryCode,AvgTone,SourceURL,Country,actorCode,Actor
0,20180102,EUR,-4.347826,http://sevilla.abc.es/opinion/abci-persa-incon...,,EUR,Europe
1,20180102,USA,-3.674121,http://www.lavozdigital.es/opinion/abci-persa-...,United States,,
2,20180103,USA,-3.955696,http://www.alertadigital.com/2018/01/03/el-per...,United States,,
3,20180111,USA,-6.851312,http://theothermccain.com/2018/01/10/google-la...,United States,,
4,20180120,NLD,-1.226994,https://nos.nl/artikel/2212880-korte-pvv-demon...,Netherlands,NLD,Netherlands
5,20180120,NLD,-1.369863,https://www.rtlnieuws.nl/nederland/politiek/ho...,Netherlands,NLD,Netherlands
6,20180121,DEU,0.543478,https://www.berliner-zeitung.de/berlin/-women-...,Germany,DEU,Germany
7,20180124,IRN,-0.594228,https://www.reuters.com/article/us-behravesh-i...,Iran,IRN,Iran
8,20180125,IRN,-0.600343,https://in.reuters.com/article/us-behravesh-ir...,Iran,IRN,Iran
9,20180126,USA,-0.608696,http://www.nasdaq.com/article/commentary-the-u...,United States,,


In [23]:
#cek pengaruh MERGE DATA WITH ACTOR CODE
demoDatan[(demoDatan.Actor.notnull())&(demoDatan.Country.isnull())]

Unnamed: 0,Date,CountryCode,AvgTone,SourceURL,Country,actorCode,Actor
0,20180102,EUR,-4.347826,http://sevilla.abc.es/opinion/abci-persa-incon...,,EUR,Europe
10,20180131,SEA,-11.728395,http://www.banglanews24.com/english/national/a...,,SEA,Southeast Asia
11,20180201,SEA,-11.728395,http://www.banglanews24.com/english/national/a...,,SEA,Southeast Asia


In [33]:
#cuma SEA yg ngefek. jadi gausah make merge (malah nambah 2 kolom - actorCode &actor)
#replace aja.
demoData.loc[demoData['CountryCode'] == 'SEA', 'Country'] = 'Southeast Asia'
demoData.loc[demoData['CountryCode'] == 'EUR', 'Country'] = 'Europe'
demoData[(demoData.isnull().any(axis=1))]
#khusus replace NA
#demoData.loc[demoData.countryCode == 'SEA', 'Country'] = demoData.loc[demoData.countryCode == 'SEA', 'Country'].fillna('Southeast Asia')

Unnamed: 0,Date,CountryCode,AvgTone,SourceURL,Country


In [34]:
demoData[demoData.CountryCode=='SEA']

Unnamed: 0,Date,CountryCode,AvgTone,SourceURL,Country
10,20180131,SEA,-11.728395,http://www.banglanews24.com/english/national/a...,Southeast Asia
11,20180201,SEA,-11.728395,http://www.banglanews24.com/english/national/a...,Southeast Asia


In [35]:
demoData[demoData.CountryCode=='SEA'].Country

10    Southeast Asia
11    Southeast Asia
Name: Country, dtype: object

# DEVELOPING/DEVELOPED COUNTRY

In [36]:
devC = pd.read_csv('data/developingCountries.csv', delimiter = ',', encoding = "ISO-8859-1")
devC['CountryCat'] = 'developing'
devC.drop('Country', axis =1, inplace =True)
devC.dropna(inplace=True)
devC

Unnamed: 0,CountryCode,CountryCat
0,AFG,developing
1,ALB,developing
2,DZA,developing
3,ASM,developing
4,AGO,developing
5,ARG,developing
6,ARM,developing
7,AZE,developing
8,BGD,developing
9,BLR,developing


In [37]:
demo = demoData.merge(devC, on=['CountryCode'], how='left')
demo.loc[demo.CountryCode.notnull(), 'CountryCat'] = demo.loc[demo.CountryCode.notnull(), 'CountryCat'].fillna('developed')
demo['HRTopic'] = 'demo'
demo = demo[['Date','Country','CountryCode','CountryCat','HRTopic','AvgTone', 'SourceURL']]

In [38]:
demoDropNA = demo.dropna()
print('jumlah data demo =', len(demoDropNA))
demoDropNA

jumlah data demo = 71


Unnamed: 0,Date,Country,CountryCode,CountryCat,HRTopic,AvgTone,SourceURL
0,20180102,Europe,EUR,developed,demo,-4.347826,http://sevilla.abc.es/opinion/abci-persa-incon...
1,20180102,United States,USA,developed,demo,-3.674121,http://www.lavozdigital.es/opinion/abci-persa-...
2,20180103,United States,USA,developed,demo,-3.955696,http://www.alertadigital.com/2018/01/03/el-per...
3,20180111,United States,USA,developed,demo,-6.851312,http://theothermccain.com/2018/01/10/google-la...
4,20180120,Netherlands,NLD,developed,demo,-1.226994,https://nos.nl/artikel/2212880-korte-pvv-demon...
5,20180120,Netherlands,NLD,developed,demo,-1.369863,https://www.rtlnieuws.nl/nederland/politiek/ho...
6,20180121,Germany,DEU,developed,demo,0.543478,https://www.berliner-zeitung.de/berlin/-women-...
7,20180124,Iran,IRN,developing,demo,-0.594228,https://www.reuters.com/article/us-behravesh-i...
8,20180125,Iran,IRN,developing,demo,-0.600343,https://in.reuters.com/article/us-behravesh-ir...
9,20180126,United States,USA,developed,demo,-0.608696,http://www.nasdaq.com/article/commentary-the-u...


In [39]:
demoDropNA.to_csv('MergingData/DataDemo.csv', index=False)