# MUC_keys_persons

Extract Person keys from the manually tagged MUC 3 corpus.

References:
https://github.com/dstl/muc3


In [171]:
# Imports

import csv, re

import pandas as pd

In [44]:
# Load the tst1 answer-key file and split its text into individual lines

keyfilepath = "C:\\Users\\rothw\\Documents\\MUC Data\\MUC3\\muc34\\TASK\\CORPORA\\key-tst1.v7"

# Read inside a context manager so the file handle is closed as soon as the
# text has been loaded (a bare open(...).read() leaves it open).
with open(keyfilepath, 'r') as f:
    keyfile = f.read()

split_by_line = keyfile.split('\n')

# split_by_line

[';;; gk deleted murder 2 (redundant with 1)',
 '0.  MESSAGE: ID                     TST1-MUC3-0001',
 '1.  MESSAGE: TEMPLATE               1',
 '2.  INCIDENT: DATE                  02 FEB 90',
 '3.  INCIDENT: LOCATION              GUATEMALA: SANTO TOMAS (FARM)',
 '4.  INCIDENT: TYPE                  ATTACK',
 '5.  INCIDENT: STAGE OF EXECUTION    ACCOMPLISHED',
 '6.  INCIDENT: INSTRUMENT ID         -',
 '7.  INCIDENT: INSTRUMENT TYPE       -',
 '8.  PERP: INCIDENT CATEGORY         TERRORIST ACT',
 '9.  PERP: INDIVIDUAL ID             "GUERRILLA COLUMN" / "GUERRILLAS"',
 '10. PERP: ORGANIZATION ID           "GUATEMALAN NATIONAL REVOLUTIONARY UNITY" / "URNG"',
 '11. PERP: ORGANIZATION CONFIDENCE   REPORTED AS FACT / CLAIMED OR ADMITTED: "GUATEMALAN NATIONAL REVOLUTIONARY UNITY" / "URNG"',
 '12. PHYS TGT: ID                    "\\"SANTO TOMAS\\" PRESIDENTIAL FARM" / "PRESIDENTIAL FARM"',
 '13. PHYS TGT: TYPE                  GOVERNMENT OFFICE OR RESIDENCE: "\\"SANTO TOMAS\\" PRESIDENTIAL 

In [93]:
# Function to extract the Persons - from PERP: INDIVIDUAL ID lines and HUM TGT: NAME lines


def extract_person_keys(keyfilepath):
    """Extract person entries from a MUC answer-key file.

    The file is scanned row by row:

    * a ``MESSAGE: ID`` row sets the message id attached to later entries;
    * a ``PERP: INDIVIDUAL ID`` or ``HUM TGT: NAME`` row contributes its value
      as a person unless the value is ``*`` or ``-``;
    * rows directly following a filled person slot are taken as further values
      of that slot until a blank row, a new ``MESSAGE: ID`` row, or the next
      slot (``10.`` after a perpetrator row, ``19.`` after a target row).

    Values are kept exactly as written in the file (whitespace-normalised,
    quotes included).

    Args:
        keyfilepath: Path of the key file to read.

    Returns:
        A two-element list ``[targets, perpetrators]``. Each element is a list
        of ``[message_id, person]`` pairs in file order. ``message_id`` is
        ``None`` for an entry that appears before any ``MESSAGE: ID`` row.
    """
    # Slot numbers that come right after the two person slots; reaching one
    # means the (possibly multi-line) value of the preceding slot has ended.
    slot_after_perp = "10."
    slot_after_tgt = "19."
    # Placeholder values that mark a slot as not filled in.
    no_value = ('*', '-')

    # Context manager guarantees the handle is closed once the text is read.
    with open(keyfilepath, 'r') as key_file:
        split_by_line = key_file.read().split('\n')

    persons_perps = []
    persons_tgts = []
    previous_row = "Other"
    messageid = None

    for row in split_by_line:
        tokens = row.split()

        # Slice comparisons are safe on rows of any length, whereas indexing
        # tokens[2] / tokens[3] raises IndexError on short rows.
        if tokens[1:3] == ["MESSAGE:", "ID"]:
            messageid = ' '.join(tokens[3:])
            # A new message always ends any pending multi-line slot value.
            previous_row = "Other"
        elif tokens[1:4] == ["PERP:", "INDIVIDUAL", "ID"]:
            person = ' '.join(tokens[4:])
            if person in no_value:
                previous_row = "Other"
            else:
                persons_perps.append([messageid, person])
                previous_row = "Perp"
        elif tokens[1:4] == ["HUM", "TGT:", "NAME"]:
            person = ' '.join(tokens[4:])
            if person in no_value:
                previous_row = "Other"
            else:
                persons_tgts.append([messageid, person])
                previous_row = "Tgt"
        elif tokens and previous_row == "Perp" and tokens[0] != slot_after_perp:
            # Continuation row of a perpetrator slot; single-token rows such
            # as a lone quoted surname are accepted too.
            persons_perps.append([messageid, ' '.join(tokens)])
        elif tokens and previous_row == "Tgt" and tokens[0] != slot_after_tgt:
            # Continuation row of a human-target slot.
            persons_tgts.append([messageid, ' '.join(tokens)])
        else:
            # Blank row, the slot after a person slot, or any unrelated row.
            previous_row = "Other"

    return [persons_tgts, persons_perps]
                
# Run the extractor over each of the four key files (tst1 .. tst4)
persons1, persons2, persons3, persons4 = (
    extract_person_keys(key_path)
    for key_path in (
        "C:\\Users\\rothw\\Documents\\MUC Data\\MUC3\\muc34\\TASK\\CORPORA\\key-tst1.v7",
        "C:\\Users\\rothw\\Documents\\MUC Data\\MUC3\\muc34\\TASK\\CORPORA\\key-tst2.v4",
        "C:\\Users\\rothw\\Documents\\MUC Data\\MUC3\\muc34\\TASK\\CORPORA\\key-tst3.v2",
        "C:\\Users\\rothw\\Documents\\MUC Data\\MUC3\\muc34\\TASK\\CORPORA\\key-tst4.v2",
    )
)


In [156]:
# Check some output HUMAN TGTs: element 0 of the extractor's result is the
# list of [message id, name] pairs taken from HUM TGT: NAME rows (tst1 file here)

persons1[0]

[['TST1-MUC3-0001', '"CEREZO"'],
 ['TST1-MUC3-0006', '"ANTONIO ROLDAN BETANCUR"'],
 ['TST1-MUC3-0011', '"ORLANDO LETELIER"'],
 ['TST1-MUC3-0011', '"RONNIE MOFFIT"'],
 ['TST1-MUC3-0018', '"HECTOR OQUELI COLINDRES"'],
 ['TST1-MUC3-0018', '"HILDA FLORES"'],
 ['TST1-MUC3-0024', '"MIGUEL MAZA MARQUEZ"'],
 ['TST1-MUC3-0030', '"CARLOS GALAN"'],
 ['TST1-MUC3-0037', '"ERIKA SULLIVAN"'],
 ['TST1-MUC3-0037', '"CHRISTOPHER ANDERSON"'],
 ['TST1-MUC3-0037', '"NADINE ELKASHES"'],
 ['TST1-MUC3-0038', '"ERNESTINA UMANZOR"'],
 ['TST1-MUC3-0046', '"MAURICIO GUTIERREZ CASTRO"'],
 ['TST1-MUC3-0046', '"ANTONIO RODRIGUEZ PORTH"'],
 ['TST1-MUC3-0046', '"EDGAR CHACON"'],
 ['TST1-MUC3-0049', '"BERNARDO JARAMILLO OSSA"'],
 ['TST1-MUC3-0050', '"IGNACIO ELLACURIA"'],
 ['TST1-MUC3-0059', '"JOSE ANTONIO RODRIGUEZ PORTH"'],
 ['TST1-MUC3-0061', '"JAIME PARDO LEAL"'],
 ['TST1-MUC3-0065', '"ROBERTO ROASCIO"'],
 ['TST1-MUC3-0065', '"MARIO ACCURSO"'],
 ['TST1-MUC3-0065', '"FRANCISCO PUJA"'],
 ['TST1-MUC3-0069', '"JOSE IGN

In [146]:
# Combine the HUM TGT names from all four key files and de-duplicate them

# Element 0 of each extractor result is its list of [message id, name] target pairs
person_tgts = persons1[0] + persons2[0] + persons3[0] + persons4[0]

# Name the columns at construction time: assigning .columns afterwards fails
# with a length mismatch when no targets were extracted (the frame then has
# zero columns), whereas columns= yields an empty, correctly labelled frame.
tgts_df = pd.DataFrame(person_tgts, columns=['Message_id', 'Entity'])

# Keep each distinct name once, in alphabetical order
tgts_dedup_df = tgts_df['Entity'].drop_duplicates().sort_values()

tgts_dedup_df

136                                       "ABILIO DINIZ"
176                                      "ADOLFO SPEZUA"
181                              "ALBERTO CELIS SANCHEZ"
152                          "ALEXANDER MOLINA GRANADOS"
28                                     "ALFONSO DE LIMA"
183                                   "ALFREDO CHAMORRO"
131                                  "ALFREDO CRISTIANI"
110                            "ALVARO GONZALEZ SANTANA"
126                                       "AMANDO LOPEZ"
31                           "ANGELA PIEDAD DE GUERRERO"
13                             "ANTONIO RODRIGUEZ PORTH"
1                              "ANTONIO ROLDAN BETANCUR"
112                                     "ARNULFO ROMERO"
50                               "ARTURO RIVERA Y DAMAS"
161                                   "AUGUSTO PINOCHET"
177                                 "AUGUSTO VILCAHUMAN"
65                                   "BARTOLO RODRIGUEZ"
113                            

In [137]:
# Manually review PERPs, to remove generic ones

# persons1[1]
# persons2[1]
# persons3[1]
# persons4[1]

In [None]:
# Perps to keep

In [148]:
# Hand-picked perpetrator names to keep; repeated entries are left in here
# and removed when the list is de-duplicated further down.
perps = [
    '"PABLO ESCOBAR GAVIRIA"',
    '"GONZALO RODRIGUEZ GACHA"',
    '"GONZALO DE JESUS PEREZ"',
    '"HENRY PEREZ"',
    '"MARCELO PEREZ"',
    '"COLONEL HECTOR HERIBERTO HERNANDEZ"',
    '"CAPTAIN ALONSO CHAVEZ GARCIA"',
    '"COL VARGAS"',
    '"GONZALO RODRIGUEZ GACHA"',
    '"ROBERTO D\'AUBUISSON"',
    '"CAPTAIN ALVARO SARAVIA"',
    '"COLONEL PONCE"',
    '"RUTH ESPERANZA AGUILAR MARROQUIN"',
    '"ADOLFO MESA MENESES"',
    '"COLONEL GUILLERMO ALFREDO BENAVIDES MORENO"',
    '"FIDEL CASTANO"',
    '"HENRY PEREZ"',
    '"JAIRO ALBERTO RESTREPO POSADA"',
    '"GONZALO RODRIGUEZ GACHA"',
    '"LIEUTENANT PACHECHO"',
    '"COLONEL CANAS"',
    '"COLONAL ORLANDO MONTANO"',
    '"COLONEL RENE EMILIO PONCE"',
    '"COLONEL HERIBERTO HERNANDEZ"',
    '"CAPTAIN ALFONSO CHAVEZ GARCIA"',
    '"COL ORLANDO CEPEDA"',
    '"COL RENE EMILIO PONCE"',
    '"COL GUILLERMO BENAVIDES"',
    '"ALFREDO CRISTIANI"',
    '"GERARDO OLIVOS SILVA"',
    '"JOSE JESUS PENA"',
    '"COL EMILIO PONCE"',
    '"JUBIZ HAZVUMB"',
    '"PRESIDENT CRISTIANI"',
    '"COMMANDER AMILCAR"',
    '"FACUNDO GUARDADO"',
    '"COMMANDER ESTEBAN"',
    '"GONZALO RODRIGUEZ GACHA"',
    '"PRESIDENT ALFREDO CRISTIANI"',
    '"CAPTAIN ALVARO SANABRIA"',
    '"VICTOR DIAZ CARO"',
]

perps

['"PABLO ESCOBAR GAVIRIA"',
 '"GONZALO RODRIGUEZ GACHA"',
 '"GONZALO DE JESUS PEREZ"',
 '"HENRY PEREZ"',
 '"MARCELO PEREZ"',
 '"COLONEL HECTOR HERIBERTO HERNANDEZ"',
 '"CAPTAIN ALONSO CHAVEZ GARCIA"',
 '"COL VARGAS"',
 '"GONZALO RODRIGUEZ GACHA"',
 '"ROBERTO D\'AUBUISSON"',
 '"CAPTAIN ALVARO SARAVIA"',
 '"COLONEL PONCE"',
 '"RUTH ESPERANZA AGUILAR MARROQUIN"',
 '"ADOLFO MESA MENESES"',
 '"COLONEL GUILLERMO ALFREDO BENAVIDES MORENO"',
 '"FIDEL CASTANO"',
 '"HENRY PEREZ"',
 '"JAIRO ALBERTO RESTREPO POSADA"',
 '"GONZALO RODRIGUEZ GACHA"',
 '"LIEUTENANT PACHECHO"',
 '"COLONEL CANAS"',
 '"COLONAL ORLANDO MONTANO"',
 '"COLONEL RENE EMILIO PONCE"',
 '"COLONEL HERIBERTO HERNANDEZ"',
 '"CAPTAIN ALFONSO CHAVEZ GARCIA"',
 '"COL ORLANDO CEPEDA"',
 '"COL RENE EMILIO PONCE"',
 '"COL GUILLERMO BENAVIDES"',
 '"ALFREDO CRISTIANI"',
 '"GERARDO OLIVOS SILVA"',
 '"JOSE JESUS PENA"',
 '"COL EMILIO PONCE"',
 '"JUBIZ HAZVUMB"',
 '"PRESIDENT CRISTIANI"',
 '"COMMANDER AMILCAR"',
 '"FACUNDO GUARDADO"',
 '"COMMA

In [149]:
# Put the hand-picked perpetrator names into a frame, then de-duplicate and sort them

perps_df = pd.DataFrame(perps)

# The frame was built from a flat list of strings, so its only column is labelled 0
unique_perps = perps_df[0].drop_duplicates()
perps_dedup_df = unique_perps.sort_values()

perps_dedup_df

13                           "ADOLFO MESA MENESES"
28                             "ALFREDO CRISTIANI"
24                 "CAPTAIN ALFONSO CHAVEZ GARCIA"
6                   "CAPTAIN ALONSO CHAVEZ GARCIA"
39                       "CAPTAIN ALVARO SANABRIA"
10                        "CAPTAIN ALVARO SARAVIA"
31                              "COL EMILIO PONCE"
27                       "COL GUILLERMO BENAVIDES"
25                            "COL ORLANDO CEPEDA"
26                         "COL RENE EMILIO PONCE"
7                                     "COL VARGAS"
21                       "COLONAL ORLANDO MONTANO"
20                                 "COLONEL CANAS"
14    "COLONEL GUILLERMO ALFREDO BENAVIDES MORENO"
5             "COLONEL HECTOR HERIBERTO HERNANDEZ"
23                   "COLONEL HERIBERTO HERNANDEZ"
11                                 "COLONEL PONCE"
22                     "COLONEL RENE EMILIO PONCE"
34                             "COMMANDER AMILCAR"
36                             

In [150]:
# Stack the de-duplicated target names on top of the de-duplicated perpetrator names

series_to_stack = [tgts_dedup_df, perps_dedup_df]
persons_all = pd.concat(series_to_stack)

persons_all

136                                  "ABILIO DINIZ"
176                                 "ADOLFO SPEZUA"
181                         "ALBERTO CELIS SANCHEZ"
152                     "ALEXANDER MOLINA GRANADOS"
28                                "ALFONSO DE LIMA"
183                              "ALFREDO CHAMORRO"
131                             "ALFREDO CRISTIANI"
110                       "ALVARO GONZALEZ SANTANA"
126                                  "AMANDO LOPEZ"
31                      "ANGELA PIEDAD DE GUERRERO"
13                        "ANTONIO RODRIGUEZ PORTH"
1                         "ANTONIO ROLDAN BETANCUR"
112                                "ARNULFO ROMERO"
50                          "ARTURO RIVERA Y DAMAS"
161                              "AUGUSTO PINOCHET"
177                            "AUGUSTO VILCAHUMAN"
65                              "BARTOLO RODRIGUEZ"
113                                  "BEATRIZ IERO"
15                        "BERNARDO JARAMILLO OSSA"
53          

In [174]:
# Remove names that occur in both the target and perpetrator series,
# keeping the first occurrence of each

already_seen = persons_all.duplicated()
persons_all_dedup = persons_all[~already_seen]

persons_all_dedup

136                                  "ABILIO DINIZ"
176                                 "ADOLFO SPEZUA"
181                         "ALBERTO CELIS SANCHEZ"
152                     "ALEXANDER MOLINA GRANADOS"
28                                "ALFONSO DE LIMA"
183                              "ALFREDO CHAMORRO"
131                             "ALFREDO CRISTIANI"
110                       "ALVARO GONZALEZ SANTANA"
126                                  "AMANDO LOPEZ"
31                      "ANGELA PIEDAD DE GUERRERO"
13                        "ANTONIO RODRIGUEZ PORTH"
1                         "ANTONIO ROLDAN BETANCUR"
112                                "ARNULFO ROMERO"
50                          "ARTURO RIVERA Y DAMAS"
161                              "AUGUSTO PINOCHET"
177                            "AUGUSTO VILCAHUMAN"
65                              "BARTOLO RODRIGUEZ"
113                                  "BEATRIZ IERO"
15                        "BERNARDO JARAMILLO OSSA"
53          

In [160]:
# Write the combined, de-duplicated names to a CSV file without the index column

output_csv_path = "C:\\Users\\rothw\\Documents\\MUC Data\\MUC3\\MUC_keys_targets_and_perps.csv"
persons_all_dedup.to_csv(output_csv_path, index=False, na_rep=None)

### Scrap code

In [24]:
# Scrap: confirm that splitting the key-file text on newlines produced a list
type(split_by_line)

list

In [19]:
# Scrap: count the segments produced by splitting the key-file text on newlines
len(split_by_line)

3745

In [38]:
# Scrap: show the whitespace-separated tokens of every key-file line
for line_tokens in map(str.split, split_by_line):
    print(line_tokens)

[';;;', 'gk', 'deleted', 'murder', '2', '(redundant', 'with', '1)']
['0.', 'MESSAGE:', 'ID', 'TST1-MUC3-0001']
['1.', 'MESSAGE:', 'TEMPLATE', '1']
['2.', 'INCIDENT:', 'DATE', '02', 'FEB', '90']
['3.', 'INCIDENT:', 'LOCATION', 'GUATEMALA:', 'SANTO', 'TOMAS', '(FARM)']
['4.', 'INCIDENT:', 'TYPE', 'ATTACK']
['5.', 'INCIDENT:', 'STAGE', 'OF', 'EXECUTION', 'ACCOMPLISHED']
['6.', 'INCIDENT:', 'INSTRUMENT', 'ID', '-']
['7.', 'INCIDENT:', 'INSTRUMENT', 'TYPE', '-']
['8.', 'PERP:', 'INCIDENT', 'CATEGORY', 'TERRORIST', 'ACT']
['9.', 'PERP:', 'INDIVIDUAL', 'ID', '"GUERRILLA', 'COLUMN"', '/', '"GUERRILLAS"']
['10.', 'PERP:', 'ORGANIZATION', 'ID', '"GUATEMALAN', 'NATIONAL', 'REVOLUTIONARY', 'UNITY"', '/', '"URNG"']
['11.', 'PERP:', 'ORGANIZATION', 'CONFIDENCE', 'REPORTED', 'AS', 'FACT', '/', 'CLAIMED', 'OR', 'ADMITTED:', '"GUATEMALAN', 'NATIONAL', 'REVOLUTIONARY', 'UNITY"', '/', '"URNG"']
['12.', 'PHYS', 'TGT:', 'ID', '"\\"SANTO', 'TOMAS\\"', 'PRESIDENTIAL', 'FARM"', '/', '"PRESIDENTIAL', 'FARM"']


In [136]:
# Scrap: number of entries in the hand-picked perpetrator list (repeats included)
len(perps)

41

In [120]:
# Scrap: display the combined [message id, name] target pairs from all four key files
person_tgts

[['TST1-MUC3-0001', '"CEREZO"'],
 ['TST1-MUC3-0006', '"ANTONIO ROLDAN BETANCUR"'],
 ['TST1-MUC3-0011', '"ORLANDO LETELIER"'],
 ['TST1-MUC3-0011', '"RONNIE MOFFIT"'],
 ['TST1-MUC3-0018', '"HECTOR OQUELI COLINDRES"'],
 ['TST1-MUC3-0018', '"HILDA FLORES"'],
 ['TST1-MUC3-0024', '"MIGUEL MAZA MARQUEZ"'],
 ['TST1-MUC3-0030', '"CARLOS GALAN"'],
 ['TST1-MUC3-0037', '"ERIKA SULLIVAN"'],
 ['TST1-MUC3-0037', '"CHRISTOPHER ANDERSON"'],
 ['TST1-MUC3-0037', '"NADINE ELKASHES"'],
 ['TST1-MUC3-0038', '"ERNESTINA UMANZOR"'],
 ['TST1-MUC3-0046', '"MAURICIO GUTIERREZ CASTRO"'],
 ['TST1-MUC3-0046', '"ANTONIO RODRIGUEZ PORTH"'],
 ['TST1-MUC3-0046', '"EDGAR CHACON"'],
 ['TST1-MUC3-0049', '"BERNARDO JARAMILLO OSSA"'],
 ['TST1-MUC3-0050', '"IGNACIO ELLACURIA"'],
 ['TST1-MUC3-0059', '"JOSE ANTONIO RODRIGUEZ PORTH"'],
 ['TST1-MUC3-0061', '"JAIME PARDO LEAL"'],
 ['TST1-MUC3-0065', '"ROBERTO ROASCIO"'],
 ['TST1-MUC3-0065', '"MARIO ACCURSO"'],
 ['TST1-MUC3-0065', '"FRANCISCO PUJA"'],
 ['TST1-MUC3-0069', '"JOSE IGN

In [154]:
# Scrap: check the type returned by pd.concat for the two de-duplicated series
type(persons_all)

pandas.core.series.Series