forked from embruna/Editorial-Board-Geography
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Name.check.R
104 lines (89 loc) · 5.09 KB
/
Name.check.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#' Find near-duplicate names (likely misspellings or name variants)
#'
#' Lower-cases the supplied names, reduces them to unique values, pairs every
#' name with the other names that `agrep()` considers an approximate match,
#' and scores each pair with `RecordLinkage::levenshteinSim()` and
#' `RecordLinkage::levenshteinDist()`. For example "abrams" / "abrasm" or
#' "e bruna" / "emilio bruna" end up as candidate pairs that can then be
#' reviewed and corrected by hand.
#'
#' @param DataToClean Not used in the computation. Retained so that existing
#'   calls of the form `Name.check(data, data$column)` keep working.
#' @param NameComboColumn Vector of names to compare (character or factor),
#'   e.g. a "first last" column. `NA` and empty strings are ignored because
#'   `agrep()` cannot use them as a pattern.
#' @return A data.frame with one row per unordered pair of similar but
#'   non-identical names and the columns `index`, `Name1`, `Name2`,
#'   `Name_sim` (proportional similarity) and `Name_dist` (number of changes
#'   needed to turn one name into the other). Rows are sorted from most to
#'   least similar; `index` is the row number assigned while the pairs were
#'   still in alphabetical order, i.e. before that final sort. Row names are
#'   reset to 1..n. A zero-row data.frame with the same columns is returned
#'   when no near-duplicates are found.
Name.check <- function(DataToClean, NameComboColumn) {
  # Background reading on near-duplicate detection:
  # http://stackoverflow.com/questions/6683380/techniques-for-finding-near-duplicate-records
  # https://cran.r-project.org/web/packages/RecordLinkage/RecordLinkage.pdf
  # https://cran.r-project.org/web/packages/stringdist/stringdist.pdf
  # https://journal.r-project.org/archive/2010-2/RJournal_2010-2_Sariyar+Borg.pdf
  # http://stackoverflow.com/questions/11535625/similarity-scores-based-on-string-comparison-in-r-edit-distance
  # http://stackoverflow.com/questions/28952034/finding-partial-matches-on-strings-in-r

  # Shape of the result when there is nothing to report. Returned before any
  # package function is called so degenerate input never errors.
  empty_result <- data.frame(
    index = integer(0),
    Name1 = character(0),
    Name2 = character(0),
    Name_sim = numeric(0),
    Name_dist = integer(0),
    stringsAsFactors = FALSE
  )

  # Lower-case so that case differences are not reported as mismatches, then
  # keep each distinct name once.
  check_names <- unique(tolower(as.character(NameComboColumn)))
  # agrep() stops on an NA or empty pattern, so drop those up front.
  check_names <- check_names[!is.na(check_names) & nzchar(check_names)]
  if (length(check_names) < 2L) {
    return(empty_result)
  }

  # For every name, collect all names that approximately match it. lapply()
  # always returns a list; sapply() would collapse to a matrix whenever every
  # name happens to have the same number of matches.
  matches <- lapply(
    check_names,
    function(nm) agrep(nm, check_names, value = TRUE)
  )

  # Long format: one row per (name, approximate match) combination.
  pairs <- data.frame(
    Name1 = rep(check_names, lengths(matches)),
    Name2 = unlist(matches, use.names = FALSE),
    stringsAsFactors = FALSE
  )

  # A name always matches itself; those rows carry no information.
  pairs <- pairs[pairs$Name1 != pairs$Name2, , drop = FALSE]
  if (nrow(pairs) == 0L) {
    return(empty_result)
  }

  # Alphabetical order, so the row kept by the de-duplication below is the
  # first one in this ordering.
  pairs <- dplyr::arrange(pairs, Name1, Name2)

  # The pairwise search reports "e bruna vs emilio bruna" and
  # "emilio bruna vs e bruna" as separate rows although they are the same
  # comparison. Build an order-independent key and keep the first occurrence.
  pair_key <- data.frame(
    lo = pmin(pairs$Name1, pairs$Name2),
    hi = pmax(pairs$Name1, pairs$Name2),
    stringsAsFactors = FALSE
  )
  pairs <- pairs[!duplicated(pair_key), , drop = FALSE]

  # Proportional similarity and number of changes needed to go from one name
  # to the other (package RecordLinkage).
  pairs$Name_sim <- RecordLinkage::levenshteinSim(pairs$Name1, pairs$Name2)
  pairs$Name_dist <- RecordLinkage::levenshteinDist(pairs$Name1, pairs$Name2)

  # Row identifier; seq_len() is safe where seq.int(0) would give c(1, 0).
  pairs$index <- seq_len(nrow(pairs))
  pairs <- pairs[, c("index", "Name1", "Name2", "Name_sim", "Name_dist")]

  # Most similar pairs first: those needing only a few changes are the most
  # likely to be a small spelling mistake or a difference in initials.
  pairs <- dplyr::arrange(pairs, dplyr::desc(Name_sim))
  rownames(pairs) <- NULL
  pairs
}