# Checks for the correctness of the triadic closure code. 

In [1]:
import pandas as pd

In [55]:
# Build df_matched: one row per (MAGAID, RetractionYear, Record ID, ScientistType)
# covering every retracted author and every matched control across the three
# matching-tolerance files.

# NOTE(review): absolute, user-specific path -- the notebook will not run on
# another machine without editing this line.
indir = "/Users/sm9654/desktop/NYUAD/nyuad-research/retraction_openalex/retraction_effects_on_academic_careers/data/processed/"
indir_matching = indir+"/author_matching/"

# Filtered sample, used only to look up RetractionYear per 'Record ID'
df_filtered = pd.read_csv(indir+"RW_authors_w_confounders_filteredSample_postNHB_BedoorsCorrections_Augmented.csv",
                          usecols=['Record ID','RetractionYear']).drop_duplicates()


def _read_matches(path):
    """Read one matching file and attach RetractionYear.

    Keeps only the MAGAID / MatchMAGAID / 'Record ID' columns, drops duplicate
    rows, and inner-joins with df_filtered on 'Record ID'.
    """
    return (pd.read_csv(path, usecols=['MAGAID','MatchMAGAID','Record ID'])
              .drop_duplicates()
              .merge(df_filtered, on='Record ID'))


def _split_by_role(df_matches):
    """Split a matching frame into (retracted, matched) long-format frames.

    Both frames have columns MAGAID, RetractionYear, 'Record ID', ScientistType;
    in the matched frame MAGAID holds the original MatchMAGAID value.
    """
    retracted = (df_matches[['MAGAID','RetractionYear','Record ID']]
                 .drop_duplicates()
                 .assign(ScientistType='retracted'))
    matched = (df_matches[['MatchMAGAID','RetractionYear','Record ID']]
               .drop_duplicates()
               .rename(columns={'MatchMAGAID':'MAGAID'})
               .assign(ScientistType='matched'))
    return retracted, matched


# Now let us read the MAGAIDs (and MatchMAGAIDs) for each tolerance.
# NOTE(review): the 0.3 file is read from a "closestMatch" subdirectory while
# the 0.1 and 0.2 files are read from the directory root -- confirm this is intended.
df10_magaids = _read_matches(indir_matching+"/closestAverageMatch_tolerance_0.1_w_0.8.csv")
df20_magaids = _read_matches(indir_matching+"/closestAverageMatch_tolerance_0.2_w_0.8.csv")
df30_magaids = _read_matches(indir_matching+"/closestMatch/closestAverageMatch_tolerance_0.3_w_0.8.csv")

# Processing 10%, 20% and 30%
df10_magaids_t, df10_magaids_c = _split_by_role(df10_magaids)
df20_magaids_t, df20_magaids_c = _split_by_role(df20_magaids)
df30_magaids_t, df30_magaids_c = _split_by_role(df30_magaids)

# Now we have list of all matched treatment and control alongside retraction year
# Note: Control can be one to many i.e. one matched to multiple
# Note: The result is sorted by MAGAID in ascending order
df_matched = pd.concat([df10_magaids_t, df10_magaids_c,
                        df20_magaids_t, df20_magaids_c,
                        df30_magaids_t, df30_magaids_c])\
               .drop_duplicates()\
               .sort_values(by='MAGAID')

df_matched.MAGAID.nunique()

33929

In [56]:
# Inspect the matched (control) scientists built from the tolerance-0.1 matching file
df10_magaids_c

Unnamed: 0,MAGAID,RetractionYear,Record ID,ScientistType
0,2501851682,2012,3444,matched
1,2320922672,2012,3444,matched
2,2025227735,2014,3344,matched
3,2135330004,2014,3344,matched
4,1778874809,2014,3344,matched
...,...,...,...,...
30671,3084947088,2015,2068,matched
30672,2005700130,2013,2714,matched
30673,2331341047,2013,17031,matched
30674,2484595244,2011,19664,matched


In [57]:
# df_matched now lists every MAGAID (retracted and matched) for which triads must be extracted.

# Load the first-degree collaborators file
indir_1dcollabs = "../../data/main/rematching/"
collab_csv = indir_1dcollabs+"/RW_MAGcollaborators_1stDegree_rematching_woPapersCitationsCollaborators_wCollabYear_le2020_closestMatch30.csv"
df_1d_collaborators = pd.read_csv(collab_csv)

# Sanity check: restricting to scientists present in df_matched is expected
# to be a no-op, so the filtered frame must keep every row.
is_in_analysis = df_1d_collaborators['MAGAID'].isin(df_matched['MAGAID'].unique())
df_1d_collaborators_relevant = df_1d_collaborators[is_in_analysis]

assert len(df_1d_collaborators_relevant) == len(df_1d_collaborators)




In [58]:
# Attach collaborator rows to each matched scientist so that retraction year
# and collaboration year sit side by side (MAGAID, MAGCollabAID, RetractionYear,
# MAGCollaborationYear). The left join keeps scientists that have no collaborator rows.
df_merged = pd.merge(df_matched, df_1d_collaborators_relevant, how='left', on='MAGAID')

In [60]:
def _format_without_decimals(x):
    """Render a float with no decimal places (so large ids are not shown in scientific notation)."""
    return '%.0f' % x


pd.set_option('display.float_format', _format_without_decimals)

# Scientist type of every merged row that has no collaborator id
missing_collaborator = df_merged['MAGCollabAID'].isna()
df_merged.loc[missing_collaborator, 'ScientistType_x']

1754       matched
1755       matched
8349       matched
10409      matched
14137      matched
            ...   
1183604    matched
1183732    matched
1183929    matched
1183934    matched
1183955    matched
Name: ScientistType_x, Length: 9479, dtype: object

In [61]:
# Check whether the scientists without collaborators are the ones listed as
# having no collaboration year. The file has no header row; its first column
# is labelled MAGAID.
df_invalid = pd.read_csv("../../data/main/rematching/magaids_wo_collabYear.txt", sep="\t", header=None)
df_invalid = df_invalid.rename(columns={0:'MAGAID'})

df_invalid['MAGAID'].nunique()


141333

In [62]:
# Rows of the merged table that have no collaborator id
has_no_collaborator = df_merged['MAGCollabAID'].isna()
df_merged_noCollab = df_merged[has_no_collaborator]

# Of those, the rows whose MAGAID appears in the "no collaboration year" list
listed_as_invalid = df_merged_noCollab['MAGAID'].isin(df_invalid['MAGAID'].unique())
df_merged_noCollab[listed_as_invalid]

Unnamed: 0,MAGAID,RetractionYear,Record ID,ScientistType_x,ScientistType_y,MAGCollaborationYear,MAGCollabAID


In [67]:
# Count the no-collaborator rows per scientist type. display() is required
# because Jupyter only renders a cell's last expression automatically, so
# without it this result would be computed and silently discarded.
display(df_merged_noCollab['ScientistType_x'].value_counts())

# Then show the no-collaborator rows themselves
df_merged_noCollab

Unnamed: 0,MAGAID,RetractionYear,Record ID,ScientistType_x,ScientistType_y,MAGCollaborationYear,MAGCollabAID
1754,7103970,2013,7243,matched,,,
1755,8017455,2012,6703,matched,,,
8349,58126041,2012,4506,matched,,,
10409,74280016,2006,8004,matched,,,
14137,120183719,2005,4225,matched,,,
...,...,...,...,...,...,...,...
1183604,3175547219,2002,16547,matched,,,
1183732,3176086515,2001,4329,matched,,,
1183929,3176847651,2012,2242,matched,,,
1183934,3176890423,1997,2515,matched,,,


In [32]:
# MAGAIDs of the rows without any collaborator (compare dtype with df_invalid.MAGAID)
df_merged_noCollab['MAGAID']

1754         7103970
1755         8017455
8349        58126041
10409       74280016
14137      120183719
             ...    
1183602   3175547219
1183730   3176086515
1183927   3176847651
1183932   3176890423
1183953   3177017899
Name: MAGAID, Length: 9479, dtype: float64

In [33]:
# MAGAIDs read from the "no collaboration year" file
df_invalid['MAGAID']

0        2134517984
1        2413204075
2        2428891613
3        2165675200
4        2119992199
            ...    
141328   1832365033
141329   3035242766
141330   2697112656
141331   2223701916
141332   2598070457
Name: MAGAID, Length: 141333, dtype: float64

In [51]:
# Look at the matching rows that involve a no-collaborator MAGAID, either as
# the retracted author or as the match, and summarise their collaborator counts.
df3 = pd.read_csv("../matching/closestMatch/closestAverageMatch_tolerance_0.3_w_0.8.csv")

no_collab_ids = df_merged_noCollab['MAGAID'].unique()
involves_no_collab = df3['MAGAID'].isin(no_collab_ids) | df3['MatchMAGAID'].isin(no_collab_ids)

df3.loc[involves_no_collab, 'MAGCumCollaborators'].describe()



count   9800
mean       0
std        0
min        0
25%        0
50%        0
75%        0
max        0
Name: MAGCumCollaborators, dtype: float64

In [52]:
# Same selection of matching rows (no-collaborator MAGAID on either side), summarising paper counts
no_collab_ids = df_merged_noCollab['MAGAID'].unique()
involves_no_collab = df3['MAGAID'].isin(no_collab_ids) | df3['MatchMAGAID'].isin(no_collab_ids)

df3.loc[involves_no_collab, 'MAGCumPapers'].describe()

count   9800
mean       1
std        0
min        1
25%        1
50%        1
75%        1
max       21
Name: MAGCumPapers, dtype: float64

In [53]:
# Same selection of matching rows (no-collaborator MAGAID on either side), summarising citation counts
no_collab_ids = df_merged_noCollab['MAGAID'].unique()
involves_no_collab = df3['MAGAID'].isin(no_collab_ids) | df3['MatchMAGAID'].isin(no_collab_ids)

df3.loc[involves_no_collab, 'MAGCumCitations'].describe()

count   9800
mean       0
std        1
min        0
25%        0
50%        0
75%        0
max       51
Name: MAGCumCitations, dtype: float64

In [134]:
# Number of distinct MAGAIDs whose match is one of the no-collaborator scientists
match_has_no_collab = df3['MatchMAGAID'].isin(df_merged_noCollab['MAGAID'].unique())
df3.loc[match_has_no_collab, 'MAGAID'].nunique()

96

In [79]:
# Load the filtered sample and the regression table used by the checks below
df_filtered_sample = pd.read_csv("../../data/main/filtered_sample_wo_ageOutliers.csv")
df_regression = pd.read_csv("../../data/h4_altmetric/regression/RW_Authors_forRegression_rematching.csv")

# Distinct (citations, citations year, retraction year) combinations.
# display() is required because Jupyter only renders a cell's last expression;
# without it this frame would be computed and silently discarded.
display(df_filtered_sample[['MAGCumCitations','MAGCumCitationsYear', 'RetractionYear']]
        .drop_duplicates())

# Gap in years between the citation snapshot year and the retraction year
(df_filtered_sample['MAGCumCitationsYear']-df_filtered_sample['RetractionYear']).describe()

count   26787
mean       -1
std         0
min       -15
25%        -1
50%        -1
75%        -1
max        -1
dtype: float64

In [76]:
# Gap in years between the collaborator snapshot year and the retraction year
collaborators_year_gap = df_filtered_sample['MAGCumCollaboratorsYear'].sub(df_filtered_sample['RetractionYear'])
collaborators_year_gap.describe()

count   26220
mean       -2
std         2
min       -42
25%        -2
50%        -1
75%        -1
max        -1
dtype: float64

In [77]:
# Gap in years between the paper-count snapshot year and the retraction year
papers_year_gap = df_filtered_sample['MAGCumPapersYear'].sub(df_filtered_sample['RetractionYear'])
papers_year_gap.describe()



count   30315
mean       -2
std         2
min       -29
25%        -1
50%        -1
75%        -1
max        -1
dtype: float64

In [100]:
# Show every column when displaying the wide merged frame
pd.set_option('display.max_columns', None)

# Compare the collaborator count in the filtered sample with the
# at-retraction count in the regression table, per MAGAID.
sample_collab_counts = df_filtered_sample[['MAGAID','MAGCumCollaborators']].drop_duplicates()
df_temp = sample_collab_counts.merge(df_regression, on='MAGAID')

collab_gap = df_temp['MAGCumCollaborators'] - df_temp['MAGCumCollaboratorsAtRetraction']
df_temp['diff'] = collab_gap.abs()

# Rows where the two counts disagree by more than 6000
df_temp[df_temp['diff'] > 6000]

Unnamed: 0,MAGAID,MAGCumCollaborators,Record ID,MAGPID,RetractionYear,OriginalPaperYear,MAGAffIDRetractedPaper,MAGRetractedPIDYear,MAGAffRankRetractedPaper,MAGFirstName,GenderizeGender,GenderizeConfidence,FinalGender,MAGFirstPubYear,MAGCumPapersYearAtRetractionMinus1,MAGCumPapersAtRetractionMinus1,MAGCumCitationsYearAtRetractionMinus1,MAGCumCitationsAtRetractionMinus1,MAGCumCollaboratorsYearAtRetractionMinus1,MAGCumCollaboratorsAtRetractionMinus1,MAGFirstAffID,MAGFirstYear,MAGFirstAffiliationRank,MAGAIDRankInRetractedPaper,NumAuthorsInRetractedPaper,MAGFirstAuthorFlag,MAGLastAuthorFlag,MAGRetractionYearAffID,MAGRetractionYearAffYear,MAGRetractionYearAffRank,MAGFieldID,MAGFieldIDMaxPercent,MAGFieldName,Field_ART,Field_BIOLOGY,Field_BUSINESS,Field_CHEMISTRY,Field_COMPUTERSCIENCE,Field_ECONOMICS,Field_ENGINEERING,Field_ENVIRONMENTALSCIENCE,Field_GEOGRAPHY,Field_GEOLOGY,Field_HISTORY,Field_MATERIALSSCIENCE,Field_MATHEMATICS,Field_MEDICINE,Field_PHILOSOPHY,Field_PHYSICS,Field_POLITICALSCIENCE,Field_PSYCHOLOGY,Field_SOCIOLOGY,AcademicAgeAtRetraction,EthneaGender,EthneaEthnicity1,EthneaEthnicity2,NamePrismRaceMax,NamePrismRaceProbMax,NamePrismEthnicityMax,NamePrismEthnicityProbMax,MAGLatestActivityYear,YearsActive,AttritedClass,AttritedClassRobust,MAGJournalType,MAGJournalID,MAGJournalName,SJRScoreRetractedPaperYear,SJRQuartileRetractedPaperYear,MAGCumPapersYearAtRetraction,MAGCumPapersAtRetraction,MAGCumPapersYearAtRetractionPlus1,MAGCumPapersAtRetractionPlus1,MAGCumCitationsYearAtRetraction,MAGCumCitationsAtRetraction,MAGCumCitationsYearAtRetractionPlus1,MAGCumCitationsAtRetractionPlus1,LogMAGCumCitationsAtRetraction,LogMAGCumCitationsAtRetractionPlus1,MAGCumCollaboratorsYearAtRetraction,MAGCumCollaboratorsAtRetraction,MAGCumCollaboratorsYearAtRetractionPlus1,MAGCumCollaboratorsAtRetractionPlus1,LogMAGCumCollaboratorsAtRetraction,LogMAGCumCollaboratorsAtRetractionPlus1,MAGRetractionYearAffRankOrdinal,MAGAIDRankTypeInRetractedPaper,BlogMentionsAtRetraction,F1000MentionsAt
Retraction,FacebookMentionsAtRetraction,Google+MentionsAtRetraction,LinkedInMentionsAtRetraction,NewsMentionsAtRetraction,PatentMentionsAtRetraction,Peer reviewMentionsAtRetraction,PinterestMentionsAtRetraction,PolicyMentionsAtRetraction,Q&AMentionsAtRetraction,RedditMentionsAtRetraction,TwitterMentionsAtRetraction,VideoMentionsAtRetraction,WeiboMentionsAtRetraction,WikipediaMentionsAtRetraction,TotalMentionsAtRetraction,AggregateSocialMediaMentionsAtRetraction,AggregateNewsMediaMentionsAtRetraction,AggregateBlogsMentionsAtRetraction,AggregateKnowledgeRepositoriesMentionsAtRetraction,BlogMentionsPreRetraction,F1000MentionsPreRetraction,FacebookMentionsPreRetraction,Google+MentionsPreRetraction,LinkedInMentionsPreRetraction,NewsMentionsPreRetraction,PatentMentionsPreRetraction,Peer reviewMentionsPreRetraction,PinterestMentionsPreRetraction,PolicyMentionsPreRetraction,Q&AMentionsPreRetraction,RedditMentionsPreRetraction,TwitterMentionsPreRetraction,VideoMentionsPreRetraction,WeiboMentionsPreRetraction,WikipediaMentionsPreRetraction,TotalMentionsPreRetraction,AggregateSocialMediaMentionsPreRetraction,AggregateNewsMediaMentionsPreRetraction,AggregateBlogsMentionsPreRetraction,AggregateKnowledgeRepositoriesMentionsPreRetraction,BlogMentionsPostRetraction,F1000MentionsPostRetraction,FacebookMentionsPostRetraction,Google+MentionsPostRetraction,LinkedInMentionsPostRetraction,NewsMentionsPostRetraction,PatentMentionsPostRetraction,Peer 
reviewMentionsPostRetraction,PinterestMentionsPostRetraction,PolicyMentionsPostRetraction,Q&AMentionsPostRetraction,RedditMentionsPostRetraction,TwitterMentionsPostRetraction,VideoMentionsPostRetraction,WeiboMentionsPostRetraction,WikipediaMentionsPostRetraction,TotalMentionsPostRetraction,AggregateSocialMediaMentionsPostRetraction,AggregateNewsMediaMentionsPostRetraction,AggregateBlogsMentionsPostRetraction,AggregateKnowledgeRepositoriesMentionsPostRetraction,AltmetricScoreAtRetraction,AltmetricScorePreRetraction,AltmetricScorePostRetraction,DemiDecade,TotalPostersPerRecord,PercentageResearchers,PercentagePractitioners,PercentagePublic,PercentageJournalists,RetractorMajority,ReasonPropagatedOverallMajority,ReasonPropagatedMajorityOfMajority,diff
14515,2055015582,0,4805,2150942405,2011,2010,2801241159.0,2011,,philip,male,1,male,1970,2010,354,2010,17288,,0,118347636,1971,76,2,13,False,False,2801241159,2011,1001-,86803240,0,biology,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,41,M,ENGLISH,,,,,,2020,9,0,0,journal,60865174,journal of the american college of cardiology,8,1,2011,387,2012,417,2011,19753,2012,22430,10,10,2011,6553,2012,6879,9,9,80,Middle Author,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,0,0,4,0,0,0,8,4,1,0,3,0,0,18,2011-2015,2.0,0.0,0.0,0.0,0.0,,mistake,mistake,6553
14516,2055015582,0,4805,2150942405,2011,2010,2801241159.0,2011,,philip,male,1,male,1970,2010,354,2010,17288,,0,118347636,1971,76,2,13,False,False,129604602,2011,80,86803240,0,biology,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,41,M,ENGLISH,,,,,,2020,9,0,0,journal,60865174,journal of the american college of cardiology,8,1,2011,387,2012,417,2011,19753,2012,22430,10,10,2011,6553,2012,6879,9,9,80,Middle Author,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,0,0,4,0,0,0,8,4,1,0,3,0,0,18,2011-2015,2.0,0.0,0.0,0.0,0.0,,mistake,mistake,6553
20232,2165657978,0,2935,2735158951,2013,2012,,2013,,hubert,male,1,male,1988,2012,265,2012,10898,,0,154526488,1988,1001-,18,20,False,False,178906410,2013,1001-,86803240,0,biology,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,M,FRENCH,,White,0.0,European,1.0,2020,7,0,0,journal,156174673,hepatology,5,1,2013,295,2014,317,2013,12282,2014,13609,9,10,2013,6271,2014,6325,9,9,1500,Middle Author,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,3,3,0,0,0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,4,0,0,1,3,3,1,8,2011-2015,1.0,0.0,0.0,1.0,0.0,,mistake,mistake,6271
20233,2165657978,0,2935,2735158951,2013,2012,,2013,,hubert,male,1,male,1988,2012,265,2012,10898,,0,154526488,1988,1001-,18,20,False,False,154526488,2013,1001-,86803240,0,biology,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,M,FRENCH,,White,0.0,European,1.0,2020,7,0,0,journal,156174673,hepatology,5,1,2013,295,2014,317,2013,12282,2014,13609,9,10,2013,6271,2014,6325,9,9,1500,Middle Author,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,3,3,0,0,0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,4,0,0,1,3,3,1,8,2011-2015,1.0,0.0,0.0,1.0,0.0,,mistake,mistake,6271
20234,2165657978,0,2935,2735158951,2013,2012,,2013,,hubert,male,1,male,1988,2012,265,2012,10898,,0,154526488,1988,1001-,18,20,False,False,203339264,2013,1001-,86803240,0,biology,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,M,FRENCH,,White,0.0,European,1.0,2020,7,0,0,journal,156174673,hepatology,5,1,2013,295,2014,317,2013,12282,2014,13609,9,10,2013,6271,2014,6325,9,9,1500,Middle Author,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,3,3,0,0,0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,4,0,0,1,3,3,1,8,2011-2015,1.0,0.0,0.0,1.0,0.0,,mistake,mistake,6271
20235,2165657978,0,2935,2735158951,2013,2012,,2013,,hubert,male,1,male,1988,2012,265,2012,10898,,0,154526488,1988,1001-,18,20,False,False,48430043,2013,1001-,86803240,0,biology,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,M,FRENCH,,White,0.0,European,1.0,2020,7,0,0,journal,156174673,hepatology,5,1,2013,295,2014,317,2013,12282,2014,13609,9,10,2013,6271,2014,6325,9,9,1500,Middle Author,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,3,3,0,0,0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,4,0,0,1,3,3,1,8,2011-2015,1.0,0.0,0.0,1.0,0.0,,mistake,mistake,6271
20236,2165657978,0,2935,2735158951,2013,2012,,2013,,hubert,male,1,male,1988,2012,265,2012,10898,,0,154526488,1988,1001-,18,20,False,False,1314251682,2013,1001-,86803240,0,biology,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,M,FRENCH,,White,0.0,European,1.0,2020,7,0,0,journal,156174673,hepatology,5,1,2013,295,2014,317,2013,12282,2014,13609,9,10,2013,6271,2014,6325,9,9,1500,Middle Author,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,3,3,0,0,0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,4,0,0,1,3,3,1,8,2011-2015,1.0,0.0,0.0,1.0,0.0,,mistake,mistake,6271
22341,1966000109,0,7262,2046740947,2013,2012,95634034.0,2013,801-900,murat,male,1,male,1999,2012,254,2012,5096,,0,139829854,1999,1001-,1,3,True,False,165779595,2013,41,71924100,0,medicine,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,14,M,TURKISH,,White,1.0,Muslim,1.0,2020,7,0,0,journal,112574528,optics and laser technology,1,2,2013,332,2014,416,2013,7228,2014,10701,9,9,2013,6481,2014,9002,9,9,41,First or Last or Only Author,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011-2015,,,,,,,misconduct,misconduct,6481
22342,1966000109,0,7262,2046740947,2013,2012,95634034.0,2013,801-900,murat,male,1,male,1999,2012,254,2012,5096,,0,139829854,1999,1001-,1,3,True,False,48912391,2013,801-900,71924100,0,medicine,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,14,M,TURKISH,,White,1.0,Muslim,1.0,2020,7,0,0,journal,112574528,optics and laser technology,1,2,2013,332,2014,416,2013,7228,2014,10701,9,9,2013,6481,2014,9002,9,9,41,First or Last or Only Author,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011-2015,,,,,,,misconduct,misconduct,6481
22343,1966000109,0,7262,2046740947,2013,2012,95634034.0,2013,801-900,murat,male,1,male,1999,2012,254,2012,5096,,0,139829854,1999,1001-,1,3,True,False,56590836,2013,73,71924100,0,medicine,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,14,M,TURKISH,,White,1.0,Muslim,1.0,2020,7,0,0,journal,112574528,optics and laser technology,1,2,2013,332,2014,416,2013,7228,2014,10701,9,9,2013,6481,2014,9002,9,9,41,First or Last or Only Author,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011-2015,,,,,,,misconduct,misconduct,6481


In [102]:
# Check that the regression table's MAGAID column casts cleanly to integers
df_regression['MAGAID'].astype(int)


0        2111743875
1        2245002928
2        2120726690
3        2151685716
4        2552715397
            ...    
34729    2323621928
34730     198611037
34731     314964646
34732    2117142933
34733    2169222301
Name: MAGAID, Length: 34734, dtype: int64

In [118]:
# Join the regression table's year columns onto the matching file and compute,
# per row, OriginalPaperYear minus RetractionYear.
regression_years = df_regression[['OriginalPaperYear','MAGAID','RetractionYear']].drop_duplicates()
df4 = regression_years.merge(df3, on='MAGAID')
df4['diff2'] = df4['OriginalPaperYear'].sub(df4['RetractionYear'])

df4[['MAGAID','diff2']].drop_duplicates()


count   11802
mean        2
std         9
min         1
25%         1
50%         1
75%         1
max       265
Name: MAGCumPapers, dtype: float64

In [120]:
# List the columns available in the filtered sample
df_filtered_sample.columns

Index(['Record ID', 'MAGPID', 'RetractionYear', 'OriginalPaperYear',
       'MAGAffID', 'MAGRetractedPIDYear', 'MAGAffiliationRank', 'MAGFirstName',
       'GenderizeGender', 'GenderizeConfidence', 'GenderizeCount',
       'FinalGender', 'MAGFirstPubYear', 'OriginalPaperYearMethod1',
       'OriginalPaperYearMethod2', 'MAGCumPapersYear', 'MAGCumPapers',
       'MAGCumCitationsYear', 'MAGCumCitations', 'MAGCumCollaboratorsYear',
       'MAGCumCollaborators', 'MAGCumCollaborationsYear',
       'MAGCumCollaborations', 'MAGFirstAffID', 'MAGFirstYear',
       'MAGFirstAffiliationRank', 'MAGAIDSequence', 'MAGAIDSequenceMax',
       'MAGFirstAuthorFlag', 'MAGLastAuthorFlag', 'MAGRetractionYearAffID',
       'MAGRetractionYearAffYear', 'MAGRetractionYearAffRank', 'MAGAID',
       'MAGrootFID', 'MAGrootFIDMaxPercent', 'Title', 'MAGJCID', 'Author',
       'MAGTitleMatchedByFuzzy', 'FuzzRatio', 'OriginalPaperDOI',
       'MAGTitleMatchedByDOI', 'NormalizedName'],
      dtype='object')

In [125]:
# Join the filtered sample's year columns onto the matching file and compute,
# per row, OriginalPaperYearMethod2 minus RetractionYear.
df5 = df_filtered_sample[['OriginalPaperYearMethod2','MAGAID','RetractionYear']].drop_duplicates().merge(df3,on='MAGAID')
df5['diff2'] = df5['OriginalPaperYearMethod2'] - df5['RetractionYear']

# display() is required because Jupyter only renders a cell's last expression;
# without it this de-duplicated frame would be computed and silently discarded.
display(df5[['MAGAID','diff2']].drop_duplicates())

# NOTE(review): the summary below is taken over all rows of df5, not over the
# de-duplicated (MAGAID, diff2) pairs shown above -- confirm which is intended.
df5.diff2.describe()


count   32611
mean       -1
std         1
min       -17
25%        -1
50%         0
75%         0
max         0
Name: diff2, dtype: float64

In [129]:
# NOTE(review): `df` is not assigned in any cell visible in this notebook, so this
# presumably relies on leftover kernel state and would fail after
# Restart & Run All -- confirm which frame was meant.
df.columns.tolist()

['MAGAID']

Unnamed: 0,MAGAID,MatchMAGAID,Record ID,MAGCumPapers,MAGCumCitations,MAGCumCollaborators,MatchMAGCumPapers,MatchMAGCumCitations,MatchMAGCumCollaborators,StandardizedMAGCumPapers,StandardizedMatchMAGCumPapers,StandardizedMAGCumCitations,StandardizedMatchMAGCumCitations,StandardizedMAGCumCollaborators,StandardizedMatchMAGCumCollaborators,SqDiffPapers,SqDiffCitations,SqDiffCollaborators,papers_within_x_percent,citations_within_x_percent,collaborators_within_x_percent,AbsDiffPapers,AbsDiffCitations,WEDPapersCitationsCollaborators,MinDistance,AverageMAGCumPapers,AverageMAGCumCitations,AverageMAGCumCollaborators,AverageMatchMAGCumPapers,AverageMatchMAGCumCitations,AverageMatchMAGCumCollaborators


In [138]:
# Number of distinct authors in the filtered sample whose MAGCumPapers is 0
has_zero_papers = df_filtered_sample['MAGCumPapers'] == 0
df_filtered_sample.loc[has_zero_papers, 'MAGAID'].nunique()

1338

In [141]:
# Regression rows for authors that appear as MAGAID in the matching file
appears_in_matching = df_regression['MAGAID'].isin(df3['MAGAID'].unique())
dfmatched_regression = df_regression[appears_in_matching]

# Difference between the collaborator count at retraction and at retraction minus 1
collab_increase = dfmatched_regression['MAGCumCollaboratorsAtRetraction'].sub(
    dfmatched_regression['MAGCumCollaboratorsAtRetractionMinus1'])
collab_increase.describe()


count   6384
mean       8
std       32
min        0
25%        0
50%        3
75%       10
max     1268
dtype: float64

In [150]:
# Filtered-sample rows with MAGCumPapers equal to 0 whose RetractionYear
# equals MAGRetractedPIDYear
has_zero_papers = df_filtered_sample['MAGCumPapers'] == 0
same_year = df_filtered_sample['RetractionYear'] == df_filtered_sample['MAGRetractedPIDYear']

df_filtered_sample[has_zero_papers & same_year]

Unnamed: 0,Record ID,MAGPID,RetractionYear,OriginalPaperYear,MAGAffID,MAGRetractedPIDYear,MAGAffiliationRank,MAGFirstName,GenderizeGender,GenderizeConfidence,GenderizeCount,FinalGender,MAGFirstPubYear,OriginalPaperYearMethod1,OriginalPaperYearMethod2,MAGCumPapersYear,MAGCumPapers,MAGCumCitationsYear,MAGCumCitations,MAGCumCollaboratorsYear,MAGCumCollaborators,MAGCumCollaborationsYear,MAGCumCollaborations,MAGFirstAffID,MAGFirstYear,MAGFirstAffiliationRank,MAGAIDSequence,MAGAIDSequenceMax,MAGFirstAuthorFlag,MAGLastAuthorFlag,MAGRetractionYearAffID,MAGRetractionYearAffYear,MAGRetractionYearAffRank,MAGAID,MAGrootFID,MAGrootFIDMaxPercent,Title,MAGJCID,Author,MAGTitleMatchedByFuzzy,FuzzRatio,OriginalPaperDOI,MAGTitleMatchedByDOI,NormalizedName
7,2604,1842691482,1990,1990,157725225,1990,38,emily,female,1,16379,female,1990,1990,1990,,0,,0,,0,,0,157725225,1990,38,1,3,True,False,157725225,1990,38,2684994055,71924100,0,Dietary protein source and plasma lipid profil...,77494981,Emily Tseng;Susan M Potter;Mary Frances Picciano,dietary protein source and plasma lipid profil...,100,,,emily tseng
8,2604,1842691482,1990,1990,157725225,1990,38,emily,female,1,16379,female,1990,1990,1990,,0,,0,,0,,0,157725225,1990,38,1,3,True,False,157725225,1990,38,2684994055,86803240,0,Dietary protein source and plasma lipid profil...,77494981,Emily Tseng;Susan M Potter;Mary Frances Picciano,dietary protein source and plasma lipid profil...,100,,,emily tseng
9,2604,1842691482,1990,1990,157725225,1990,38,emily,female,1,16379,female,1990,1990,1990,,0,,0,,0,,0,157725225,1990,38,1,3,True,False,157725225,1990,38,2684994055,185592680,0,Dietary protein source and plasma lipid profil...,77494981,Emily Tseng;Susan M Potter;Mary Frances Picciano,dietary protein source and plasma lipid profil...,100,,,emily tseng
27,3788,1945375234,1990,1990,1316535847,1990,,a,male,1,19908,,1990,1990,1990,,0,,0,,0,,0,1316535847,1990,1001-,2,3,False,False,1316535847,1990,1001-,2501524686,71924100,1,Oral clindamycin and ciprofloxacin therapy for...,122731057,G. Paul Sesin;Ann Paszko;Eva O'Keefe,oral clindamycin and ciprofloxacin therapy for...,99,10.1002/j.1875-9114.1990.tb02563.x,,a paszko
28,3788,1945375234,1990,1990,1316535847,1990,,g,male,1,5344,,1990,1990,1990,,0,,0,,0,,0,1316535847,1990,1001-,1,3,True,False,1316535847,1990,1001-,3161216083,71924100,1,Oral clindamycin and ciprofloxacin therapy for...,122731057,G. Paul Sesin;Ann Paszko;Eva O'Keefe,oral clindamycin and ciprofloxacin therapy for...,99,10.1002/j.1875-9114.1990.tb02563.x,,g p sesin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33988,17181,1955909669,2015,2015,1333606569,2015,,emmanuel,male,1,45589,male,2015,2015,2015,,0,,0,,0,,0,1333606569,2015,1001-,1,2,True,False,1333606569,2015,1001-,2535895329,41008148,0,Imputing a randomly censored covariate in a li...,150263959,Emmanuel Sampene;Folefac D Atem,,100,10.1177/0962280215586011,imputing a randomly censored covariate in a li...,emmanuel sampene
33989,17181,1955909669,2015,2015,1333606569,2015,,emmanuel,male,1,45589,male,2015,2015,2015,,0,,0,,0,,0,1333606569,2015,1001-,1,2,True,False,1333606569,2015,1001-,2535895329,162324750,0,Imputing a randomly censored covariate in a li...,150263959,Emmanuel Sampene;Folefac D Atem,,100,10.1177/0962280215586011,imputing a randomly censored covariate in a li...,emmanuel sampene
34499,7063,330028695,2015,2015,1321296531,2015,,geetika,female,1,59,female,2015,2015,2015,,0,,0,,0,,0,1321296531,2015,1001-,1,4,True,False,1321296531,2015,1001-,2128089871,17744445,0,Optimal Cloud Resource Provisioning: A Two-Cri...,106296714,Geetika Mudali;Manas Ranjan Patra;K Hemant Kum...,,90,10.1007/978-3-319-14977-6_39,retracted optimal cloud resource provisioning ...,geetika mudali
34500,7063,330028695,2015,2015,1321296531,2015,,geetika,female,1,59,female,2015,2015,2015,,0,,0,,0,,0,1321296531,2015,1001-,1,4,True,False,1321296531,2015,1001-,2128089871,41008148,0,Optimal Cloud Resource Provisioning: A Two-Cri...,106296714,Geetika Mudali;Manas Ranjan Patra;K Hemant Kum...,,90,10.1007/978-3-319-14977-6_39,retracted optimal cloud resource provisioning ...,geetika mudali
