# Data Dropping & Augmentation: Creating Matching Analytical Samples

In this notebook, we shall augment some necessary columns within our dataframe before we conduct the matching analysis.

**First, we shall drop the unnecessary columns. This step was added after Bedoor and Kinga asked for closest matching with a distance measure (shortly before submission to PNAS).**


Here are the steps we will take:

We shall create the matching analytical samples for paper, citation, and collaborator distance tolerances of (i) 10%, (ii) 20%, and (iii) 30%. To do that, the following steps will be taken:

1. First, we shall load the three matching files for 10%, 20% and 30%. 
2. Then, we shall load the confounders that were matched for treatment and control using the file: **RWMatched_intersection_wPapersCitationsCollaboratorsAtRetraction_wCollabYear_wActivityPostRetraction.csv**
3. Then we shall load the stratification variables: reason, time of retraction, order of author in the retracted paper, type of retraction, author academic age, map author affiliation rank, impute retractor majority to avoid NaNs using the files: **RW_authors_w_confounders_filteredSample_postNHB_BedoorsCorrections_Augmented.csv** and **old_RW_Authors_forRegression_rematching.csv**
4. Finally, we shall compute the outcome variables using the files: **RW_MAGcollaborators_1stDegree_rematching_woPapersCitationsCollaboratorsAtRetraction_wCollabYear_le2020_30perc.csv** (or its `_20perc` / `_10perc` counterpart for the other tolerances) and **To be filled**. The outcome variables are
    1. Number of collaborators retained by authors and their matches
    2. Number of collaborators gained by authors and their matches
    3. Number of triads closed by authors and their matches
    4. Proportion of triads closed by authors and their matches (Newman's Coefficient or NC)


In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Step 1: load the closest-average-match tables for the 10%, 20% and 30% tolerances.

# Folder that holds the processed author-matching files.
indir = "/Users/sm9654/desktop/NYUAD/nyuad-research/retraction_openalex/retraction_effects_on_academic_careers/data/processed/author_matching/"

# Only the treated author, the matched author and the retraction record are kept.
relevant_cols = ['MAGAID', 'MatchMAGAID', 'Record ID',]


def _read_match_table(csv_path):
    """Read one matching CSV restricted to `relevant_cols` and drop exact duplicate rows."""
    match_table = pd.read_csv(csv_path, usecols=relevant_cols)
    return match_table.drop_duplicates()


dfmatched_30perc = _read_match_table(indir+"/closestAverageMatch_tolerance_0.3_w_0.8.csv")
dfmatched_20perc = _read_match_table(indir+"/closestAverageMatch_tolerance_0.2_w_0.8.csv")
dfmatched_10perc = _read_match_table(indir+"/closestAverageMatch_tolerance_0.1_w_0.8.csv")


In [3]:
dfmatched_30perc[dfmatched_30perc.MatchMAGAID.duplicated()]

Unnamed: 0,MAGAID,MatchMAGAID,Record ID
1323,2115123316,2.144725e+09,4442.0
1415,2117168174,2.101954e+09,8450.0
1416,2117168174,2.304848e+09,8450.0
1745,2136126495,2.048001e+09,2132.0
1811,2142423705,2.041562e+09,4911.0
...,...,...,...
3727,2765273410,2.141240e+09,16929.0
3728,2765273410,2.146519e+09,16929.0
3729,2765273410,2.557500e+09,16929.0
3756,2791620886,2.715645e+09,5429.0


In [4]:
dfmatched_30perc['MAGAID'].nunique(), dfmatched_30perc['MatchMAGAID'].nunique(), dfmatched_30perc[['MAGAID','MatchMAGAID']].drop_duplicates().shape[0]

(2348, 3881, 4054)

In [5]:
dfmatched_10perc.columns

Index(['MAGAID', 'MatchMAGAID', 'Record ID'], dtype='object')

In [6]:
dfmatched_10perc.shape, dfmatched_20perc.shape, dfmatched_30perc.shape

((2416, 3), (3397, 3), (4054, 3))

In [7]:
dfmatched_10perc.MAGAID.nunique(), dfmatched_20perc.MAGAID.nunique(), dfmatched_30perc.MAGAID.nunique()

(751, 1700, 2348)

In [8]:
dfmatched_10perc.MatchMAGAID.nunique(), dfmatched_20perc.MatchMAGAID.nunique(), dfmatched_30perc.MatchMAGAID.nunique()


(2262, 3232, 3881)

In [9]:
# Step 2: read the table of confounders on which treated authors and their matches were paired.
confounders_csv = indir+"RWMatched_intersection_wPapersCitationsCollaboratorsAtRetraction_wCollabYear_wActivityPostRetraction.csv"

# NOTE(review): the saved output of this cell echoes the read_csv line, which looks
# like a pandas warning emitted while parsing (possibly mixed column dtypes) —
# confirm, and pin the affected dtypes if so.
df_confounders = pd.read_csv(confounders_csv)

df_confounders.columns

  df_confounders = pd.read_csv(indir+"RWMatched_intersection_wPapersCitationsCollaboratorsAtRetraction_wCollabYear_wActivityPostRetraction.csv")


Index(['MAGAID', 'MatchMAGAID', 'Record ID', 'MAGPID', 'RetractionYear',
       'MAGRetractionYearAffID', 'MAGRetractionYearAffRank',
       'MAGRetractionYearAffYear', 'MatchMAGRetractionYearAffID',
       'MatchMAGRetractionYearAffYear', 'MatchMAGRetractionYearAffRank',
       'MatchMAGMaxRetractionYear', 'MAGrootFID', 'MAGrootFIDMaxPercent',
       'MatchMAGrootFID', 'MatchMAGrootFIDMaxPercent', 'GenderizeGender',
       'MAGFirstPubYear', 'MAGFirstAffID', 'MAGFirstAffiliationRank',
       'MatchMAGFirstAffID', 'MatchMAGFirstAffYear',
       'MatchMAGFirstAffiliationRank', 'MAGCumPapersAtRetraction',
       'MAGCumPapersYearAtRetraction', 'MatchMAGCumPapersYearAtRetraction',
       'MatchMAGCumPapersAtRetraction', 'MAGCumCitationsAtRetraction',
       'MAGCumCitationsYearAtRetraction',
       'MatchMAGCumCitationsYearAtRetraction',
       'MatchMAGCumCitationsAtRetraction', 'MAGCumCollaboratorsAtRetraction',
       'MAGCumCollaboratorsYearAtRetraction',
       'MatchMAGCumCollaborat

In [10]:
def merge_confounders(dfi):
    """Attach the confounder columns to a table of (author, match, record) rows.

    Inner-joins `dfi` with the module-level `df_confounders` on
    'MAGAID', 'MatchMAGAID' and 'Record ID'.

    Parameters
    ----------
    dfi : DataFrame with the three key columns.

    Returns
    -------
    DataFrame: the joined rows with exact duplicate rows removed.

    Raises
    ------
    AssertionError: if the join changes the number of distinct MAGAID or
    MatchMAGAID values compared with `dfi`.
    """
    pair_keys = ['MAGAID', 'MatchMAGAID', 'Record ID']
    joined = pd.merge(dfi, df_confounders, on=pair_keys)

    # The join must not lose (or invent) any treated author or any match.
    for id_col in ('MAGAID', 'MatchMAGAID'):
        assert joined[id_col].nunique() == dfi[id_col].nunique()

    return joined.drop_duplicates()


# Add the confounders to each tolerance sample (10%, 20%, 30%, in that order).
(dfmatched_10_perc_wConfounders,
 dfmatched_20_perc_wConfounders,
 dfmatched_30_perc_wConfounders) = [
    merge_confounders(match_table)
    for match_table in (dfmatched_10perc, dfmatched_20perc, dfmatched_30perc)
]



In [11]:
print(dfmatched_10_perc_wConfounders.MAGAID.nunique(), 
      dfmatched_20_perc_wConfounders.MAGAID.nunique(), 
      dfmatched_30_perc_wConfounders.MAGAID.nunique(),
      dfmatched_10_perc_wConfounders.MatchMAGAID.nunique(), 
      dfmatched_20_perc_wConfounders.MatchMAGAID.nunique(), 
      dfmatched_30_perc_wConfounders.MatchMAGAID.nunique())


751 1700 2348 2262 3232 3881


In [12]:
# Stratification variables, part 1: retractor majority per (author, retraction record).
retractor_majority = pd.read_csv(
    indir+"/old_RW_Authors_forRegression_rematching.csv",
    usecols=['MAGAID','Record ID','RetractorMajority'],
).drop_duplicates()


indir2 = "/Users/sm9654/desktop/NYUAD/nyuad-research/retraction_openalex/retraction_effects_on_academic_careers/data/processed/"

# Stratification variables, part 2: author order, retraction reason and demi-decade.
df_filtered_sample = pd.read_csv(
    indir2+"RW_authors_w_confounders_filteredSample_postNHB_BedoorsCorrections_Augmented.csv",
    usecols=['MAGAID', 'Record ID', 'MAGAuthorOrderCategorical',
             'ReasonPropagatedMajorityOfMajority', 'DemiDecade'],
).drop_duplicates()

column_renames = {'DemiDecade':'DemiDecadeOfRetraction',
                  'MAGAuthorOrderCategorical':'MAGAIDFirstORLastAuthorFlag'}
author_order_labels = {'First or Last or Only Author':'MAGFirstOrLastAuthor',
                       'Middle Author':'MAGMiddleAuthor'}

# Left join: every filtered-sample row is kept; RetractorMajority is NaN where
# the regression file has no entry for that (MAGAID, Record ID).
# NOTE(review): the displayed result contains several rows for the same
# (MAGAID, Record ID) with different reasons, so later merges on these keys
# can multiply rows — confirm this is intended.
df_regression_sample = (
    df_filtered_sample
    .merge(retractor_majority, on=['MAGAID','Record ID'], how='left')
    .rename(columns=column_renames)
    .replace(author_order_labels)
)
df_regression_sample

Unnamed: 0,MAGAID,Record ID,ReasonPropagatedMajorityOfMajority,DemiDecadeOfRetraction,MAGAIDFirstORLastAuthorFlag,RetractorMajority
0,2127983451,2343,mistake,1996-2000,MAGFirstOrLastAuthor,
1,1986180616,3294,misconduct,2001-2005,MAGFirstOrLastAuthor,
2,2134970185,3489,mistake,1996-2000,MAGMiddleAuthor,
3,2600580187,3631,mistake,1996-2000,MAGFirstOrLastAuthor,author
4,257122240,2202,misconduct,1990-1995,MAGFirstOrLastAuthor,author
...,...,...,...,...,...,...
16052,2689068837,3021,other,1990-1995,MAGFirstOrLastAuthor,
16053,2689068837,3020,,1990-1995,MAGFirstOrLastAuthor,
16054,2689068837,3020,other,1990-1995,MAGFirstOrLastAuthor,
16055,2651016970,899,misconduct,2006-2010,MAGFirstOrLastAuthor,


In [13]:
df_regression_sample[~df_regression_sample['RetractorMajority'].isna()]['MAGAID'].nunique()

4978

In [14]:
def merge_strataVars(dfi):
    """Attach the stratification variables to a matched sample.

    Inner-joins `dfi` with the module-level `df_regression_sample` on
    'MAGAID' and 'Record ID'.

    Parameters
    ----------
    dfi : DataFrame with 'MAGAID' and 'Record ID' columns.

    Returns
    -------
    DataFrame: the joined rows with exact duplicate rows removed.

    Raises
    ------
    AssertionError: if the join changes the number of distinct MAGAID values.
    """
    author_record_keys = ['MAGAID', 'Record ID']
    joined = pd.merge(dfi, df_regression_sample, on=author_record_keys)

    # Only the treated-author count is checked here.
    n_authors_expected = dfi['MAGAID'].nunique()
    assert joined['MAGAID'].nunique() == n_authors_expected

    return joined.drop_duplicates()

# Add the stratification variables to each tolerance sample (10%, 20%, 30%).
(dfmatched_10_perc_wStrataVars,
 dfmatched_20_perc_wStrataVars,
 dfmatched_30_perc_wStrataVars) = [
    merge_strataVars(sample_with_confounders)
    for sample_with_confounders in (dfmatched_10_perc_wConfounders,
                                    dfmatched_20_perc_wConfounders,
                                    dfmatched_30_perc_wConfounders)
]

dfmatched_30_perc_wStrataVars

Unnamed: 0,MAGAID,MatchMAGAID,Record ID,MAGPID,RetractionYear,MAGRetractionYearAffID,MAGRetractionYearAffRank,MAGRetractionYearAffYear,MatchMAGRetractionYearAffID,MatchMAGRetractionYearAffYear,...,MatchMAGCumCitationsAtRetraction,MAGCumCollaboratorsAtRetraction,MAGCumCollaboratorsYearAtRetraction,MatchMAGCumCollaboratorsYearAtRetraction,MatchMAGCumCollaboratorsAtRetraction,MAGRetractionYearAffRankOrdinal,ReasonPropagatedMajorityOfMajority,DemiDecadeOfRetraction,MAGAIDFirstORLastAuthorFlag,RetractorMajority
0,2184860,2.628313e+09,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,881766915.0,2008.0,...,6.0,22.0,2008.0,2008.0,23.0,175.0,other,2006-2010,MAGMiddleAuthor,
1,2184860,2.136872e+09,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,205349734.0,2008.0,...,6.0,22.0,2008.0,2008.0,21.0,175.0,other,2006-2010,MAGMiddleAuthor,
2,8197726,1.574644e+09,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185443292.0,2011.0,...,18.0,16.0,2012.0,2011.0,15.0,450.0,mistake,2011-2015,MAGMiddleAuthor,author
3,8197726,1.574644e+09,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185443292.0,2011.0,...,18.0,16.0,2012.0,2011.0,15.0,450.0,mistake,2011-2015,MAGMiddleAuthor,author
4,9474215,2.169122e+09,7285.0,1.985944e+09,2015.0,79576946.0,17,2003.0,79576946.0,1999.0,...,5692.0,284.0,2015.0,2015.0,207.0,17.0,mistake,2011-2015,MAGMiddleAuthor,author
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5413,3174447547,1.897294e+09,23452.0,2.166758e+09,2008.0,99065089.0,43,2008.0,86519309.0,2007.0,...,1.0,6.0,2008.0,2007.0,6.0,43.0,other,2006-2010,MAGFirstOrLastAuthor,other
5414,3174844467,2.315520e+09,17239.0,2.102017e+09,2014.0,865915315.0,101-150,2013.0,887064364.0,2014.0,...,18.0,11.0,2013.0,2014.0,10.0,125.0,misconduct,2011-2015,MAGMiddleAuthor,
5415,3175435814,2.137476e+09,18203.0,2.519517e+09,2015.0,159247623.0,801-900,2015.0,36243813.0,2009.0,...,13.0,28.0,2015.0,2011.0,27.0,850.0,plagiarism,2011-2015,MAGFirstOrLastAuthor,
5416,3176125681,2.168565e+09,4333.0,1.789963e+09,2004.0,125602781.0,601-700,2000.0,317356780.0,2001.0,...,197.0,31.0,2004.0,2001.0,29.0,650.0,mistake,2001-2005,MAGMiddleAuthor,


In [15]:
dfmatched_30_perc_wStrataVars.ReasonPropagatedMajorityOfMajority.value_counts()

ReasonPropagatedMajorityOfMajority
misconduct    1776
plagiarism    1614
mistake       1180
other          848
Name: count, dtype: int64

In [16]:
print(dfmatched_10_perc_wStrataVars.MAGAID.nunique(), 
      dfmatched_20_perc_wStrataVars.MAGAID.nunique(), 
      dfmatched_30_perc_wStrataVars.MAGAID.nunique(),
      dfmatched_10_perc_wStrataVars.MatchMAGAID.nunique(), 
      dfmatched_20_perc_wStrataVars.MatchMAGAID.nunique(), 
      dfmatched_30_perc_wStrataVars.MatchMAGAID.nunique())

751 1700 2348 2262 3232 3881


### Academic age

In [17]:
def compute_activityBin(row):
    """Bin an author's academic age before retraction into a categorical label.

    Parameters
    ----------
    row : object exposing an `AcademicAgeBeforeRetraction` attribute
        (e.g. a DataFrame row passed by `DataFrame.apply(..., axis=1)`).

    Returns
    -------
    "1" for ages <= 1, "2" for ages in (1, 2], "3-5" for ages in (2, 5],
    ">5" for larger ages, and NaN when the age itself is missing.
    """
    age = row.AcademicAgeBeforeRetraction

    # A missing age fails every `<=` comparison below and would otherwise be
    # labelled ">5"; keep it missing instead of inventing a bin.
    if pd.isna(age):
        return np.nan

    if age <= 1:
        return "1"
    if age <= 2:
        return "2"
    if age <= 5:
        return "3-5"
    return ">5"

def augment_age(dfi):
    """Add academic-age columns to `dfi` and return it.

    Writes two columns onto the frame that is passed in:
    'AcademicAgeBeforeRetraction' (RetractionYear minus MAGFirstPubYear) and
    'AcademicAgeBin' (the label `compute_activityBin` gives each row).

    Parameters
    ----------
    dfi : DataFrame with 'RetractionYear' and 'MAGFirstPubYear' columns.

    Returns
    -------
    The same DataFrame object, now carrying the two extra columns.
    """
    years_since_first_pub = dfi['RetractionYear'] - dfi['MAGFirstPubYear']
    dfi['AcademicAgeBeforeRetraction'] = years_since_first_pub
    # The age column must exist before binning: the binning helper reads it from each row.
    dfi['AcademicAgeBin'] = dfi.apply(compute_activityBin, axis=1)
    return dfi

# Add academic age and its bin to each tolerance sample (10%, 20%, 30%).
(dfmatched_10_perc_wStrataVars,
 dfmatched_20_perc_wStrataVars,
 dfmatched_30_perc_wStrataVars) = [
    augment_age(sample)
    for sample in (dfmatched_10_perc_wStrataVars,
                   dfmatched_20_perc_wStrataVars,
                   dfmatched_30_perc_wStrataVars)
]


dfmatched_10_perc_wStrataVars

Unnamed: 0,MAGAID,MatchMAGAID,Record ID,MAGPID,RetractionYear,MAGRetractionYearAffID,MAGRetractionYearAffRank,MAGRetractionYearAffYear,MatchMAGRetractionYearAffID,MatchMAGRetractionYearAffYear,...,MAGCumCollaboratorsYearAtRetraction,MatchMAGCumCollaboratorsYearAtRetraction,MatchMAGCumCollaboratorsAtRetraction,MAGRetractionYearAffRankOrdinal,ReasonPropagatedMajorityOfMajority,DemiDecadeOfRetraction,MAGAIDFirstORLastAuthorFlag,RetractorMajority,AcademicAgeBeforeRetraction,AcademicAgeBin
0,2184860,2.628313e+09,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,881766915.0,2008.0,...,2008.0,2008.0,23.0,175.0,other,2006-2010,MAGMiddleAuthor,,2.0,2
1,2184860,2.136872e+09,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,205349734.0,2008.0,...,2008.0,2008.0,21.0,175.0,other,2006-2010,MAGMiddleAuthor,,2.0,2
2,8197726,1.574644e+09,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185443292.0,2011.0,...,2012.0,2011.0,15.0,450.0,mistake,2011-2015,MAGMiddleAuthor,author,5.0,3-5
3,8197726,1.574644e+09,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185443292.0,2011.0,...,2012.0,2011.0,15.0,450.0,mistake,2011-2015,MAGMiddleAuthor,author,5.0,3-5
4,47570122,2.063571e+09,1202.0,1.975776e+09,2015.0,126744593.0,201-300,2015.0,100532134.0,2015.0,...,2015.0,2015.0,38.0,250.0,misconduct,2011-2015,MAGMiddleAuthor,author,5.0,3-5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3617,3169533057,2.413698e+09,17031.0,1.140967e+08,2013.0,68947357.0,101-150,2013.0,24943067.0,2013.0,...,2013.0,2013.0,14.0,125.0,misconduct,2011-2015,MAGFirstOrLastAuthor,,4.0,3-5
3618,3173543754,1.840802e+09,9548.0,2.045218e+09,2011.0,193775966.0,201-300,2011.0,45084792.0,2010.0,...,2011.0,2010.0,35.0,250.0,misconduct,2011-2015,MAGFirstOrLastAuthor,journal,6.0,>5
3619,3174447547,1.897294e+09,23452.0,2.166758e+09,2008.0,99065089.0,43,2008.0,86519309.0,2007.0,...,2008.0,2007.0,6.0,43.0,other,2006-2010,MAGFirstOrLastAuthor,other,2.0,2
3620,3176125681,2.168565e+09,4333.0,1.789963e+09,2004.0,125602781.0,601-700,2000.0,317356780.0,2001.0,...,2004.0,2001.0,29.0,650.0,mistake,2001-2005,MAGMiddleAuthor,,8.0,>5


In [18]:
dfmatched_10_perc_wStrataVars

Unnamed: 0,MAGAID,MatchMAGAID,Record ID,MAGPID,RetractionYear,MAGRetractionYearAffID,MAGRetractionYearAffRank,MAGRetractionYearAffYear,MatchMAGRetractionYearAffID,MatchMAGRetractionYearAffYear,...,MAGCumCollaboratorsYearAtRetraction,MatchMAGCumCollaboratorsYearAtRetraction,MatchMAGCumCollaboratorsAtRetraction,MAGRetractionYearAffRankOrdinal,ReasonPropagatedMajorityOfMajority,DemiDecadeOfRetraction,MAGAIDFirstORLastAuthorFlag,RetractorMajority,AcademicAgeBeforeRetraction,AcademicAgeBin
0,2184860,2.628313e+09,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,881766915.0,2008.0,...,2008.0,2008.0,23.0,175.0,other,2006-2010,MAGMiddleAuthor,,2.0,2
1,2184860,2.136872e+09,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,205349734.0,2008.0,...,2008.0,2008.0,21.0,175.0,other,2006-2010,MAGMiddleAuthor,,2.0,2
2,8197726,1.574644e+09,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185443292.0,2011.0,...,2012.0,2011.0,15.0,450.0,mistake,2011-2015,MAGMiddleAuthor,author,5.0,3-5
3,8197726,1.574644e+09,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185443292.0,2011.0,...,2012.0,2011.0,15.0,450.0,mistake,2011-2015,MAGMiddleAuthor,author,5.0,3-5
4,47570122,2.063571e+09,1202.0,1.975776e+09,2015.0,126744593.0,201-300,2015.0,100532134.0,2015.0,...,2015.0,2015.0,38.0,250.0,misconduct,2011-2015,MAGMiddleAuthor,author,5.0,3-5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3617,3169533057,2.413698e+09,17031.0,1.140967e+08,2013.0,68947357.0,101-150,2013.0,24943067.0,2013.0,...,2013.0,2013.0,14.0,125.0,misconduct,2011-2015,MAGFirstOrLastAuthor,,4.0,3-5
3618,3173543754,1.840802e+09,9548.0,2.045218e+09,2011.0,193775966.0,201-300,2011.0,45084792.0,2010.0,...,2011.0,2010.0,35.0,250.0,misconduct,2011-2015,MAGFirstOrLastAuthor,journal,6.0,>5
3619,3174447547,1.897294e+09,23452.0,2.166758e+09,2008.0,99065089.0,43,2008.0,86519309.0,2007.0,...,2008.0,2007.0,6.0,43.0,other,2006-2010,MAGFirstOrLastAuthor,other,2.0,2
3620,3176125681,2.168565e+09,4333.0,1.789963e+09,2004.0,125602781.0,601-700,2000.0,317356780.0,2001.0,...,2004.0,2001.0,29.0,650.0,mistake,2001-2005,MAGMiddleAuthor,,8.0,>5


## Augmenting Field Name

In [19]:
# Lookup table from root field id to field name, renamed to this notebook's column names.
field_column_names = {'root_FieldID':'MAGrootFID',
                      'FieldName': 'MAGFieldName'}
df_fieldnames = pd.read_csv(indir+"RootFieldsNames.txt").rename(columns=field_column_names)

# Inner join on the root field id for each tolerance sample (10%, 20%, 30%).
(dfmatched_10_perc_wStrataVars,
 dfmatched_20_perc_wStrataVars,
 dfmatched_30_perc_wStrataVars) = [
    sample.merge(df_fieldnames, on='MAGrootFID')
    for sample in (dfmatched_10_perc_wStrataVars,
                   dfmatched_20_perc_wStrataVars,
                   dfmatched_30_perc_wStrataVars)
]

## Mapping Field Names to STEM and non-STEM

In [20]:
# Fields with < 5% of the sample are collapsed into 'other STEM fields' / 'non-STEM fields'.
other_stem_fields = ['materials science', 'computer science',
                'engineering', 'mathematics', 'psychology',
                'economics', 'environmental science']

non_stem_fields = ['psychology','political science', 'geology',
                  'philosophy','geography','sociology','business',
                  'history','art']

# One lookup for both buckets. The STEM entries are merged last so that a name
# present in both lists resolves to 'other STEM fields', exactly as when the
# STEM replacement is applied before the non-STEM one.
# NOTE(review): 'psychology' appears in both lists and therefore always ends up
# as 'other STEM fields' — confirm which bucket is intended.
field_buckets = {**dict.fromkeys(non_stem_fields, 'non-STEM fields'),
                 **dict.fromkeys(other_stem_fields, 'other STEM fields')}

for sample in (dfmatched_10_perc_wStrataVars,
               dfmatched_20_perc_wStrataVars,
               dfmatched_30_perc_wStrataVars):
    sample['MAGFieldName'] = sample['MAGFieldName'].replace(field_buckets)


## Mapping Retractor Majority NaNs to Other

In [21]:
def impute_retractor_majority_NaNs(dfj):
    """Replace missing and 'other' retractor labels with 'other retractor'.

    Overwrites the 'RetractorMajority' column of the frame that is passed in:
    NaN values and the exact value 'other' both become 'other retractor';
    every other value is left as is.

    Parameters
    ----------
    dfj : DataFrame with a 'RetractorMajority' column.

    Returns
    -------
    The same DataFrame object with the column rewritten.
    """
    without_gaps = dfj['RetractorMajority'].fillna('other retractor')
    dfj['RetractorMajority'] = without_gaps.replace('other', 'other retractor')
    return dfj
    
    
# Impute the retractor majority in each tolerance sample (10%, 20%, 30%).
(dfmatched_10_perc_wStrataVars,
 dfmatched_20_perc_wStrataVars,
 dfmatched_30_perc_wStrataVars) = [
    impute_retractor_majority_NaNs(sample)
    for sample in (dfmatched_10_perc_wStrataVars,
                   dfmatched_20_perc_wStrataVars,
                   dfmatched_30_perc_wStrataVars)
]

In [22]:
print(dfmatched_10_perc_wStrataVars.MAGAID.nunique(), 
      dfmatched_20_perc_wStrataVars.MAGAID.nunique(), 
      dfmatched_30_perc_wStrataVars.MAGAID.nunique(),
      dfmatched_10_perc_wStrataVars.MatchMAGAID.nunique(), 
      dfmatched_20_perc_wStrataVars.MatchMAGAID.nunique(), 
      dfmatched_30_perc_wStrataVars.MatchMAGAID.nunique())

751 1700 2348 2262 3232 3881


## Mapping Affiliation Ranks

In [23]:
def map_affiliation_ranks(dfj, col):
    """Collapse fine-grained affiliation ranks into four strata.

    Adds the column `col + 'Stratified'` to the frame that is passed in:
    rank bands '101-150' ... '401-500' become '101-500', '501-600' ...
    '901-1000' become '501-1000', '1001-' becomes '>1000', and individual
    numeric ranks from 1 to 100 become '1-100'.

    Missing ranks and values that are neither a known band nor a number in
    1..100 are left as NaN rather than being placed in the top stratum.

    Parameters
    ----------
    dfj : DataFrame holding the rank column.
    col : str, name of the rank column to stratify.

    Returns
    -------
    The same DataFrame object with the new column added.
    """
    mapping = {'101-150':'101-500',
              '151-200':'101-500',
              '201-300':'101-500',
              '301-400':'101-500',
              '401-500':'101-500',
              '501-600':'501-1000',
              '601-700':'501-1000',
              '701-800':'501-1000',
              '801-900':'501-1000',
              '901-1000':'501-1000',
              '1001-':'>1000',}

    ranks = dfj[col]
    # object dtype so that string labels can be written even if nothing matched a band
    stratified = ranks.map(mapping).astype(object)

    # Only a genuine individual rank between 1 and 100 belongs to the top
    # stratum; a blanket fillna would also sweep in NaN / unrecognised values.
    numeric_rank = pd.to_numeric(ranks, errors='coerce')
    is_top_100 = stratified.isna() & numeric_rank.between(1, 100)

    dfj[col+'Stratified'] = stratified.mask(is_top_100, '1-100')

    return dfj


# Stratify the retraction-year affiliation rank in each tolerance sample (10%, 20%, 30%).
(dfmatched_10_perc_wStrataVars,
 dfmatched_20_perc_wStrataVars,
 dfmatched_30_perc_wStrataVars) = [
    map_affiliation_ranks(sample, 'MAGRetractionYearAffRank')
    for sample in (dfmatched_10_perc_wStrataVars,
                   dfmatched_20_perc_wStrataVars,
                   dfmatched_30_perc_wStrataVars)
]
dfmatched_30_perc_wStrataVars

Unnamed: 0,MAGAID,MatchMAGAID,Record ID,MAGPID,RetractionYear,MAGRetractionYearAffID,MAGRetractionYearAffRank,MAGRetractionYearAffYear,MatchMAGRetractionYearAffID,MatchMAGRetractionYearAffYear,...,MatchMAGCumCollaboratorsAtRetraction,MAGRetractionYearAffRankOrdinal,ReasonPropagatedMajorityOfMajority,DemiDecadeOfRetraction,MAGAIDFirstORLastAuthorFlag,RetractorMajority,AcademicAgeBeforeRetraction,AcademicAgeBin,MAGFieldName,MAGRetractionYearAffRankStratified
0,2184860,2.628313e+09,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,881766915.0,2008.0,...,23.0,175.0,other,2006-2010,MAGMiddleAuthor,other retractor,2.0,2,biology,101-500
1,2184860,2.136872e+09,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,205349734.0,2008.0,...,21.0,175.0,other,2006-2010,MAGMiddleAuthor,other retractor,2.0,2,biology,101-500
2,8197726,1.574644e+09,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185443292.0,2011.0,...,15.0,450.0,mistake,2011-2015,MAGMiddleAuthor,author,5.0,3-5,biology,101-500
3,9474215,2.169122e+09,7285.0,1.985944e+09,2015.0,79576946.0,17,2003.0,79576946.0,1999.0,...,207.0,17.0,mistake,2011-2015,MAGMiddleAuthor,author,22.0,>5,biology,1-100
4,13737004,2.311431e+09,3344.0,2.035632e+09,2014.0,186903577.0,701-800,2013.0,22248866.0,2011.0,...,24.0,750.0,mistake,2011-2015,MAGMiddleAuthor,other retractor,10.0,>5,biology,501-1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5413,2482103685,2.161710e+09,18635.0,2.081956e+09,2012.0,191208505.0,201-300,2012.0,8961855.0,2012.0,...,10.0,250.0,other,2011-2015,MAGMiddleAuthor,other retractor,1.0,1,non-STEM fields,101-500
5414,2482103685,2.483120e+09,18635.0,2.081956e+09,2012.0,191208505.0,201-300,2012.0,75027704.0,2012.0,...,10.0,250.0,other,2011-2015,MAGMiddleAuthor,other retractor,1.0,1,non-STEM fields,101-500
5415,2501052294,2.149938e+09,18635.0,2.081956e+09,2012.0,191208505.0,201-300,2012.0,162868743.0,2011.0,...,10.0,250.0,other,2011-2015,MAGMiddleAuthor,other retractor,1.0,1,non-STEM fields,101-500
5416,2714015866,2.310975e+09,7423.0,2.147214e+09,2008.0,129774422.0,151-200,2008.0,108290504.0,2008.0,...,4.0,175.0,plagiarism,2006-2010,MAGMiddleAuthor,other retractor,0.0,1,non-STEM fields,101-500


In [24]:
dfmatched_30_perc_wStrataVars[['MAGRetractionYearAffRank','MAGRetractionYearAffRankStratified']].head(30)

Unnamed: 0,MAGRetractionYearAffRank,MAGRetractionYearAffRankStratified
0,151-200,101-500
1,151-200,101-500
2,401-500,101-500
3,17,1-100
4,701-800,501-1000
5,151-200,101-500
6,201-300,101-500
7,301-400,101-500
8,301-400,101-500
9,601-700,501-1000


## Augmenting outcome variables

In [25]:
dfmatched_10_perc_wStrataVars.columns

Index(['MAGAID', 'MatchMAGAID', 'Record ID', 'MAGPID', 'RetractionYear',
       'MAGRetractionYearAffID', 'MAGRetractionYearAffRank',
       'MAGRetractionYearAffYear', 'MatchMAGRetractionYearAffID',
       'MatchMAGRetractionYearAffYear', 'MatchMAGRetractionYearAffRank',
       'MatchMAGMaxRetractionYear', 'MAGrootFID', 'MAGrootFIDMaxPercent',
       'MatchMAGrootFID', 'MatchMAGrootFIDMaxPercent', 'GenderizeGender',
       'MAGFirstPubYear', 'MAGFirstAffID', 'MAGFirstAffiliationRank',
       'MatchMAGFirstAffID', 'MatchMAGFirstAffYear',
       'MatchMAGFirstAffiliationRank', 'MAGCumPapersAtRetraction',
       'MAGCumPapersYearAtRetraction', 'MatchMAGCumPapersYearAtRetraction',
       'MatchMAGCumPapersAtRetraction', 'MAGCumCitationsAtRetraction',
       'MAGCumCitationsYearAtRetraction',
       'MatchMAGCumCitationsYearAtRetraction',
       'MatchMAGCumCitationsAtRetraction', 'MAGCumCollaboratorsAtRetraction',
       'MAGCumCollaboratorsYearAtRetraction',
       'MatchMAGCumCollaborat

# Breaking point

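At this point we need to run the code for the 10%, 20%, and 30% samples separately. The 30% sample is the default; the `perc20` / `perc10` flags below switch to the 20% or 10% sample.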

In [109]:
# Creating treatment and control from the 30% sample (the default).

# Columns that describe the matched (control) author; the treatment table drops them.
match_only_columns = [
    'MatchMAGAID',
    'MatchMAGRetractionYearAffID',
    'MatchMAGRetractionYearAffYear',
    'MatchMAGRetractionYearAffRank',
    'MatchMAGMaxRetractionYear',
    'MatchMAGrootFID',
    'MatchMAGrootFIDMaxPercent',
    'MatchMAGCumPapersYearAtRetraction',
    'MatchMAGCumPapersAtRetraction',
    'MatchMAGCumCitationsYearAtRetraction',
    'MatchMAGCumCitationsAtRetraction',
    'MatchMAGCumCollaboratorsYearAtRetraction',
    'MatchMAGCumCollaboratorsAtRetraction',
    'MatchMAGFirstAffID',
    'MatchMAGFirstAffYear',
    'MatchMAGFirstAffiliationRank',
]

# Treatment: the author-side columns only, de-duplicated after the match columns are removed.
treated_side = dfmatched_30_perc_wStrataVars.drop(columns=match_only_columns)
df_treatment = treated_side.drop_duplicates()

# Control: an independent copy of the full sample (author and match columns).
df_control = dfmatched_30_perc_wStrataVars.copy()

In [110]:
# Switches for the other tolerances: leave both False to keep the 30% sample
# built in the previous cell; set one to True to rebuild from the 20% / 10% sample.
# NOTE(review): the counts saved under the next cell (751 authors, 2262 matches)
# equal the 10% sample's counts, so the stored outputs appear to come from a
# run with perc10 = True — confirm before relying on them.
perc20 = False
perc10 = False

if perc20 or perc10:
    # With both flags set the 10% sample wins, as it does when the two
    # overrides are applied one after the other.
    selected_sample = dfmatched_10_perc_wStrataVars if perc10 else dfmatched_20_perc_wStrataVars

    # Columns that describe the matched (control) author; the treatment table drops them.
    selected_match_columns = [
        'MatchMAGAID',
        'MatchMAGRetractionYearAffID',
        'MatchMAGRetractionYearAffYear',
        'MatchMAGRetractionYearAffRank',
        'MatchMAGMaxRetractionYear',
        'MatchMAGrootFID',
        'MatchMAGrootFIDMaxPercent',
        'MatchMAGCumPapersYearAtRetraction',
        'MatchMAGCumPapersAtRetraction',
        'MatchMAGCumCitationsYearAtRetraction',
        'MatchMAGCumCitationsAtRetraction',
        'MatchMAGCumCollaboratorsYearAtRetraction',
        'MatchMAGCumCollaboratorsAtRetraction',
        'MatchMAGFirstAffID',
        'MatchMAGFirstAffYear',
        'MatchMAGFirstAffiliationRank',
    ]

    df_treatment = selected_sample.drop(columns=selected_match_columns).drop_duplicates()
    df_control = selected_sample.copy()



In [111]:
print(df_treatment.MAGAID.nunique(),
      df_control.MAGAID.nunique(),
      df_control.MatchMAGAID.nunique())

751 751 2262


In [112]:
df_treatment.columns

Index(['MAGAID', 'Record ID', 'MAGPID', 'RetractionYear',
       'MAGRetractionYearAffID', 'MAGRetractionYearAffRank',
       'MAGRetractionYearAffYear', 'MAGrootFID', 'MAGrootFIDMaxPercent',
       'GenderizeGender', 'MAGFirstPubYear', 'MAGFirstAffID',
       'MAGFirstAffiliationRank', 'MAGCumPapersAtRetraction',
       'MAGCumPapersYearAtRetraction', 'MAGCumCitationsAtRetraction',
       'MAGCumCitationsYearAtRetraction', 'MAGCumCollaboratorsAtRetraction',
       'MAGCumCollaboratorsYearAtRetraction',
       'MAGRetractionYearAffRankOrdinal', 'ReasonPropagatedMajorityOfMajority',
       'DemiDecadeOfRetraction', 'MAGAIDFirstORLastAuthorFlag',
       'RetractorMajority', 'AcademicAgeBeforeRetraction', 'AcademicAgeBin',
       'MAGFieldName', 'MAGRetractionYearAffRankStratified'],
      dtype='object')

In [113]:
# Reading the collaborators file
# Choose the file for the active tolerance first so that only one CSV is parsed
# (previously the 30% file was always read and then thrown away when a flag was set).
# perc10 takes precedence over perc20; the 30% file is the default.
if perc10:
    collaborators_suffix = "10perc"
elif perc20:
    collaborators_suffix = "20perc"
else:
    collaborators_suffix = "30perc"

df_1d_collaborators = pd.read_csv(indir+"RW_MAGcollaborators_1stDegree_rematching_woPapersCitationsCollaboratorsAtRetraction_wCollabYear_le2020_"+collaborators_suffix+".csv")

df_1d_collaborators

Unnamed: 0,MAGAID,ScientistType,MAGCollaborationYear,MAGCollabAID,FirstYearPostRetraction,YearOfAttrition,RetractionYear,AuthorType,YearsBetweenRyearAndFirstActivityPostRetraction
0,2.007263e+09,retracted,2008.0,2103177151,2010.0,2020.0,2009.0,retracted,1.0
1,2.007263e+09,retracted,2008.0,2133860354,2010.0,2020.0,2009.0,retracted,1.0
2,2.007263e+09,retracted,2006.0,2089574110,2010.0,2020.0,2009.0,retracted,1.0
3,2.007263e+09,retracted,2006.0,2099750106,2010.0,2020.0,2009.0,retracted,1.0
4,2.007263e+09,retracted,2006.0,2164912843,2010.0,2020.0,2009.0,retracted,1.0
...,...,...,...,...,...,...,...,...,...
224604,2.080702e+09,matched,2020.0,2972314851,2017.0,2020.0,2015.0,matched,2.0
224605,2.080702e+09,matched,2020.0,3083126804,2017.0,2020.0,2015.0,matched,2.0
224606,2.080702e+09,matched,2020.0,3083257570,2017.0,2020.0,2015.0,matched,2.0
224607,2.080702e+09,matched,2020.0,3083380120,2017.0,2020.0,2015.0,matched,2.0


In [114]:
# Split the first-degree collaborator table by scientist type.
is_retracted = df_1d_collaborators['ScientistType'] == 'retracted'
is_matched = df_1d_collaborators['ScientistType'] == 'matched'

retracted_collabs = (
    df_1d_collaborators.loc[is_retracted]
    .drop(columns=['ScientistType'])
    .drop_duplicates()  # not strictly necessary, kept as a safeguard
)

# For matched scientists the id column is renamed so it can be joined on MatchMAGAID.
matched_collabs = (
    df_1d_collaborators.loc[is_matched]
    .drop(columns=['ScientistType'])
    .rename(columns={'MAGAID':'MatchMAGAID'})
    .drop_duplicates()  # not strictly necessary, kept as a safeguard
)

# Keep collaborators of the relevant treated authors only. The right join keeps
# every treated author, including any without a collaborator row.
treated_ids = df_treatment[['MAGAID']].drop_duplicates()
df_1d_collaborators_treatment = retracted_collabs.merge(treated_ids, on='MAGAID', how='right')

# Same for the matches, which also attaches the treated author's MAGAID to each
# match's collaborator rows.
match_pairs = df_control[['MAGAID','MatchMAGAID']].drop_duplicates()
df_1d_collaborators_control = matched_collabs.merge(match_pairs, on=['MatchMAGAID'], how='right')

df_1d_collaborators_control.shape

(133214, 9)

In [115]:
df_1d_collaborators_treatment[df_1d_collaborators_treatment.MAGCollaborationYear.isna()]

Unnamed: 0,MAGAID,MAGCollaborationYear,MAGCollabAID,FirstYearPostRetraction,YearOfAttrition,RetractionYear,AuthorType,YearsBetweenRyearAndFirstActivityPostRetraction


In [116]:
df_1d_collaborators_treatment.MAGAID.nunique(),df_1d_collaborators_control.MAGAID.nunique(),df_1d_collaborators_control.MatchMAGAID.nunique()


(751, 751, 2262)

### Extracting pre- and post-retraction collaborators with 5 year window

Given the assumption that a retraction affects a scientist's reputation for only a certain number of years, after which there is a phase of recovery, we conduct our analysis by limiting collaborations to a 5-year window, such that we only look at collaborators 5 years in the past and 5 years in the future.

**VERY important note: authors with no pre- or post-retraction collaborators may have been dropped earlier. We must add them back.**

In [117]:
#Let us first create a prepost flag to check if a collaborator is before or after retraction given 5 year window

def get_prepost_flag(row):
    """Label one collaboration row relative to the retraction year.

    Reads ``row['MAGCollaborationYear']`` and ``row['RetractionYear']``.

    Returns
    -------
    str
        'pre'   when the collaboration year is missing, or is not later than
                the retraction year;
        'post5' when the collaboration happened 1 to 5 years after it;
        'post'  when it happened more than 5 years after it.

    NOTE(review): 'pre' has no lower bound, so collaborations more than 5 years
    before the retraction are labelled 'pre' as well - confirm that this is the
    intended pre-retraction window.
    """
    collab_year = row['MAGCollaborationYear']
    if pd.isna(collab_year):
        # Rows without a collaboration year are counted as pre-retraction.
        return 'pre'

    retraction_year = row['RetractionYear']
    if collab_year <= retraction_year:
        return 'pre'

    return 'post5' if (collab_year - retraction_year) <= 5 else 'post'


# Flag every collaboration row, row by row, in the treatment and control tables.

df_1d_collaborators_treatment['PrePostFlag5'] = df_1d_collaborators_treatment.apply(
    get_prepost_flag,
    axis=1,
)

df_1d_collaborators_control['PrePostFlag5'] = df_1d_collaborators_control.apply(
    get_prepost_flag,
    axis=1,
)

In [118]:
# Now we must first impute NaNs with empty sets (done below, once the collaborator sets have been extracted)

In [119]:
# Now let us extract pre- and post-retraction collaborators as set

# Group by author id(s), retraction year and pre/post flag, collect the
# collaborator ids of each group into a set, and pivot the flag into columns.
# The 'post' column (more than 5 years after the retraction) is outside the
# window and is dropped.
# NOTE(review): groupby drops groups with a NaN key by default, so an author
# whose RetractionYear is NaN (no collaborator row) would vanish here - confirm
# that no such author exists.
df_1d_collaborators_treatment_w5 = (
    df_1d_collaborators_treatment
    .groupby(['MAGAID', 'RetractionYear', 'PrePostFlag5'])['MAGCollabAID']
    .apply(set)
    .unstack()
    .reset_index()
    .drop(columns=['post'])
)

df_1d_collaborators_control_w5 = (
    df_1d_collaborators_control
    .groupby(['MAGAID', 'MatchMAGAID', 'RetractionYear', 'PrePostFlag5'])['MAGCollabAID']
    .apply(set)
    .unstack()
    .reset_index()
    .drop(columns=['post'])
)


In [120]:
# df_1d_collaborators_control_w5[df_1d_collaborators_control_w5.post5.isna() & 
#                               df_1d_collaborators_control_w5.MatchMAGAID.isin(temp)]

In [121]:
# Inspect the control table: one set of collaborator ids per period ('post5', 'pre')
df_1d_collaborators_control_w5

PrePostFlag5,MAGAID,MatchMAGAID,RetractionYear,post5,pre
0,2184860,2.136872e+09,2006.0,"{2142233216, 2138036098, 3171196163, 212547943...","{2938253029, 2601764421, 2128606473, 196349863..."
1,2184860,2.628313e+09,2008.0,"{2118927203, 2159872740, 2653548035, 298556306...","{2100084742, 2275291657, 2143456268, 263429415..."
2,8197726,1.574644e+09,2009.0,"{2167392961, 1424858499, 2103543300, 195557334...","{2167392961, 1955573347, 2334728430, 2107135623}"
3,47570122,2.063571e+09,2011.0,"{2161412225, 194432546, 2988870465, 2040655364...","{2161412225, 2040655364, 2118656777, 170825127..."
4,59171237,2.145678e+09,2010.0,"{2110558084, 2167601542, 282612616, 2659865352...","{2224689291, 2438685069, 2674527635, 205479310..."
...,...,...,...,...,...
2411,3164389219,2.480513e+09,2013.0,"{2302770432, 2672592672, 2702384553, 256313658...","{2302770432, 2442160417, 2420613156, 268554214..."
2412,3169533057,2.413698e+09,2010.0,"{2150990560, 2466253987, 2687598470, 251075690...","{2238303395, 2121595460, 2510756905, 255812385..."
2413,3173543754,1.840802e+09,2007.0,"{2075075843, 2019336331, 2136420748, 228429569...","{1984615008, 2152376389, 311575975, 2164219148..."
2414,3174447547,1.897294e+09,2008.0,{2074793029},"{2594844899, 2074793029, 2067866536, 266878635..."


In [122]:

# A period without any collaborator is NaN after the pivot; turn every such
# cell into an empty set so that all cells of 'pre' and 'post5' are sets.

# Treatment
df_1d_collaborators_treatment_w5['pre'] = df_1d_collaborators_treatment_w5['pre'].map(
    lambda cell: cell if isinstance(cell, set) else set()
)

df_1d_collaborators_treatment_w5['post5'] = df_1d_collaborators_treatment_w5['post5'].map(
    lambda cell: cell if isinstance(cell, set) else set()
)

# Control
df_1d_collaborators_control_w5['pre'] = df_1d_collaborators_control_w5['pre'].map(
    lambda cell: cell if isinstance(cell, set) else set()
)
df_1d_collaborators_control_w5['post5'] = df_1d_collaborators_control_w5['post5'].map(
    lambda cell: cell if isinstance(cell, set) else set()
)

In [123]:
# Inspect the control table again after replacing NaN cells with empty sets
df_1d_collaborators_control_w5

PrePostFlag5,MAGAID,MatchMAGAID,RetractionYear,post5,pre
0,2184860,2.136872e+09,2006.0,"{2142233216, 2138036098, 3171196163, 212547943...","{2938253029, 2601764421, 2128606473, 196349863..."
1,2184860,2.628313e+09,2008.0,"{2118927203, 2159872740, 2653548035, 298556306...","{2100084742, 2275291657, 2143456268, 263429415..."
2,8197726,1.574644e+09,2009.0,"{2167392961, 1424858499, 2103543300, 195557334...","{2167392961, 1955573347, 2334728430, 2107135623}"
3,47570122,2.063571e+09,2011.0,"{2161412225, 194432546, 2988870465, 2040655364...","{2161412225, 2040655364, 2118656777, 170825127..."
4,59171237,2.145678e+09,2010.0,"{2110558084, 2167601542, 282612616, 2659865352...","{2224689291, 2438685069, 2674527635, 205479310..."
...,...,...,...,...,...
2411,3164389219,2.480513e+09,2013.0,"{2302770432, 2672592672, 2702384553, 256313658...","{2302770432, 2442160417, 2420613156, 268554214..."
2412,3169533057,2.413698e+09,2010.0,"{2150990560, 2466253987, 2687598470, 251075690...","{2238303395, 2121595460, 2510756905, 255812385..."
2413,3173543754,1.840802e+09,2007.0,"{2075075843, 2019336331, 2136420748, 228429569...","{1984615008, 2152376389, 311575975, 2164219148..."
2414,3174447547,1.897294e+09,2008.0,{2074793029},"{2594844899, 2074793029, 2067866536, 266878635..."


In [124]:
# Unique treated authors (treatment table), unique treated authors (control
# table) and unique matches (control table) after building the per-period sets.
print(df_1d_collaborators_treatment_w5.MAGAID.nunique(),
      df_1d_collaborators_control_w5.MAGAID.nunique(),
      df_1d_collaborators_control_w5.MatchMAGAID.nunique())

751 751 2262


In [125]:
# Treated authors that have no row at all in the control table.
missing_magaids = df_1d_collaborators_treatment_w5\
                    [~df_1d_collaborators_treatment_w5['MAGAID'].isin(df_1d_collaborators_control_w5['MAGAID'])]['MAGAID']

# Matches that df_control pairs with those treated authors.
missing_matches = df_control[df_control['MAGAID'].isin(missing_magaids)]['MatchMAGAID']

# Raw collaborator rows of those matches (in the raw table the id of a match is
# stored in MAGAID). NOTE(review): dftemp is not used again in this cell.
dftemp = df_1d_collaborators[df_1d_collaborators.MAGAID.isin(missing_matches)]


# NOTE(review): this bare expression is not the last one in the cell, so it
# displays nothing.
df_1d_collaborators

# Displayed result: an empty Series means every treated author has at least one
# row in the control table.
missing_matches.astype(int)

Series([], Name: MatchMAGAID, dtype: int64)

In [126]:
# Raw collaborator rows whose MAGAID does not appear in the MAGAID column of
# the control table.
# NOTE(review): that column holds the treated authors' ids (the ids of the
# matches are in MatchMAGAID), so the rows of the matches are expected to be
# listed here - confirm that this is the comparison intended.
df_1d_collaborators[~df_1d_collaborators['MAGAID'].isin(df_1d_collaborators_control_w5['MAGAID'])]

Unnamed: 0,MAGAID,ScientistType,MAGCollaborationYear,MAGCollabAID,FirstYearPostRetraction,YearOfAttrition,RetractionYear,AuthorType,YearsBetweenRyearAndFirstActivityPostRetraction
98986,2.323857e+09,matched,1986.0,65801642,1996.0,2009.0,1994.0,matched,2.0
98987,2.323857e+09,matched,1986.0,2443261861,1996.0,2009.0,1994.0,matched,2.0
98988,2.323857e+09,matched,1986.0,3059314501,1996.0,2009.0,1994.0,matched,2.0
98989,2.323857e+09,matched,1999.0,2035580865,1996.0,2009.0,1994.0,matched,2.0
98990,2.323857e+09,matched,1999.0,2140485575,1996.0,2009.0,1994.0,matched,2.0
...,...,...,...,...,...,...,...,...,...
224604,2.080702e+09,matched,2020.0,2972314851,2017.0,2020.0,2015.0,matched,2.0
224605,2.080702e+09,matched,2020.0,3083126804,2017.0,2020.0,2015.0,matched,2.0
224606,2.080702e+09,matched,2020.0,3083257570,2017.0,2020.0,2015.0,matched,2.0
224607,2.080702e+09,matched,2020.0,3083380120,2017.0,2020.0,2015.0,matched,2.0


### Extracting number & set of retained collaborators with a 5 year window

In [127]:
# Retained collaborators are those found both before the retraction ('pre')
# and in the 5-year window after it ('post5'). Store their count and their ids.

# Treatment
df_1d_collaborators_treatment_w5['NumRetentionW5'] = df_1d_collaborators_treatment_w5.apply(
    lambda row: len(row['post5'] & row['pre']),
    axis=1,
)

df_1d_collaborators_treatment_w5['CollabAIDRetainedW5'] = df_1d_collaborators_treatment_w5.apply(
    lambda row: row['post5'] & row['pre'],
    axis=1,
)

# Control
df_1d_collaborators_control_w5['NumRetentionW5'] = df_1d_collaborators_control_w5.apply(
    lambda row: len(row['post5'] & row['pre']),
    axis=1,
)

df_1d_collaborators_control_w5['CollabAIDRetainedW5'] = df_1d_collaborators_control_w5.apply(
    lambda row: row['post5'] & row['pre'],
    axis=1,
)

In [128]:
# Inspect the treatment table with the retention count and the retained ids
df_1d_collaborators_treatment_w5

PrePostFlag5,MAGAID,RetractionYear,post5,pre,NumRetentionW5,CollabAIDRetainedW5
0,2.184860e+06,2008.0,"{1749937409, 2149399999, 2617454923, 260192180...","{2118754826, 2601921804, 2665868049, 230519298...",6,"{2617454923, 2601921804, 2032600174, 230576087..."
1,8.197726e+06,2012.0,"{1969204096, 2226225926, 2024673639, 199363863...","{2024673639, 279977545, 1970425162, 1689175372...",4,"{1970425162, 1689175372, 2122225613, 2024673639}"
2,4.757012e+07,2015.0,"{2810038656, 2612874633, 2664804105, 216196967...","{2539300224, 216902017, 2065029121, 2164769538...",5,"{3106199751, 2161969676, 2254181133, 239672274..."
3,5.917124e+07,2010.0,"{21588321, 2477778402, 2150085834, 2124258861,...","{2184540678, 2309258637, 2130493843, 210086888...",4,"{21588321, 2477778402, 2150085834, 2116044414}"
4,7.092528e+07,2014.0,"{2328435392, 2343414049, 2166760899, 204621162...","{2695372927, 2701780488, 2119067530, 168314770...",4,"{2062888856, 2108751578, 168314770, 2188614167}"
...,...,...,...,...,...,...
746,3.164389e+09,2015.0,"{2312179010, 2717012738, 2805334946, 209874563...","{2044078369, 2233156033, 2021874555, 210650077...",4,"{2151122518, 2717012738, 3137906268, 2098745638}"
747,3.169533e+09,2013.0,"{1529165376, 2123317345, 3080824549, 206997354...","{2584739264, 2138028929, 2949336004, 214012061...",1,{2171895741}
748,3.173544e+09,2011.0,"{3120802076, 2236143686}","{2096029443, 2160112133, 2158121610, 230907623...",0,{}
749,3.174448e+09,2008.0,{2561941943},"{2413204075, 2100828844, 3175667245, 299211752...",1,{2561941943}


In [129]:
# Inspect the control table with the retention count and the retained ids
df_1d_collaborators_control_w5

PrePostFlag5,MAGAID,MatchMAGAID,RetractionYear,post5,pre,NumRetentionW5,CollabAIDRetainedW5
0,2184860,2.136872e+09,2006.0,"{2142233216, 2138036098, 3171196163, 212547943...","{2938253029, 2601764421, 2128606473, 196349863...",4,"{2128606473, 2136984498, 1963498635, 1600873685}"
1,2184860,2.628313e+09,2008.0,"{2118927203, 2159872740, 2653548035, 298556306...","{2100084742, 2275291657, 2143456268, 263429415...",8,"{2118927203, 2159872740, 2275291657, 268779959..."
2,8197726,1.574644e+09,2009.0,"{2167392961, 1424858499, 2103543300, 195557334...","{2167392961, 1955573347, 2334728430, 2107135623}",3,"{2167392961, 1955573347, 2107135623}"
3,47570122,2.063571e+09,2011.0,"{2161412225, 194432546, 2988870465, 2040655364...","{2161412225, 2040655364, 2118656777, 170825127...",7,"{2988870465, 2161412225, 771622433, 2040655364..."
4,59171237,2.145678e+09,2010.0,"{2110558084, 2167601542, 282612616, 2659865352...","{2224689291, 2438685069, 2674527635, 205479310...",11,"{1987745634, 2939406712, 2224689291, 247545838..."
...,...,...,...,...,...,...,...
2411,3164389219,2.480513e+09,2013.0,"{2302770432, 2672592672, 2702384553, 256313658...","{2302770432, 2442160417, 2420613156, 268554214...",4,"{2302770432, 2702384553, 2587525358, 2124554743}"
2412,3169533057,2.413698e+09,2010.0,"{2150990560, 2466253987, 2687598470, 251075690...","{2238303395, 2121595460, 2510756905, 255812385...",1,{2510756905}
2413,3173543754,1.840802e+09,2007.0,"{2075075843, 2019336331, 2136420748, 228429569...","{1984615008, 2152376389, 311575975, 2164219148...",6,"{1984615008, 2152376389, 311575975, 1866114381..."
2414,3174447547,1.897294e+09,2008.0,{2074793029},"{2594844899, 2074793029, 2067866536, 266878635...",1,{2074793029}


In [130]:
# df_1d_collaborators_control_w5[df_1d_collaborators_control_w5.MatchMAGAID.isin(temp)].NumRetentionW5.value_counts()

### Extracting number & set of new collaborators with a 5 year window

In [131]:
# Now let us compute the number of new collaborators

# We can compute them by subtracting pre-retraction collaborators' set from post-retraction collaborators' set
def extract_num_newCollab(row):
    """Count the collaborators in ``row['post5']`` that are absent from ``row['pre']``.

    Both entries are expected to be sets of collaborator ids.
    """
    after, before = row['post5'], row['pre']
    gained = after - before
    return len(gained)

def extract_newCollab(row):
    """Return the set of collaborators in ``row['post5']`` that are absent from ``row['pre']``.

    Both entries are expected to be sets of collaborator ids.
    """
    after, before = row['post5'], row['pre']
    gained = after - before
    return gained

# Store, for every row, the count and the ids of the collaborators gained in
# the 5-year window after the retraction.

# Treatment
df_1d_collaborators_treatment_w5['NumNewCollaboratorsW5'] = df_1d_collaborators_treatment_w5.apply(
    extract_num_newCollab,
    axis=1,
)

df_1d_collaborators_treatment_w5['CollabAIDGainedW5'] = df_1d_collaborators_treatment_w5.apply(
    extract_newCollab,
    axis=1,
)

# Control
df_1d_collaborators_control_w5['NumNewCollaboratorsW5'] = df_1d_collaborators_control_w5.apply(
    extract_num_newCollab,
    axis=1,
)

df_1d_collaborators_control_w5['CollabAIDGainedW5'] = df_1d_collaborators_control_w5.apply(
    extract_newCollab,
    axis=1,
)



In [132]:
# Peek at the treatment table with the retained and gained collaborator columns
df_1d_collaborators_treatment_w5.head(2)

PrePostFlag5,MAGAID,RetractionYear,post5,pre,NumRetentionW5,CollabAIDRetainedW5,NumNewCollaboratorsW5,CollabAIDGainedW5
0,2184860.0,2008.0,"{1749937409, 2149399999, 2617454923, 260192180...","{2118754826, 2601921804, 2665868049, 230519298...",6,"{2617454923, 2601921804, 2032600174, 230576087...",5,"{1749937409, 1517361100, 2798255860, 251012216..."
1,8197726.0,2012.0,"{1969204096, 2226225926, 2024673639, 199363863...","{2024673639, 279977545, 1970425162, 1689175372...",4,"{1970425162, 1689175372, 2122225613, 2024673639}",10,"{1969204096, 2226225926, 1993638631, 256742932..."


In [133]:
# Peek at the control table with the retained and gained collaborator columns
df_1d_collaborators_control_w5.head(2)

PrePostFlag5,MAGAID,MatchMAGAID,RetractionYear,post5,pre,NumRetentionW5,CollabAIDRetainedW5,NumNewCollaboratorsW5,CollabAIDGainedW5
0,2184860,2136872000.0,2006.0,"{2142233216, 2138036098, 3171196163, 212547943...","{2938253029, 2601764421, 2128606473, 196349863...",4,"{2128606473, 2136984498, 1963498635, 1600873685}",34,"{2142233216, 2138036098, 3171196163, 212547943..."
1,2184860,2628313000.0,2008.0,"{2118927203, 2159872740, 2653548035, 298556306...","{2100084742, 2275291657, 2143456268, 263429415...",8,"{2118927203, 2159872740, 2275291657, 268779959...",10,"{2653548035, 1989641415, 2717105898, 249522852..."


### Merging num retention and num new collaborators with treatment and control

In [134]:

# Attach the two collaborator outcomes to the treatment sample.
# The join is an inner join, so a treated author without a row in the
# collaborator table is dropped.
# NOTE(review): the collaborator table has one row per (MAGAID, RetractionYear)
# but the join key is MAGAID only - confirm that no author carries more than
# one retraction year, otherwise rows are multiplied.
df_treatment_augmented = df_treatment.merge(
    df_1d_collaborators_treatment_w5[
        ['MAGAID', 'NumRetentionW5', 'NumNewCollaboratorsW5']
    ].drop_duplicates(),
    on='MAGAID',
    how='inner',
)

df_treatment_augmented

Unnamed: 0,MAGAID,Record ID,MAGPID,RetractionYear,MAGRetractionYearAffID,MAGRetractionYearAffRank,MAGRetractionYearAffYear,MAGrootFID,MAGrootFIDMaxPercent,GenderizeGender,...,ReasonPropagatedMajorityOfMajority,DemiDecadeOfRetraction,MAGAIDFirstORLastAuthorFlag,RetractorMajority,AcademicAgeBeforeRetraction,AcademicAgeBin,MAGFieldName,MAGRetractionYearAffRankStratified,NumRetentionW5,NumNewCollaboratorsW5
0,2184860,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,86803240.0,0.375000,female,...,other,2006-2010,MAGMiddleAuthor,other retractor,2.0,2,biology,101-500,6,5
1,8197726,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,86803240.0,0.333333,female,...,mistake,2011-2015,MAGMiddleAuthor,author,5.0,3-5,biology,101-500,4,10
2,8197726,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185592680.0,0.333333,female,...,mistake,2011-2015,MAGMiddleAuthor,author,5.0,3-5,chemistry,101-500,4,10
3,47570122,1202.0,1.975776e+09,2015.0,126744593.0,201-300,2015.0,86803240.0,0.297297,male,...,misconduct,2011-2015,MAGMiddleAuthor,author,5.0,3-5,biology,101-500,5,19
4,70925285,8100.0,2.068127e+09,2014.0,158248296.0,501-600,2014.0,86803240.0,0.196970,male,...,plagiarism,2011-2015,MAGFirstOrLastAuthor,other retractor,4.0,3-5,biology,501-1000,4,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1059,2641243609,13282.0,2.065182e+09,2013.0,59433898.0,501-600,2013.0,192562407.0,0.307692,male,...,plagiarism,2011-2015,MAGFirstOrLastAuthor,other retractor,5.0,3-5,other STEM fields,501-1000,4,13
1060,2647279690,17399.0,2.012563e+09,2008.0,151727225.0,401-500,2008.0,192562407.0,0.350000,male,...,other,2006-2010,MAGMiddleAuthor,other retractor,1.0,1,other STEM fields,101-500,4,0
1061,3174447547,23452.0,2.166758e+09,2008.0,99065089.0,43,2008.0,192562407.0,0.384615,female,...,other,2006-2010,MAGFirstOrLastAuthor,other retractor,2.0,2,other STEM fields,1-100,1,0
1062,2160069722,16673.0,2.024869e+09,2013.0,153976015.0,501-600,2012.0,17744445.0,0.271605,female,...,plagiarism,2011-2015,MAGFirstOrLastAuthor,other retractor,11.0,>5,non-STEM fields,501-1000,0,1


In [135]:
# Attach the two collaborator outcomes to the control sample, pair by pair.
# The join is an inner join, so a pair without a row in the collaborator table
# is dropped.
# NOTE(review): the collaborator table has one row per
# (MAGAID, MatchMAGAID, RetractionYear) but the join key leaves RetractionYear
# out - confirm that the retraction year used for a match is the one intended
# and that pairs are not multiplied.
df_control_augmented = df_control.merge(
    df_1d_collaborators_control_w5[
        ['MAGAID', 'MatchMAGAID', 'NumRetentionW5', 'NumNewCollaboratorsW5']
    ].drop_duplicates(),
    on=['MAGAID', 'MatchMAGAID'],
    how='inner',
)

df_control_augmented

Unnamed: 0,MAGAID,MatchMAGAID,Record ID,MAGPID,RetractionYear,MAGRetractionYearAffID,MAGRetractionYearAffRank,MAGRetractionYearAffYear,MatchMAGRetractionYearAffID,MatchMAGRetractionYearAffYear,...,ReasonPropagatedMajorityOfMajority,DemiDecadeOfRetraction,MAGAIDFirstORLastAuthorFlag,RetractorMajority,AcademicAgeBeforeRetraction,AcademicAgeBin,MAGFieldName,MAGRetractionYearAffRankStratified,NumRetentionW5,NumNewCollaboratorsW5
0,2184860,2.628313e+09,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,881766915.0,2008.0,...,other,2006-2010,MAGMiddleAuthor,other retractor,2.0,2,biology,101-500,8,10
1,2184860,2.136872e+09,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,205349734.0,2008.0,...,other,2006-2010,MAGMiddleAuthor,other retractor,2.0,2,biology,101-500,4,34
2,8197726,1.574644e+09,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185443292.0,2011.0,...,mistake,2011-2015,MAGMiddleAuthor,author,5.0,3-5,biology,101-500,3,11
3,8197726,1.574644e+09,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185443292.0,2011.0,...,mistake,2011-2015,MAGMiddleAuthor,author,5.0,3-5,chemistry,101-500,3,11
4,47570122,2.063571e+09,1202.0,1.975776e+09,2015.0,126744593.0,201-300,2015.0,100532134.0,2015.0,...,misconduct,2011-2015,MAGMiddleAuthor,author,5.0,3-5,biology,101-500,7,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3617,2714015866,3.168352e+09,7423.0,2.147214e+09,2008.0,129774422.0,151-200,2008.0,204983213.0,2008.0,...,plagiarism,2006-2010,MAGMiddleAuthor,other retractor,0.0,1,non-STEM fields,101-500,2,3
3618,2160069722,2.011915e+09,16673.0,2.024869e+09,2013.0,153976015.0,501-600,2012.0,20121455.0,2010.0,...,plagiarism,2011-2015,MAGFirstOrLastAuthor,other retractor,11.0,>5,non-STEM fields,501-1000,1,8
3619,2501052294,2.405914e+09,18635.0,2.081956e+09,2012.0,191208505.0,201-300,2012.0,195460627.0,2012.0,...,other,2011-2015,MAGMiddleAuthor,other retractor,1.0,1,non-STEM fields,101-500,3,1
3620,3049602676,2.135624e+09,5290.0,1.495099e+09,2014.0,203951103.0,301-400,2012.0,76214153.0,2011.0,...,plagiarism,2011-2015,MAGFirstOrLastAuthor,other retractor,3.0,3-5,non-STEM fields,101-500,1,2


In [136]:
# df_1d_collaborators_control_w5[df_1d_collaborators_control_w5.MatchMAGAID.isin(temp)].NumNewCollaboratorsW5.value_counts()

### Triadic Closure

In [137]:
# Let us first read the triadic closure files

triadic_closure_path = indir+"/triadic_closure/"

# os.listdir returns every directory entry, in arbitrary order. Keep regular,
# non-hidden files only (a hidden entry such as .DS_Store or a sub-directory
# would make read_csv fail) and sort them so that the row order of df_triads is
# the same on every run.
flist = sorted(
    entry for entry in os.listdir(triadic_closure_path)
    if not entry.startswith('.')
    and os.path.isfile(os.path.join(triadic_closure_path, entry))
)

# Read the same five columns from every file and stack the pieces.
dfs = []
for f in flist:
    df = pd.read_csv(triadic_closure_path+f, usecols=['MAGAID','NumOpenTriads','RetractionYear','NumTriadsClosed','NC'])
    dfs.append(df)

# Each file keeps its own row index, so index values repeat in df_triads.
df_triads = pd.concat(dfs)

df_triads.head()


Unnamed: 0,MAGAID,RetractionYear,NumOpenTriads,NumTriadsClosed,NC
0,2643648000.0,2013.0,20,0,0.0
1,2643749000.0,2014.0,33,0,0.0
2,2643908000.0,2015.0,119,0,0.0
3,2644173000.0,2015.0,79,0,0.0
4,2644200000.0,2010.0,147,0,0.0


In [138]:
# Number of distinct ids in the triad files
df_triads.MAGAID.nunique() # because it contains matches too

6540

In [139]:
# Attach the triadic-closure columns; the left joins keep every sample row.
# NOTE(review): only NC is filled with 0 below; NumOpenTriads and
# NumTriadsClosed keep whatever NaN they have.
# NOTE(review): both tables are overwritten in place, so running this cell a
# second time merges the triad columns again.

# Treatment: the triad rows are looked up with the treated author's id.
df_treatment_augmented = df_treatment_augmented.merge(
    df_triads,
    on=['MAGAID', 'RetractionYear'],
    how='left',
)
# Number of treated authors whose NC is missing after the join
print(df_treatment_augmented.loc[df_treatment_augmented['NC'].isna(), 'MAGAID'].nunique())
df_treatment_augmented['NC'] = df_treatment_augmented['NC'].fillna(0)


# Control: the triad rows are looked up with the id of the match.
df_control_augmented = df_control_augmented.merge(
    df_triads.rename(columns={'MAGAID': 'MatchMAGAID'}),
    on=['MatchMAGAID', 'RetractionYear'],
    how='left',
)

# Number of matches whose NC is missing after the join
print(df_control_augmented.loc[df_control_augmented['NC'].isna(), 'MatchMAGAID'].nunique())

df_control_augmented['NC'] = df_control_augmented['NC'].fillna(0)

1
2


In [140]:
# Inspect the control sample with the collaborator and triad outcomes attached
df_control_augmented

Unnamed: 0,MAGAID,MatchMAGAID,Record ID,MAGPID,RetractionYear,MAGRetractionYearAffID,MAGRetractionYearAffRank,MAGRetractionYearAffYear,MatchMAGRetractionYearAffID,MatchMAGRetractionYearAffYear,...,RetractorMajority,AcademicAgeBeforeRetraction,AcademicAgeBin,MAGFieldName,MAGRetractionYearAffRankStratified,NumRetentionW5,NumNewCollaboratorsW5,NumOpenTriads,NumTriadsClosed,NC
0,2184860,2.628313e+09,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,881766915.0,2008.0,...,other retractor,2.0,2,biology,101-500,8,10,33,0,0.000000
1,2184860,2.136872e+09,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,205349734.0,2008.0,...,other retractor,2.0,2,biology,101-500,4,34,28,0,0.000000
2,8197726,1.574644e+09,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185443292.0,2011.0,...,author,5.0,3-5,biology,101-500,3,11,1,0,0.000000
3,8197726,1.574644e+09,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185443292.0,2011.0,...,author,5.0,3-5,chemistry,101-500,3,11,1,0,0.000000
4,47570122,2.063571e+09,1202.0,1.975776e+09,2015.0,126744593.0,201-300,2015.0,100532134.0,2015.0,...,author,5.0,3-5,biology,101-500,7,11,38,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3617,2714015866,3.168352e+09,7423.0,2.147214e+09,2008.0,129774422.0,151-200,2008.0,204983213.0,2008.0,...,other retractor,0.0,1,non-STEM fields,101-500,2,3,7,2,0.285714
3618,2160069722,2.011915e+09,16673.0,2.024869e+09,2013.0,153976015.0,501-600,2012.0,20121455.0,2010.0,...,other retractor,11.0,>5,non-STEM fields,501-1000,1,8,9,1,0.111111
3619,2501052294,2.405914e+09,18635.0,2.081956e+09,2012.0,191208505.0,201-300,2012.0,195460627.0,2012.0,...,other retractor,1.0,1,non-STEM fields,101-500,3,1,12,0,0.000000
3620,3049602676,2.135624e+09,5290.0,1.495099e+09,2014.0,203951103.0,301-400,2012.0,76214153.0,2011.0,...,other retractor,3.0,3-5,non-STEM fields,101-500,1,2,18,0,0.000000


In [141]:
# List every column of the augmented control table
df_control_augmented.columns.tolist()

['MAGAID',
 'MatchMAGAID',
 'Record ID',
 'MAGPID',
 'RetractionYear',
 'MAGRetractionYearAffID',
 'MAGRetractionYearAffRank',
 'MAGRetractionYearAffYear',
 'MatchMAGRetractionYearAffID',
 'MatchMAGRetractionYearAffYear',
 'MatchMAGRetractionYearAffRank',
 'MatchMAGMaxRetractionYear',
 'MAGrootFID',
 'MAGrootFIDMaxPercent',
 'MatchMAGrootFID',
 'MatchMAGrootFIDMaxPercent',
 'GenderizeGender',
 'MAGFirstPubYear',
 'MAGFirstAffID',
 'MAGFirstAffiliationRank',
 'MatchMAGFirstAffID',
 'MatchMAGFirstAffYear',
 'MatchMAGFirstAffiliationRank',
 'MAGCumPapersAtRetraction',
 'MAGCumPapersYearAtRetraction',
 'MatchMAGCumPapersYearAtRetraction',
 'MatchMAGCumPapersAtRetraction',
 'MAGCumCitationsAtRetraction',
 'MAGCumCitationsYearAtRetraction',
 'MatchMAGCumCitationsYearAtRetraction',
 'MatchMAGCumCitationsAtRetraction',
 'MAGCumCollaboratorsAtRetraction',
 'MAGCumCollaboratorsYearAtRetraction',
 'MatchMAGCumCollaboratorsYearAtRetraction',
 'MatchMAGCumCollaboratorsAtRetraction',
 'MAGRetraction

# Additional Strata

At this point, there are two more stratifications that we must add and compute:

1. Number of years between the retraction year and the original publication year
    a. 0-1 year
    b. 2-5 years
    c. 6 or more years
    
2. Attention:
    a. High attention (>10 Altmetric score)
    b. Low attention (<= 10 Altmetric score)

In [142]:
# Load the columns needed for the two additional strata.
indir_processed = "/Users/sm9654/desktop/NYUAD/nyuad-research/retraction_openalex/retraction_effects_on_academic_careers/data/processed/"


# Only four columns are read; exact duplicate rows are removed.
df_filteredSample = (
    pd.read_csv(
        indir_processed+"/RW_authors_w_confounders_filteredSample_postNHB_BedoorsCorrections_Augmented.csv",
        usecols=['MAGAID','RetractionYear', 'AltmetricScoreAtRetraction', 'OriginalPaperYear'],
    )
    .drop_duplicates()
)

# Gap, in years, between the original paper and its retraction.
df_filteredSample['DifferencePublicationAndRetractionYear'] = (
    df_filteredSample['RetractionYear'].sub(df_filteredSample['OriginalPaperYear'])
)

df_filteredSample['DifferencePublicationAndRetractionYear'].describe()

count    15995.000000
mean         2.279712
std          2.915085
min          0.000000
25%          0.000000
50%          1.000000
75%          3.000000
max         29.000000
Name: DifferencePublicationAndRetractionYear, dtype: float64

In [143]:
# Define a function to categorize the data
def categorize_diffyears(years):
    """Bucket the publication-to-retraction gap into a stratum label.

    Parameters
    ----------
    years : int or float
        Number of years between the original paper and its retraction; may be
        NaN.

    Returns
    -------
    str or float
        '0-1 year' for a gap of at most 1, '2-5 years' for a gap above 1 and at
        most 5, '6 or more years' for a larger gap, and ``np.nan`` when the gap
        is missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing gap would fall through and be labelled '6 or more years'.
    if pd.isna(years):
        return np.nan
    if years <= 1:
        return '0-1 year'
    if years <= 5:
        return '2-5 years'
    return '6 or more years'

# Label every row with the stratum of its publication-to-retraction gap.
df_filteredSample['DifferencePublicationAndRetractionYearCategorical'] = (
    df_filteredSample['DifferencePublicationAndRetractionYear']
    .apply(categorize_diffyears)
)

df_filteredSample

Unnamed: 0,MAGAID,RetractionYear,OriginalPaperYear,AltmetricScoreAtRetraction,DifferencePublicationAndRetractionYear,DifferencePublicationAndRetractionYearCategorical
0,2127983451,1997.0,1994.0,0.0,3.0,2-5 years
5,1986180616,2001.0,1995.0,3.0,6.0,6 or more years
8,2134970185,1998.0,1994.0,15.0,4.0,2-5 years
9,2600580187,1998.0,1997.0,0.0,1.0,0-1 year
11,257122240,1994.0,1990.0,0.0,4.0,2-5 years
...,...,...,...,...,...,...
31717,2691683266,2010.0,2009.0,0.0,1.0,0-1 year
31719,2682940880,1998.0,1997.0,0.0,1.0,0-1 year
31721,2689068837,1991.0,1991.0,0.0,0.0,0-1 year
31733,2651016970,2009.0,2004.0,0.0,5.0,2-5 years


In [144]:
# Also let us categorize the attention
# Define a function to categorize the data
def categorize_attention(score):
    """Bucket an Altmetric score into an attention label.

    Parameters
    ----------
    score : int or float
        Altmetric score at retraction; may be NaN.

    Returns
    -------
    str or float
        'low' for a score of at most 10, 'high' for a score above 10, and
        ``np.nan`` when the score is missing.
    """
    # NaN <= 10 is False, so without this guard a missing score would be
    # labelled 'high'.
    if pd.isna(score):
        return np.nan
    return 'low' if score <= 10 else 'high'

# Label every row with its attention stratum and show the size of each stratum.
df_filteredSample['AttentionCategorical'] = (
    df_filteredSample['AltmetricScoreAtRetraction']
    .apply(categorize_attention)
)

df_filteredSample['AttentionCategorical'].value_counts()

AttentionCategorical
low     14005
high     1990
Name: count, dtype: int64

In [145]:
# Unique treated authors (treatment), treated authors (control) and matches
# (control) before the additional strata are merged in
df_treatment_augmented.MAGAID.nunique(), df_control_augmented.MAGAID.nunique(), df_control_augmented.MatchMAGAID.nunique()

(751, 751, 2262)

In [146]:
# Merge the additional strata into both samples. Both joins use the treated
# author's MAGAID and RetractionYear, and both are inner joins, so a row
# without a counterpart in df_filteredSample is dropped.
df_treatment_augmented = df_treatment_augmented.merge(
    df_filteredSample,
    on=['MAGAID', 'RetractionYear'],
    how='inner',
)
df_control_augmented = df_control_augmented.merge(
    df_filteredSample,
    on=['MAGAID', 'RetractionYear'],
    how='inner',
)

In [147]:
# Same three counts after the merge; equal counts mean the inner joins dropped
# no treated author and no match
df_treatment_augmented.MAGAID.nunique(), df_control_augmented.MAGAID.nunique(), df_control_augmented.MatchMAGAID.nunique()

(751, 751, 2262)

In [148]:
# Write the augmented treatment and control samples next to the matching
# files, without the row index.
df_treatment_augmented.to_csv(
    path_or_buf=indir+"/RWMAG_rematched_treatment_augmented_rematching_30perc.csv",
    index=False,
)
df_control_augmented.to_csv(
    path_or_buf=indir+"/RWMAG_rematched_control_augmented_rematching_30perc.csv",
    index=False,
)

In [149]:
# Inspect the final augmented treatment sample
df_treatment_augmented

Unnamed: 0,MAGAID,Record ID,MAGPID,RetractionYear,MAGRetractionYearAffID,MAGRetractionYearAffRank,MAGRetractionYearAffYear,MAGrootFID,MAGrootFIDMaxPercent,GenderizeGender,...,NumRetentionW5,NumNewCollaboratorsW5,NumOpenTriads,NumTriadsClosed,NC,OriginalPaperYear,AltmetricScoreAtRetraction,DifferencePublicationAndRetractionYear,DifferencePublicationAndRetractionYearCategorical,AttentionCategorical
0,2184860,15835.0,2.609888e+09,2008.0,861853513.0,151-200,2008.0,86803240.0,0.375000,female,...,6,5,24,3,0.125000,2008.0,0.0,0.0,0-1 year,low
1,8197726,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,86803240.0,0.333333,female,...,4,10,4,1,0.250000,2007.0,5.5,5.0,2-5 years,low
2,8197726,3444.0,1.506358e+08,2012.0,13955877.0,401-500,2012.0,185592680.0,0.333333,female,...,4,10,4,1,0.250000,2007.0,5.5,5.0,2-5 years,low
3,47570122,1202.0,1.975776e+09,2015.0,126744593.0,201-300,2015.0,86803240.0,0.297297,male,...,5,19,1,0,0.000000,2012.0,10.0,3.0,2-5 years,low
4,70925285,8100.0,2.068127e+09,2014.0,158248296.0,501-600,2014.0,86803240.0,0.196970,male,...,4,12,130,0,0.000000,2014.0,0.0,0.0,0-1 year,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1059,2641243609,13282.0,2.065182e+09,2013.0,59433898.0,501-600,2013.0,192562407.0,0.307692,male,...,4,13,31,2,0.064516,2012.0,0.0,1.0,0-1 year,low
1060,2647279690,17399.0,2.012563e+09,2008.0,151727225.0,401-500,2008.0,192562407.0,0.350000,male,...,4,0,5,0,0.000000,2007.0,0.0,1.0,0-1 year,low
1061,3174447547,23452.0,2.166758e+09,2008.0,99065089.0,43,2008.0,192562407.0,0.384615,female,...,1,0,1,0,0.000000,2008.0,0.0,0.0,0-1 year,low
1062,2160069722,16673.0,2.024869e+09,2013.0,153976015.0,501-600,2012.0,17744445.0,0.271605,female,...,0,1,48,0,0.000000,2012.0,5.0,1.0,0-1 year,low
