# Postprocessing

Once we have extracted the confounders, we will 

(i) filter authors and papers further based on whether we were able to extract confounders for them, and 

(ii) extract, augment, and process confounders further to ensure we have all the variables needed for plotting (and later for regression and matching).

In [1]:
# Importing relevant packages

import pandas as pd
import os
from config_reader import read_config
import numpy as np

In [2]:
# Reading paths
paths = read_config()
AUTHORS_W_CONFOUNDERS_PATH = paths['AUTHORS_W_CONFOUNDERS_PATH']
RW_ORIGINAL_W_YEAR_LOCAL_PATH = paths['RW_ORIGINAL_W_YEAR_LOCAL_PATH']
SCIMAGO_PATH_LOCAL = paths['SCIMAGO_PATH_LOCAL']
MAG_PATH_LOCAL = paths['MAG_PATH_LOCAL']
MAG_FIELD_NAMES_LOCAL = paths['MAG_FIELD_NAMES_LOCAL']
# Path to where we will save our processed files
OUTDIR = paths['PROCESSED_FOLDER_LOCAL']

In [3]:
df_authors = pd.read_csv(AUTHORS_W_CONFOUNDERS_PATH)

df_authors = df_authors.rename(columns={'RetractedPaperMAGPID':'MAGPID',
                                 'FieldName':'MAGFieldName',
                                 'AcademicAge': 'AcademicAgeAtRetraction',
                                 'AffRank': 'MAGRetractionYearAffRank',
                                 'NumAuthorsInRetractedPaperRW':'NumAuthorsInRetractedPaper',
                                 'cumPapers':'MAGCumPapersAtRetraction',
                                 'cumCitations':'MAGCumCitationsAtRetraction',
                                 'cumCollaborators':'MAGCumCollaboratorsAtRetraction'})

# Filtering

In [4]:
df_authors['MAGAID'].nunique()

20713

In [5]:
# Filtering authors without gender, affiliation id, discipline, weird academic age, and also repeated offenders

# gender
df_authors = df_authors[~df_authors['GenderizeGender'].isna() & df_authors['GenderizeConfidence'].gt(0.5)]

# affiliation id
df_authors = df_authors[~df_authors['AffID'].isna()]

# discipline
df_authors = df_authors[~df_authors['MAGrootFID'].isna()]

# academic age
df_authors = df_authors[df_authors['AcademicAgeAtRetraction'].le(70)]


In [6]:
df_authors['MAGAID'].nunique(), df_authors['MAGPID'].nunique()

(15983, 5043)

In [7]:
# removing repeated offenders
df_authors_repeated_offenders = df_authors[df_authors['nRetracted'] > 1]
#df_authors = df_authors[df_authors['nRetracted'] == 1]


df_authors['MAGAID'].nunique(), df_authors_repeated_offenders['MAGAID'].nunique()

(15983, 1402)

In [8]:
df_authors['MAGPID'].nunique(), df_authors_repeated_offenders['MAGPID'].nunique()

(5043, 1105)

In [9]:
df_authors['Record ID'].nunique(), df_authors_repeated_offenders['Record ID'].nunique()

(5049, 1111)

# Fixing some data issues

For repeated offenders, we need to ident

# Augmentation

Now we shall augment some of the columns

In [10]:
# Let us add attrition flags
# Let us first read Bedoor's file
df_attrition_year_Bedoor = pd.read_csv(OUTDIR+"AIDs_YearOfAttrition.csv")\
                            .rename(columns={'AID':'MAGAID'})


# Reading RW to extract attrition date for those with NaN
df_rw = pd.read_csv(RW_ORIGINAL_W_YEAR_LOCAL_PATH,
                   usecols=['Record ID','OriginalPaperDate'])


df_rw['OriginalPaperDate'] = pd.to_datetime(df_rw['OriginalPaperDate'])

# Extracting the year
df_rw['OriginalPaperYear'] = df_rw['OriginalPaperDate'].dt.year


In [11]:
df_attrition_year_Bedoor = df_attrition_year_Bedoor[df_attrition_year_Bedoor['MAGAID'].isin(df_authors['MAGAID'])]

# Merging with df_authors
df_authors = df_authors.merge(df_attrition_year_Bedoor, on='MAGAID', how='left')

# Separating those with year of attrition from those without
df_authors_w_yearOfAttrition = df_authors[~df_authors['YearOfAttrition'].isna()]

# Processing those without
df_authors_wo_yearOfAttrition = df_authors[df_authors['YearOfAttrition'].isna()]
df_authors_wo_yearOfAttrition = df_authors_wo_yearOfAttrition.merge(df_rw, on='Record ID', how='left')
df_authors_wo_yearOfAttrition['YearOfAttrition'] = df_authors_wo_yearOfAttrition['OriginalPaperYear']

df_authors = pd.concat([df_authors_w_yearOfAttrition,df_authors_wo_yearOfAttrition])

def compute_years_active(row):
    if row['YearOfAttrition']  == -1:
        return None
    else:
        return row['YearOfAttrition']-row['RetractionYear']

# Now computing the years active 
df_authors['YearsActive'] = df_authors.apply(lambda row: compute_years_active(row), axis=1)

df_authors = df_authors.drop(columns=['OriginalPaperYear', 'OriginalPaperDate'])
df_authors = df_authors.merge(df_rw, on='Record ID', how='left')

In [12]:
# Extract attrition class
def extract_attrited_class(row, lowerthreshold=-1, upperthreshold=0):
    if(row['YearsActive'] > upperthreshold):
        # That means it is not attrited author
        return 0
    elif(row['YearsActive'] in list(range(lowerthreshold,upperthreshold+1))):
        # That means it is attrited
        return 1
    elif pd.isnull(row['YearsActive']):
        return 0
    else:
        # That means it's attrited but not due to retraction 
        return -1

# Flagging Attrited vs Non-Attrited authors

df_authors['AttritedClass'] = df_authors.apply(lambda row: extract_attrited_class(row), 
                                                                             axis=1)

df_authors['AttritedClassRobust'] = df_authors.apply(lambda row: extract_attrited_class(row,
                                                                            upperthreshold=1), 
                                                                             axis=1)

In [13]:
df_authors.columns

Index(['MAGAID', 'MAGAuthorName', 'Record ID', 'MAGPID', 'RetractionYear',
       'nRetracted', 'dateobject', 'JID', 'CSID', 'JournalName',
       'ConferenceSeriesName', 'ReasonPropagatedMajorityOfMajority',
       'NumAuthorsInRetractedPaper', 'GenderizeGender', 'GenderizeConfidence',
       'MAGAuthorOrder', 'FirstPubMAGPID', 'FirstPubYear',
       'AcademicAgeAtRetraction', 'cumPapersYear', 'MAGCumPapersAtRetraction',
       'cumCitationsYear', 'MAGCumCitationsAtRetraction',
       'cumCollaboratorsYear', 'MAGCumCollaboratorsAtRetraction', 'AffID',
       'MAGRetractionYearAffRank', 'AffYear', 'MAGrootFID',
       'MAGrootFIDMaxPercent', 'YearOfAttrition', 'YearsActive',
       'OriginalPaperDate', 'OriginalPaperYear', 'AttritedClass',
       'AttritedClassRobust'],
      dtype='object')

In [14]:
df_authors[~df_authors['ConferenceSeriesName'].isna()]['ConferenceSeriesName']

150                                            Web Science
693      International Conference on Distributed Comput...
808      International Conference on Bioinformatics and...
826      International Conference on Conceptual Structures
827      International Conference on Conceptual Structures
                               ...                        
34803                                 Ubiquitous Computing
34804                                 Ubiquitous Computing
34805                                 Ubiquitous Computing
34806                                 Ubiquitous Computing
34807                                 Ubiquitous Computing
Name: ConferenceSeriesName, Length: 472, dtype: object

In [15]:
# Let us now add the journal ranking and journal type info as that is that hardest part
def get_journal_type(row):
    if pd.isnull(row['JID']) & pd.isnull(row['CSID']):
        return None
    elif pd.isnull(row['CSID']):
        return 'conferenc'
    else:
        return 'journal'

def get_venue_name(row):
    if not pd.isnull(row['JournalName']):
        return row['JournalName']
    elif not pd.isnull(row['ConferenceSeriesName']):
        return row['ConferenceSeriesName']

df_authors['ConferenceSeriesName'] = df_authors['ConferenceSeriesName'].str.lower()
df_authors['JournalName'] = df_authors['JournalName'].str.lower()

df_authors['MAGJCName'] = df_authors.apply(lambda row: get_venue_name(row), axis=1)
    
df_authors['MAGJournalType'] = df_authors.apply(lambda row: get_journal_type(row), axis=1)
# Now we shall impute rankings, and we will do so by matching on journal title and year from Scimag

dfs_authors_w_ranks = [df_authors[df_authors.OriginalPaperYear.lt(1999)]]

# Only going through the years relevant -- rankings before 1999 not available
for year in range(1999,2016):
    
    dfi = pd.read_csv(f"{SCIMAGO_PATH_LOCAL}/scimagojr-journal-"+str(year)+".csv",
                      sep=";",decimal=',',usecols = ['Title','SJR Best Quartile','SJR'])
    
    dfi['MAGJCName'] = dfi['Title'].str.lower()
    dfi['OriginalPaperYear'] = year
    
    # Let us now fix SJR value
    def fixSJR(row):
        SJR = row['SJR']
        if(pd.isnull(SJR)):
           return np.nan
        elif(SJR == "-"):
           return np.nan
        else:
            return float(".".join(row['SJR'].split(",")))
    if(year == 2015):
        dfi['SJR'] = dfi.apply(lambda row: fixSJR(row), axis=1)
    
    # We do this as some journals have multiple entries per year
    dfi = dfi.drop_duplicates(subset='MAGJCName').drop(columns=['Title'])
    
    df_authors_yeari = df_authors[df_authors.OriginalPaperYear.eq(year)]
    df_authors_yeari = df_authors_yeari.merge(dfi, on=['MAGJCName','OriginalPaperYear'],
                                             how='left')
    
    dfs_authors_w_ranks.append(df_authors_yeari)

df_authors = pd.concat(dfs_authors_w_ranks)\
                        .replace({"SJR Best Quartile": {'Q1':1,'Q2':2,'Q3':3,'Q4':4,'-':np.nan,
                                            '0,153':np.nan}})

df_authors['SJR Best Quartile'].value_counts()


  dfi = pd.read_csv(f"{SCIMAGO_PATH_LOCAL}/scimagojr-journal-"+str(year)+".csv",


SJR Best Quartile
1.0    19049
2.0     4378
3.0     1454
4.0      257
Name: count, dtype: int64

In [16]:
# Let us now add the demi-decade

def compute_decade(row):
    if(row.RetractionYear >= 1990 and row.RetractionYear <= 1995):
        return "1990-1995"
    elif(row.RetractionYear >= 1996 and row.RetractionYear <= 2000):
        return "1996-2000"
    elif(row.RetractionYear >= 2001 and row.RetractionYear <= 2005):
        return "2001-2005"
    elif(row.RetractionYear >= 2006 and row.RetractionYear <= 2010):
        return "2006-2010"
    elif(row.RetractionYear >= 2011 and row.RetractionYear <= 2015):
        return "2011-2015"

df_authors['DemiDecade'] = df_authors.apply(lambda row: compute_decade(row), axis=1)

In [17]:
# Processing author sequence order in the paper

def extract_author_rank_type(row):
    if(row['NumAuthorsInRetractedPaper'] == row['MAGAuthorOrder']):
        return 'First or Last or Only Author'
    if(row['MAGAuthorOrder'] == 1):
        return 'First or Last or Only Author'
    else:
        return 'Middle Author'

df_authors['MAGAIDRankTypeInRetractedPaper'] = \
            df_authors.apply(lambda row: extract_author_rank_type(row), axis=1)

In [18]:
df_authors.columns

Index(['MAGAID', 'MAGAuthorName', 'Record ID', 'MAGPID', 'RetractionYear',
       'nRetracted', 'dateobject', 'JID', 'CSID', 'JournalName',
       'ConferenceSeriesName', 'ReasonPropagatedMajorityOfMajority',
       'NumAuthorsInRetractedPaper', 'GenderizeGender', 'GenderizeConfidence',
       'MAGAuthorOrder', 'FirstPubMAGPID', 'FirstPubYear',
       'AcademicAgeAtRetraction', 'cumPapersYear', 'MAGCumPapersAtRetraction',
       'cumCitationsYear', 'MAGCumCitationsAtRetraction',
       'cumCollaboratorsYear', 'MAGCumCollaboratorsAtRetraction', 'AffID',
       'MAGRetractionYearAffRank', 'AffYear', 'MAGrootFID',
       'MAGrootFIDMaxPercent', 'YearOfAttrition', 'YearsActive',
       'OriginalPaperDate', 'OriginalPaperYear', 'AttritedClass',
       'AttritedClassRobust', 'MAGJCName', 'MAGJournalType', 'SJR',
       'SJR Best Quartile', 'DemiDecade', 'MAGAIDRankTypeInRetractedPaper'],
      dtype='object')

In [19]:
df_rootFID_MAGFieldName = pd.read_csv(MAG_FIELD_NAMES_LOCAL)\
            .rename(columns={'root_FieldID':'MAGrootFID','FieldName':'MAGFieldName'})

df_authors = df_authors.merge(df_rootFID_MAGFieldName, on='MAGrootFID')

# Extracting dummy variables for fields

df_dummies_nHot = pd.crosstab(df_authors['MAGAID'],df_authors['MAGFieldName']).astype(bool).\
                    replace({False:0,True:1}).add_prefix('Field_').reset_index()

df_authors = df_authors.merge(df_dummies_nHot, on='MAGAID', how='left')

# # Find columns with the suffix "Field_"
field_columns = [col for col in df_authors.columns if col.startswith('Field_')]

#Replace spaces with underscores only for those columns
for col in field_columns:
    df_authors.rename(columns={col: col.replace(' ', '')}, inplace=True)
    
df_authors

Unnamed: 0,MAGAID,MAGAuthorName,Record ID,MAGPID,RetractionYear,nRetracted,dateobject,JID,CSID,JournalName,...,Field_geology,Field_history,Field_materialsscience,Field_mathematics,Field_medicine,Field_philosophy,Field_physics,Field_politicalscience,Field_psychology,Field_sociology
0,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0,0,0,0,0,0,0,0,0,0
1,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0,0,0,0,0,0,0,0,0,0
2,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0,0,0,0,0,0,0,0,0,0
3,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0,0,0,0,0,0,0,0,0,0
4,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34890,2749244068,quang duc truong,17131,2035272936,2013.0,2,2013-01-01,42323631.0,,applied surface science,...,0,0,1,0,0,0,1,0,0,0
34891,268272321,alex mishelevitz,16929,1970421558,2012.0,1,2012-10-01,112095004.0,,journal of chemical technology & biotechnology,...,0,0,1,0,0,0,1,0,0,0
34892,2768061868,munesh kumari,2071,2108644616,2014.0,1,2013-10-01,202381698.0,,plos one,...,1,0,0,0,1,0,1,0,0,0
34893,2639980879,d k hazra,8363,1420593637,2014.0,1,2014-01-01,,,,...,0,0,0,0,1,0,0,0,0,0


In [20]:
df_authors.columns.tolist()

['MAGAID',
 'MAGAuthorName',
 'Record ID',
 'MAGPID',
 'RetractionYear',
 'nRetracted',
 'dateobject',
 'JID',
 'CSID',
 'JournalName',
 'ConferenceSeriesName',
 'ReasonPropagatedMajorityOfMajority',
 'NumAuthorsInRetractedPaper',
 'GenderizeGender',
 'GenderizeConfidence',
 'MAGAuthorOrder',
 'FirstPubMAGPID',
 'FirstPubYear',
 'AcademicAgeAtRetraction',
 'cumPapersYear',
 'MAGCumPapersAtRetraction',
 'cumCitationsYear',
 'MAGCumCitationsAtRetraction',
 'cumCollaboratorsYear',
 'MAGCumCollaboratorsAtRetraction',
 'AffID',
 'MAGRetractionYearAffRank',
 'AffYear',
 'MAGrootFID',
 'MAGrootFIDMaxPercent',
 'YearOfAttrition',
 'YearsActive',
 'OriginalPaperDate',
 'OriginalPaperYear',
 'AttritedClass',
 'AttritedClassRobust',
 'MAGJCName',
 'MAGJournalType',
 'SJR',
 'SJR Best Quartile',
 'DemiDecade',
 'MAGAIDRankTypeInRetractedPaper',
 'MAGFieldName',
 'Field_art',
 'Field_biology',
 'Field_business',
 'Field_chemistry',
 'Field_computerscience',
 'Field_economics',
 'Field_engineering',

In [21]:
# Filling NaNs with 0s

df_authors['MAGCumCollaboratorsAtRetraction'].fillna(0, inplace=True)

df_authors['MAGCumCitationsAtRetraction'].fillna(0, inplace=True)

# FIlling NaNs with >1000
df_authors['MAGRetractionYearAffRank'].fillna("1001-", inplace=True)

# Fill reasons

df_authors['ReasonPropagatedMajorityOfMajority'] = df_authors['ReasonPropagatedMajorityOfMajority']\
                                                    .replace("unknown","other")

df_authors['ReasonPropagatedMajorityOfMajority'] = df_authors['ReasonPropagatedMajorityOfMajority']\
                                                    .replace("ambiguous","other")

In [22]:
rank_dict = {'101-150':125,
             '151-200':175,
             '201-300':250,
             '301-400':350,
             '401-500':450,
             '501-600':550,
             '601-700':650,
             '701-800':750,
             '801-900':850,
             '901-1000':950,
             '1001-':1500}

def map2numeric_aff_rank(row):
    return rank_dict.get(row['MAGRetractionYearAffRank'],
                        row['MAGRetractionYearAffRank'])
    

df_authors['MAGRetractionYearAffRankOrdinal'] = df_authors.apply(lambda row: map2numeric_aff_rank(row), axis=1)

df_authors['MAGRetractionYearAffRankOrdinal'] = \
   df_authors['MAGRetractionYearAffRankOrdinal'].astype(float)


"""
At this point there is one last thing that needs to be done i.e. 
for authors who have multiple affiliations, we need to only take their 
high-ranked affiliation only"""

df_affrank = df_authors[['MAGAID','MAGRetractionYearAffRankOrdinal']].\
                drop_duplicates().dropna().sort_values(by='MAGRetractionYearAffRankOrdinal').\
                drop_duplicates(subset='MAGAID',keep='first')


# Now we shall remove the MAGRetractionYearAffRankOrdinal column and reinclude it 
df_authors = df_authors.\
        drop(columns=['MAGRetractionYearAffRankOrdinal']).merge(df_affrank, on='MAGAID',how='left')


df_authors

Unnamed: 0,MAGAID,MAGAuthorName,Record ID,MAGPID,RetractionYear,nRetracted,dateobject,JID,CSID,JournalName,...,Field_history,Field_materialsscience,Field_mathematics,Field_medicine,Field_philosophy,Field_physics,Field_politicalscience,Field_psychology,Field_sociology,MAGRetractionYearAffRankOrdinal
0,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0,0,0,0,0,0,0,0,0,89.0
1,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0,0,0,0,0,0,0,0,0,89.0
2,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0,0,0,0,0,0,0,0,0,89.0
3,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0,0,0,0,0,0,0,0,0,89.0
4,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0,0,0,0,0,0,0,0,0,89.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34890,2749244068,quang duc truong,17131,2035272936,2013.0,2,2013-01-01,42323631.0,,applied surface science,...,0,1,0,0,0,1,0,0,0,125.0
34891,268272321,alex mishelevitz,16929,1970421558,2012.0,1,2012-10-01,112095004.0,,journal of chemical technology & biotechnology,...,0,1,0,0,0,1,0,0,0,450.0
34892,2768061868,munesh kumari,2071,2108644616,2014.0,1,2013-10-01,202381698.0,,plos one,...,0,0,0,1,0,1,0,0,0,1500.0
34893,2639980879,d k hazra,8363,1420593637,2014.0,1,2014-01-01,,,,...,0,0,0,1,0,0,0,0,0,1500.0


In [23]:
# Logging citations and collaborators

# Now we create logged categories with laplace smoothing
df_authors['LogMAGCumCitationsAtRetraction'] = \
        np.log(df_authors['MAGCumCitationsAtRetraction']+1)

df_authors['LogMAGCumCollaboratorsAtRetraction'] = \
        np.log(df_authors['MAGCumCollaboratorsAtRetraction']+1)

In [24]:
# processing reasons
df_authors['ReasonPropagatedMajorityOfMajority'].value_counts()

ReasonPropagatedMajorityOfMajority
misconduct    9986
mistake       9644
plagiarism    9236
other         5930
Name: count, dtype: int64

In [25]:
# processing altmetric


# Let us first read the RW papers dataset

df_papers = pd.read_csv(RW_ORIGINAL_W_YEAR_LOCAL_PATH, usecols=['Record ID','OriginalPaperDOI',
                                                                    'RetractionDOI','RetractionDate'])

# Removing papers with no DOI i.e. attention = NaN

df_papers = df_papers[(~df_papers.RetractionDOI.isna() | ~df_papers.OriginalPaperDOI.isna()) & 
                     (~df_papers.OriginalPaperDOI.isin(['unavailable','Unavailable',
                                                       'unavailabe','unavailabe','unavailablbe']) | 
                     ~df_papers.RetractionDOI.isin(['unavailable','Unavailable',
                                                   'unavailabe','unavailabe','unavailablbe']))]

# Removing anomalous data point from 1756

df_papers = df_papers[~df_papers.RetractionDate.eq('6/24/1756 12:00:00 AM')].copy()

df_papers['RetractionDate'] = df_papers['RetractionDate'].str.split().str[0]

# Converting Retraction Date to datetime format
df_papers['RetractionDate'] = pd.to_datetime(df_papers.RetractionDate, format='%Y-%m-%d')

df_papers.head()

Unnamed: 0,Record ID,RetractionDate,RetractionDOI,OriginalPaperDOI
0,28599,2021-05-15,10.1007/s12035-021-02424-8,10.1007/s12035-016-0248-x
1,28504,2021-05-14,10.1007/s00228-021-03150-9,10.1007/s00228-018-2601-7
2,28506,2021-05-14,10.1021/acsbiomaterials.1c00362,10.1021/acsbiomaterials.9b00547
3,28505,2021-05-14,10.1042/BSR-20160523_RET,10.1042/BSR20160523
4,28502,2021-05-13,10.1007/s10653-021-00962-7,10.1007/s10653-018-0182-0


In [26]:
# Defining broads categories for types of sources
source_to_category = {'AggregateSocialMedia mentions':['Twitter mentions', 'Facebook mentions',
                                      'Google+ mentions', 'LinkedIn mentions', 'Reddit mentions',
                                       'Pinterest mentions','Video mentions'],
                      'AggregateNewsMedia mentions': ['News mentions'],
                      'AggregateBlogs mentions':['Blog mentions'],
                      'AggregateKnowledgeRepositories mentions':['Patent mentions','Peer review mentions',
                                               'Wikipedia mentions','F1000 mentions',
                                               'Q&A mentions']
                     }

# Now let us only read the explorer data for which data from API is available

indir_api = "/Users/sm9654/desktop/NYUAD/nyuad-research/retraction_effects_on_collaboration_networks/data/h4_altmetric/api/new_collection_dec2022/explorer_style_files/"
valid_rids = [int(rid.split(".")[0]) for rid in os.listdir(indir_api)]

dfs_mentions = []

# going through each record id
for rid in valid_rids:
    # reading the file for altmetric data
    df_i = pd.read_csv(indir_api+str(rid)+".csv").rename(columns={'Date':'MentionDate'})
    
    # checking if the shape of the file is zero
    if(df_i.shape[0] != 0):
        
        # casting date column to datetime
        df_i['MentionDate'] = pd.to_datetime(df_i.MentionDate, format='%Y-%m-%d')
        
        # Exclude 'MentionDate' from the sum (was giving error)
        mention_columns = [col for col in df_i.columns if col != 'MentionDate']
        # summing number of mentions across all types of media
        df_i['Total mentions'] = df_i[mention_columns].sum(axis=1)
        # record id column to rid
        df_i['Record ID'] = rid
        # merging this with papers so we get the retraction date
        df_merged_i = df_i.merge(df_papers,on='Record ID',how='left')
        
        # getting number of months from retraction for each date of mention
        df_merged_i['MonthsToMentionFromRetraction'] = ((df_merged_i['MentionDate'] - \
                                                         df_merged_i['RetractionDate'])/np.timedelta64(1, 'M'))
        
        df_merged_i = df_merged_i.drop(columns=['RetractionDate',
       'RetractionDOI', 'OriginalPaperDOI', 'MentionDate'])
        
        
        def create_mention_columns(df_temp, upperlimit=None, lowerlimit=None,suffix='AtRetraction'):
            if upperlimit and lowerlimit:
                df_temp = df_temp[df_temp.\
                              MonthsToMentionFromRetraction.lt(upperlimit) & 
                              df_temp.MonthsToMentionFromRetraction.gt(lowerlimit)].\
                                groupby('Record ID').sum().reset_index().\
                                drop(columns=['MonthsToMentionFromRetraction'])
            elif lowerlimit:
                df_temp = df_temp[df_temp.MonthsToMentionFromRetraction.gt(lowerlimit)].\
                                groupby('Record ID').sum().reset_index().\
                                drop(columns=['MonthsToMentionFromRetraction'])
                
            elif upperlimit:
                df_temp = df_temp[df_temp.MonthsToMentionFromRetraction.lt(upperlimit)].\
                                groupby('Record ID').sum().reset_index().\
                                drop(columns=['MonthsToMentionFromRetraction'])
                
            # Classifying sources to types of sources of mentions
            for source in source_to_category.keys():
                df_temp[source] = df_temp[source_to_category.get(source)].sum(axis=1)

            # Renaming for easy access
            df_temp.columns = df_temp.columns.str.replace(" mentions","Mentions"+suffix)
            
            return df_temp
        
        
        # Let us first extract mentions at the time of retraction (within 6 month before and after retraction)
        df_merged_i_AtRetraction = create_mention_columns(df_merged_i, 
                                                          upperlimit=7, lowerlimit=-7)
        
        # Now let us extract mentions before retraction (i.e. 6 months before retraction)
        df_merged_i_PreRetraction = create_mention_columns(df_merged_i, 
                                                          upperlimit=-6, suffix='PreRetraction')
        
        # Now let us extract mentions after retraction (i.e. 6 months after retraction)
        df_merged_i_PostRetraction = create_mention_columns(df_merged_i, 
                                                          lowerlimit=6, suffix='PostRetraction')
        # Now we merge all of these 
        df_merged_i_attention = df_merged_i_AtRetraction.merge(df_merged_i_PreRetraction, on='Record ID',
                                                      how='outer')\
                                                .merge(df_merged_i_PostRetraction, on='Record ID',
                                                      how='outer')
        # Filling NaNs with 0s
        df_merged_i_attention.fillna(0, inplace=True)
        
        dfs_mentions.append(df_merged_i_attention)
        


In [27]:
df_mentions_concatenated = pd.concat(dfs_mentions)
df_mentions_concatenated.head()

Unnamed: 0,Record ID,BlogMentionsAtRetraction,F1000MentionsAtRetraction,FacebookMentionsAtRetraction,Google+MentionsAtRetraction,LinkedInMentionsAtRetraction,NewsMentionsAtRetraction,PatentMentionsAtRetraction,Peer reviewMentionsAtRetraction,PinterestMentionsAtRetraction,...,RedditMentionsPostRetraction,TwitterMentionsPostRetraction,VideoMentionsPostRetraction,WeiboMentionsPostRetraction,WikipediaMentionsPostRetraction,TotalMentionsPostRetraction,AggregateSocialMediaMentionsPostRetraction,AggregateNewsMediaMentionsPostRetraction,AggregateBlogsMentionsPostRetraction,AggregateKnowledgeRepositoriesMentionsPostRetraction
0,19905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,13596,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,22993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,223,3.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,...,0.0,0.0,0.0,0.0,2.0,11.0,0.0,0.0,1.0,10.0
0,23333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Now that we have processed attention, we need to do the following:
# 1. First we need to merge the dataframes 
# 2. Second we need to merge them to our large dataframe
# 3. Third we need to identify papers for which attention was 0 -- these are papers that are within df_papers
# 4. The rest of the papers will have NaNs as their attention is undefined. 

counter = 0
for rid in valid_rids:
    df_i = pd.read_csv(indir_api+str(rid)+".csv").rename(columns={'Date':'MentionDate'})
    
    if(df_i.shape[0] != 0):
        counter += 1
    else:
        print(rid)
        
len(valid_rids), counter

12844
22987
551
25018
2282
24460
1774
8022
2269
14560
4142
6594
4383
26934
7661
1210
8220
4432
2725
2731
7891
1238
13019
21915
155
21646
22367
6386
22373
18004
17337
3201
182
14164
800
24919
12273
23535
22603
3956
6609
21493
23290
20799
8630
2335
10666
3981
3994
20968
12064
20940
10883
1601
395
3016
4779
21492
17861
21445
15086
12931
12925
20997
10854
21451
801
2644
18039
5659
1359
7072
17444
18777
16982
3599
2687
18788
24059
21647
975
21914
3348
5063
26921
1205
8547
1563
3374
10539
26935
5088
4396
17295
21041
24139
746
9467
3638
4157
19045
1775
3162
4180
4194
25757
4802
12845
544
15680
25033
21255
8790
17903
208
26260
22012
17917
591
3160
16348
12648
14211
27803
5539
3148
13218
12674
26738
744
23654
2901
7851
22210
5934
793
26923
14985
8237
15319
963
630
8380
6385
7931
24926
8357
3564
16994
16980
3216
10467
9713
12516
4223
4545
426
20759
20981
20995
3941
4784
1824
368
6178
8155
1165
6144
4021
4746
1158
7529
18210
1170
21485
1825
4785
2492
17876
4961
6810
21334
21320
20758
16003
6347
5

957
5733
17274
3342
19259
8203
18635
7865
24127
22218
16803
5082
20369
7695
20180
23489
14557
12126
6986
18423
6992
22967
3154
20816
3815
14594
24319
24447
1974
19936
25017
1948
2272
2500
1021
12678
4159
17714
6982
13214
18816
21729
26052
24123
774
27402
23670
6570
13758
3408
4415
2702
16634
17502
13016
24862
21649
11952
26898
5694
18792
827
17338
5119
14157
11761
3554
20755
4788
1182
10872
19103
16556
17890
11239
20796
6174
18221
1633
2448
18553
8165
20972
10668
17107
17885
18208
20797
19116
24283
11238
4010
18591
1183
4951
1197
3970
17846
10697
20754
359
20998
20740
24917
1342
17339
6377
198
2105
1381
615
2688
20218
5695
22341
5865
14181
22355
24863
5736
21099
1578
3384
14963
3623
7337
3637
12679
14208
2501
17729
19704
1008
6001
2273
3838
1791
1961
15857
12848
23459
1963
1793
561
5293
21266
17918
16353
4172
6771
6017
2271
4600
1750
22974
17703
20352
7876
23667
19504
2073
23840
20391
3437
26643
18961
1383
830
24929
2107
12531
24901
3225
26870
58
818
20756
20030
14395
27993
13388
17878

(11265, 9480)

In [29]:
# Now we merge this with our original dataframe

df_authors_w_attention = df_authors.merge(df_mentions_concatenated, on='Record ID', how='left')

# Now we need to impute NaNs with 0s but only for papers for which attention is 0 and DOI is present

valid_papers_w_attention = df_papers['Record ID'].unique()

for attention_col in df_mentions_concatenated.drop(columns=['Record ID']).columns:
    
    df_authors_w_attention[attention_col] = np.where(pd.isnull(df_authors_w_attention[attention_col]), 
                                                      0,
                                                      df_authors_w_attention[attention_col])
    

df_authors_w_attention[['MAGAID','TotalMentionsAtRetraction']].drop_duplicates()['TotalMentionsAtRetraction']\
                                                                        .describe()

count    16482.000000
mean         5.893460
std         99.707017
min          0.000000
25%          0.000000
50%          0.000000
75%          2.000000
max       5496.000000
Name: TotalMentionsAtRetraction, dtype: float64

In [30]:
# computing altmetric score

source_to_weights = {'Blog': 5,
     'F1000': 1,
     'Facebook': 0.25,
     'Google+': 1,
     'LinkedIn': 0.5,
     'News': 8,
     'Patent': 3,
     'Peer review': 1,
     'Pinterest': 0.25,
     'Policy': 3,
     'Q&A': 0.25,
     'Reddit': 0.25,
     'Twitter': 0.25,
     'Video':0.25,
     'Weibo':1,
     'Wikipedia':3}

def compute_altmetric_score(row, suffix="AtRetraction"):
    score = 0
    for source in source_to_weights.keys():
        score += row[source+"Mentions"+suffix] * source_to_weights.get(source)
    return score

suffix = "AtRetraction"
df_authors_w_attention['AltmetricScore'+suffix] = df_authors_w_attention\
                                                    .apply(lambda row: compute_altmetric_score(row),axis=1)

suffix = "PreRetraction"
df_authors_w_attention['AltmetricScore'+suffix] = df_authors_w_attention\
                                                    .apply(lambda row: compute_altmetric_score(row, 
                                                                                              suffix),axis=1)

suffix = "PostRetraction"
df_authors_w_attention['AltmetricScore'+suffix] = df_authors_w_attention\
                                                    .apply(lambda row: compute_altmetric_score(row, suffix),axis=1)

df_authors_w_attention


Unnamed: 0,MAGAID,MAGAuthorName,Record ID,MAGPID,RetractionYear,nRetracted,dateobject,JID,CSID,JournalName,...,WeiboMentionsPostRetraction,WikipediaMentionsPostRetraction,TotalMentionsPostRetraction,AggregateSocialMediaMentionsPostRetraction,AggregateNewsMediaMentionsPostRetraction,AggregateBlogsMentionsPostRetraction,AggregateKnowledgeRepositoriesMentionsPostRetraction,AltmetricScoreAtRetraction,AltmetricScorePreRetraction,AltmetricScorePostRetraction
0,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.00,3.0,6.0
1,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.00,3.0,6.0
2,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.00,3.0,6.0
3,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.00,3.0,6.0
4,2127983451,william r jacobs,2343,1790891469,1997.0,1,1994-07-01,133274750.0,,journal of bacteriology,...,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.00,3.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34890,2749244068,quang duc truong,17131,2035272936,2013.0,2,2013-01-01,42323631.0,,applied surface science,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.25,0.0,0.0
34891,268272321,alex mishelevitz,16929,1970421558,2012.0,1,2012-10-01,112095004.0,,journal of chemical technology & biotechnology,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.00,0.0,0.0
34892,2768061868,munesh kumari,2071,2108644616,2014.0,1,2013-10-01,202381698.0,,plos one,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.00,0.0,0.0
34893,2639980879,d k hazra,8363,1420593637,2014.0,1,2014-01-01,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0


In [31]:
df_authors_w_attention[df_authors_w_attention.AltmetricScoreAtRetraction.gt(0)].AltmetricScoreAtRetraction.describe()


count    14620.000000
mean        17.910021
std        129.082979
min          0.250000
25%          5.000000
50%          7.250000
75%         11.000000
max       3756.500000
Name: AltmetricScoreAtRetraction, dtype: float64

In [32]:
df_authors_w_attention.columns

Index(['MAGAID', 'MAGAuthorName', 'Record ID', 'MAGPID', 'RetractionYear',
       'nRetracted', 'dateobject', 'JID', 'CSID', 'JournalName',
       ...
       'WeiboMentionsPostRetraction', 'WikipediaMentionsPostRetraction',
       'TotalMentionsPostRetraction',
       'AggregateSocialMediaMentionsPostRetraction',
       'AggregateNewsMediaMentionsPostRetraction',
       'AggregateBlogsMentionsPostRetraction',
       'AggregateKnowledgeRepositoriesMentionsPostRetraction',
       'AltmetricScoreAtRetraction', 'AltmetricScorePreRetraction',
       'AltmetricScorePostRetraction'],
      dtype='object', length=131)

In [33]:
# Renaming some more

df_authors_w_attention = df_authors_w_attention.rename(columns={
                                 'SJR':'SJRScoreRetractedPaperYear',
                                 'SJR Best Quartile':'SJRQuartileRetractedPaperYear'})

In [34]:
# Saving
# Constants
OUTPUT_DIRECTORY = OUTDIR
FILENAME = "RW_authors_w_confounders_filteredSample_postNHB"

file_path = os.path.join(OUTPUT_DIRECTORY, f"{FILENAME}.csv")

# Writing DataFrame to CSV with error handling
try:
    df_authors_w_attention.drop_duplicates().to_csv(file_path, index=False)
    print(f"File saved successfully")
except Exception as e:
    print(f"Error saving file: {e}")

File saved successfully


In [35]:
df_authors_w_attention[['MAGPID','RetractionYear','AltmetricScoreAtRetraction']]\
                                .drop_duplicates()

Unnamed: 0,MAGPID,RetractionYear,AltmetricScoreAtRetraction
0,1790891469,1997.0,0.0
5,2339456987,2001.0,3.0
8,1978992330,1998.0,15.0
9,2009868552,1998.0,0.0
11,1958327809,1994.0,0.0
...,...,...,...
34626,1994283215,2013.0,0.0
34704,2024656785,2014.0,0.0
34795,2323068417,2015.0,0.0
34812,1771091107,2015.0,1.5


In [36]:
df_authors_w_attention['MAGPID'].nunique()

5042

In [37]:
df_authors_w_attention['MAGAID'].nunique()

15981

In [38]:
df_authors_w_attention[df_authors_w_attention['nRetracted'] == 1]\
        [['MAGAID','AltmetricScoreAtRetraction']].drop_duplicates()['AltmetricScoreAtRetraction'].describe()

count    14579.000000
mean         6.208793
std         55.562944
min          0.000000
25%          0.000000
50%          0.000000
75%          5.000000
max       3756.500000
Name: AltmetricScoreAtRetraction, dtype: float64

In [39]:
df_authors_w_attention[df_authors_w_attention['nRetracted'] == 1]['MAGAID'].nunique()

14579

In [40]:
df_authors_w_attention[df_authors_w_attention['nRetracted'] == 1]['MAGPID'].nunique()

4578

In [41]:
df_authors_w_attention.columns.tolist()

['MAGAID',
 'MAGAuthorName',
 'Record ID',
 'MAGPID',
 'RetractionYear',
 'nRetracted',
 'dateobject',
 'JID',
 'CSID',
 'JournalName',
 'ConferenceSeriesName',
 'ReasonPropagatedMajorityOfMajority',
 'NumAuthorsInRetractedPaper',
 'GenderizeGender',
 'GenderizeConfidence',
 'MAGAuthorOrder',
 'FirstPubMAGPID',
 'FirstPubYear',
 'AcademicAgeAtRetraction',
 'cumPapersYear',
 'MAGCumPapersAtRetraction',
 'cumCitationsYear',
 'MAGCumCitationsAtRetraction',
 'cumCollaboratorsYear',
 'MAGCumCollaboratorsAtRetraction',
 'AffID',
 'MAGRetractionYearAffRank',
 'AffYear',
 'MAGrootFID',
 'MAGrootFIDMaxPercent',
 'YearOfAttrition',
 'YearsActive',
 'OriginalPaperDate',
 'OriginalPaperYear',
 'AttritedClass',
 'AttritedClassRobust',
 'MAGJCName',
 'MAGJournalType',
 'SJRScoreRetractedPaperYear',
 'SJRQuartileRetractedPaperYear',
 'DemiDecade',
 'MAGAIDRankTypeInRetractedPaper',
 'MAGFieldName',
 'Field_art',
 'Field_biology',
 'Field_business',
 'Field_chemistry',
 'Field_computerscience',
 'Fiel

In [42]:
df_authors_w_attention[df_authors_w_attention['SJRQuartileRetractedPaperYear'].isna()]['MAGAID'].nunique()

4694