# Clustered Regressions

This notebook contains the code for the regression analysis with attrition as the dependent variable; standard errors are clustered by retraction record.

In [32]:
# Importing relevant libraries

import math
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets
from stargazer.stargazer import Stargazer

In [34]:
# Load the regression table
df_main = pd.read_csv(
    "../../../../data/h4_altmetric/regression/RW_Authors_forRegression_rematching.csv"
)

# Post-NHB revision: attrition was redefined, so the new flag is read from a
# separate file. Only the join key (MAGAID) and the flag itself are loaded.
df_attrition_postNHB = pd.read_csv(
    "../../../../data/attrition_postNHB/retracted_authors_attrition_info_postNHB.csv",
    usecols=['MAGAID', 'AttritionNew'],
)

# Default (inner) merge: rows whose MAGAID has no entry in the post-NHB file are dropped
df_main = pd.merge(df_main, df_attrition_postNHB, on='MAGAID')

# The original attrition class stays available under the name `Attrition` ...
df_main['Attrition'] = df_main['AttritedClass']

# ... but the dependent variable used when building the formulas is the redefined flag
Yi = 'AttritionNew'

# Keep only rows coded 0 or 1; any other code (attrited due to something else) is removed
df_main = df_main.loc[df_main['AttritionNew'].isin([0, 1])]


df_main.tail(2)

Unnamed: 0,Record ID,MAGPID,RetractionYear,OriginalPaperYear,MAGAffIDRetractedPaper,MAGRetractedPIDYear,MAGAffRankRetractedPaper,MAGFirstName,GenderizeGender,GenderizeConfidence,...,TotalPostersPerRecord,PercentageResearchers,PercentagePractitioners,PercentagePublic,PercentageJournalists,RetractorMajority,ReasonPropagatedOverallMajority,ReasonPropagatedMajorityOfMajority,AttritionNew,Attrition
30791,17547,1900584000.0,2015,2015.0,155173764.0,2015.0,,christopher,male,0.99,...,154.0,0.25974,0.090909,0.623377,0.025974,,mistake,mistake,0,0
30792,17547,1900584000.0,2015,2015.0,155173764.0,2015.0,,christopher,male,0.99,...,154.0,0.25974,0.090909,0.623377,0.025974,,mistake,mistake,0,0


In [36]:
# One row per distinct (author, attrition flag) pair, then count the pairs per flag value
df_main.drop_duplicates(subset=['MAGAID', 'AttritionNew'])['AttritionNew'].value_counts()

AttritionNew
0    9188
1    2286
Name: count, dtype: int64

In [37]:
# Distinct authors with an Altmetric score above 20 and a known SJR quartile
df_main.loc[
    (df_main['AltmetricScoreAtRetraction'] > 20)
    & df_main['SJRQuartileRetractedPaperYear'].notna(),
    'MAGAID',
].nunique()

392

In [38]:
# Distinct authors with an Altmetric score of at most 20 and a known SJR quartile
df_main.loc[
    (df_main['AltmetricScoreAtRetraction'] <= 20)
    & df_main['SJRQuartileRetractedPaperYear'].notna(),
    'MAGAID',
].nunique()

8080

In [39]:
# Summary statistics of the cumulative collaborator count at retraction
df_main['MAGCumCollaboratorsAtRetraction'].describe()

count    24183.000000
mean       192.493280
std        643.174994
min          0.000000
25%          9.000000
50%         33.000000
75%        130.000000
max      20543.000000
Name: MAGCumCollaboratorsAtRetraction, dtype: float64

In [40]:
# Frequency of each affiliation-rank value, with missing values counted as their own bucket
df_main['MAGRetractionYearAffRankOrdinal'].value_counts(dropna=False)

MAGRetractionYearAffRankOrdinal
1500.0    6327
250.0     2061
125.0     1915
350.0     1757
175.0     1578
          ... 
59.0        17
9.0         14
87.0        12
60.0         8
48.0         3
Name: count, Length: 87, dtype: int64

In [41]:
# Attention measures carried into the regression frame
attention_agg_cols = ['AltmetricScoreAtRetraction',
                     'AggregateSocialMediaMentionsAtRetraction',
                     'AggregateNewsMediaMentionsAtRetraction',
                     'AggregateBlogsMentionsAtRetraction',
                     'AggregateKnowledgeRepositoriesMentionsAtRetraction']

# Regression frame: identifiers, outcome columns, author covariates, the Field_*
# columns, paper-level covariates and the attention measures, with exact duplicate
# rows removed.
# The trailing .copy() detaches the result from df_main: later cells add a column to
# `df`, and assigning into a frame derived from a slice raises SettingWithCopyWarning.
df = df_main[['Record ID','MAGAID'] + [Yi,'AttritedClassRobust']+
                 ['GenderizeGender',
                 'AcademicAgeAtRetraction',
                 'MAGCumPapersAtRetraction',
                 'LogMAGCumCitationsAtRetraction',
                 'LogMAGCumCollaboratorsAtRetraction']+
                list(df_main.\
                    filter(regex=("Field_.*")).columns)+
                ['MAGAIDRankTypeInRetractedPaper',
                'RetractionYear',
                'ReasonPropagatedMajorityOfMajority',
                'MAGJournalType',
                'SJRQuartileRetractedPaperYear',
                'MAGRetractionYearAffRankOrdinal',
                'NumAuthorsInRetractedPaper']+
                attention_agg_cols].drop_duplicates().copy()

df

Unnamed: 0,Record ID,MAGAID,AttritionNew,AttritedClassRobust,GenderizeGender,AcademicAgeAtRetraction,MAGCumPapersAtRetraction,LogMAGCumCitationsAtRetraction,LogMAGCumCollaboratorsAtRetraction,Field_ART,...,ReasonPropagatedMajorityOfMajority,MAGJournalType,SJRQuartileRetractedPaperYear,MAGRetractionYearAffRankOrdinal,NumAuthorsInRetractedPaper,AltmetricScoreAtRetraction,AggregateSocialMediaMentionsAtRetraction,AggregateNewsMediaMentionsAtRetraction,AggregateBlogsMentionsAtRetraction,AggregateKnowledgeRepositoriesMentionsAtRetraction
0,3031,2.111744e+09,0,0,male,35.0,166.0,8.491670,4.962845,0,...,mistake,journal,,13.0,2.0,0.0,0.0,0.0,0.0,0.0
1,3031,2.245003e+09,0,0,male,2.0,4.0,2.197225,0.693147,0,...,mistake,journal,,13.0,2.0,0.0,0.0,0.0,0.0,0.0
3,1082,2.120727e+09,1,0,male,5.0,32.0,3.258097,3.367296,0,...,mistake,journal,,350.0,7.0,0.0,0.0,0.0,0.0,0.0
4,1082,2.151686e+09,0,0,male,18.0,246.0,6.505784,5.347108,0,...,mistake,journal,,350.0,7.0,0.0,0.0,0.0,0.0,0.0
5,1082,2.552715e+09,0,0,male,6.0,41.0,4.317488,3.737670,0,...,mistake,journal,,350.0,7.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30780,16476,2.671948e+09,1,1,female,1.0,4.0,2.302585,1.945910,0,...,plagiarism,journal,,250.0,6.0,0.0,0.0,0.0,0.0,0.0
30781,16476,2.993835e+09,0,0,female,5.0,45.0,5.141664,3.583519,0,...,plagiarism,journal,,250.0,6.0,0.0,0.0,0.0,0.0,0.0
30782,8314,1.979824e+09,0,0,male,4.0,10.0,3.871201,1.386294,0,...,mistake,journal,3.0,550.0,1.0,0.0,0.0,0.0,0.0,0.0
30784,2835,1.972149e+09,1,1,male,1.0,3.0,1.945910,1.791759,0,...,misconduct,journal,2.0,850.0,4.0,0.0,0.0,0.0,0.0,0.0


In [42]:
def _fit_clustered(model_factory, df, equation):
    """
    Fit `Yi ~ equation` with standard errors clustered on 'Record ID'.

    model_factory: formula constructor to use (smf.ols or smf.logit).
    df: data passed to the formula.
    equation: right-hand side of the formula.

    Prints the dependent variable name and returns the fitted results object.
    """
    print(Yi)
    model = model_factory(Yi + "~ " + equation, data=df, missing='drop')
    # missing='drop' removes rows with NaNs in the formula variables, but the cluster
    # labels handed to fit() are not filtered along with them. Restrict the labels to
    # the rows the model actually kept so that groups and observations line up.
    # Assumes the index of `df` is unique.
    groups = df.loc[model.data.row_labels, 'Record ID']
    return model.fit(cov_type='cluster', cov_kwds={'groups': groups})


def regress_ols_noSJR(df, equation):
    """OLS with clustered SEs; only rows with NaNs in the formula variables are dropped."""
    return _fit_clustered(smf.ols, df, equation)

def regress_ols_wSJR(df, equation):
    """OLS with clustered SEs on the rows of `df` that have no NaN in any column."""
    # Dropping up front keeps only complete rows (all columns of df, not just the
    # ones named in the formula).
    return _fit_clustered(smf.ols, df.dropna(), equation)


def regress_logit_noSJR(df, equation):
    """Logit with clustered SEs; only rows with NaNs in the formula variables are dropped."""
    return _fit_clustered(smf.logit, df, equation)

def regress_logit_wSJR(df, equation):
    """Logit with clustered SEs on the rows of `df` that have no NaN in any column."""
    return _fit_clustered(smf.logit, df.dropna(), equation)


In [43]:
# Lookup from a short field id to the formula term(s) that represent it.
# Categorical terms use C(..., Treatment(reference=...)) to fix the baseline level;
# "field" expands to the sum of every Field_* column found in df_main.
expression_dict = dict(
    age="AcademicAgeAtRetraction",
    gender="C(GenderizeGender, Treatment(reference='male'))",
    yearofretraction="RetractionYear",
    logcitations="LogMAGCumCitationsAtRetraction",
    collaborators="LogMAGCumCollaboratorsAtRetraction",
    papers="MAGCumPapersAtRetraction",
    reasons="C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))",
    affrank="MAGRetractionYearAffRankOrdinal",
    contributionrank="C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))",
    numauthors="NumAuthorsInRetractedPaper",
    venue="C(MAGJournalType, Treatment(reference='conference'))",
    impactfactor="SJRQuartileRetractedPaperYear",
    retractor="C(RetractorMajority, Treatment(reference='author'))",
    field=" + ".join(df_main.filter(regex="Field_.*").columns),
)

In [44]:
def regress_all_together(regressionType, dfi, attention_expression, field_ids):
    """
    Run one regression of the dependent variable on the attention term plus
    all the controls named in `field_ids`.

    regressionType: one of 'ols-noSJR', 'ols-wSJR', 'logit-noSJR', 'logit-wSJR'.
    dfi: data frame to regress on; a copy is passed on, so `dfi` itself is not modified here.
    attention_expression: formula term for attention; it comes first in the equation.
    field_ids: iterable of keys of `expression_dict`, appended in the given order.

    Returns the fitted results object of the selected regression function.

    Raises KeyError for a field id missing from `expression_dict` and ValueError for
    an unknown `regressionType`.
    """
    # Fail early with a readable message instead of a `str + None` TypeError
    unknown_ids = [field_id for field_id in field_ids if field_id not in expression_dict]
    if unknown_ids:
        raise KeyError("Unknown field id(s): {}".format(unknown_ids))

    regressors = {
        'ols-noSJR': regress_ols_noSJR,
        'ols-wSJR': regress_ols_wSJR,
        'logit-noSJR': regress_logit_noSJR,
        'logit-wSJR': regress_logit_wSJR,
    }
    if regressionType not in regressors:
        raise ValueError("Unknown regressionType {!r}; expected one of {}".format(
            regressionType, sorted(regressors)))

    # attention term first, then one term per requested control, joined with " + "
    terms = [attention_expression] + [expression_dict[field_id] for field_id in field_ids]
    equation = " + ".join(terms)

    return regressors[regressionType](dfi.copy(), equation)

In [45]:
def discretize_attention_binary(score, threshold_high=21):
    """
    Map a score to one of two attention labels.

    Returns "high attention" when score >= threshold_high and "low attention" when
    score < threshold_high. A score for which neither comparison holds (e.g. NaN)
    yields None.
    """
    if score >= threshold_high:
        return "high attention"
    if score < threshold_high:
        return "low attention"
    return None

def discretize_attention_tertiary(score, threshold_medium=3, threshold_high=5):
    """
    Map a score to one of three attention labels.

    score < threshold_medium                    -> "low attention"
    threshold_medium <= score < threshold_high  -> "medium attention"
    score >= threshold_high                     -> "high attention"

    A score that satisfies none of the comparisons (e.g. NaN) yields None.
    The thresholds must be strictly increasing (checked with an assert).
    """
    assert threshold_medium < threshold_high
    if score < threshold_medium:
        return "low attention"
    if threshold_medium <= score < threshold_high:
        return "medium attention"
    if score >= threshold_high:
        return "high attention"
    return None

## OLS with SJR without attention categories

In [46]:
# Controls that enter every model
default_fids = [
    "yearofretraction", "gender", "contributionrank", "venue",
    "reasons", "field", "affrank", "numauthors", "impactfactor",
]

# Experience measures are highly correlated, so each model uses exactly one of them
alternating_fids = ["age", "papers", "logcitations", "collaborators"]

# Attention enters as the raw Altmetric score
attention_exp = "AltmetricScoreAtRetraction"


# One fitted model per experience measure, in the order of alternating_fids
ests = []
for alt_fid in alternating_fids:
    est = regress_all_together('ols-wSJR', df, attention_exp, [*default_fids, alt_fid])
    ests.append(est)

tables = Stargazer(ests)

tables.significance_levels([0.05, 0.01, 0.001])

# Display labels for the model's term names
renaming_dict = {
    "AcademicAgeAtRetraction": "Academic Age",
    "AltmetricScoreAtRetraction": "Attention",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]": "Female",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]": "Author order: Middle Author",
    "C(MAGJournalType, Treatment(reference='conference'))[T.journal]": "Venue: Journal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]": "Reason: Plagiarism",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]": "Reason: Other",
    "Intercept": "Constant",
    "LogMAGCumCitationsAtRetraction": "Log(Citations)",
    "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
    "MAGCumPapersAtRetraction": "Papers",
    "RetractionYear": "Retraction Year",
    "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
    "SJRQuartileRetractedPaperYear": "Journal/Conference Rank",
    "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper",
}

tables.rename_covariates(renaming_dict)

# Rows of the rendered table, top to bottom; the Field_* terms are not listed here
order = [
    "AltmetricScoreAtRetraction",
    "AcademicAgeAtRetraction",
    "MAGCumPapersAtRetraction",
    "LogMAGCumCitationsAtRetraction",
    "LogMAGCumCollaboratorsAtRetraction",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]",
    "MAGRetractionYearAffRankOrdinal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
    "NumAuthorsInRetractedPaper",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
    "RetractionYear",
    "C(MAGJournalType, Treatment(reference='conference'))[T.journal]",
    "SJRQuartileRetractedPaperYear",
    "Intercept",
]

tables.covariate_order(order)

tables

AttritionNew
AttritionNew
AttritionNew
AttritionNew


0,1,2,3,4
,,,,
,Dependent variable:AttritionNew,Dependent variable:AttritionNew,Dependent variable:AttritionNew,Dependent variable:AttritionNew
,,,,
,(1),(2),(3),(4)
,,,,
Attention,0.000**,0.000**,0.000***,0.000*
,(0.000),(0.000),(0.000),(0.000)
Academic Age,-0.008***,,,
,(0.000),,,
Papers,,-0.001***,,


## OLS with SJR, 2 attention categories and all experience fields

In [47]:
# These fields are the ones that are used in all the models
# (here all four experience measures enter the same model)
default_fids = ["yearofretraction", "gender", "contributionrank", "venue", 
             "reasons", "field", "affrank", "numauthors", "impactfactor",
               "age", "papers", "logcitations", "collaborators"]


HIGH_THRESHOLD = 21

# Let us first discretize attention based on the threshold.
# The label depends on a single column, so it is computed with Series.apply rather
# than a row-wise DataFrame.apply; assign() returns a new frame instead of writing
# into a frame derived from a slice, so re-running the cell is safe.
df = df.assign(
    DiscretizedAltmetricScoreAtRetraction=df['AltmetricScoreAtRetraction'].apply(
        discretize_attention_binary, threshold_high=HIGH_THRESHOLD)
)

# Defining the expression for attention ('low attention' is the reference level)
attention_exp = "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))"


# Saving all the models (a single model in this section)
ests = []
est = regress_all_together('ols-wSJR', df, attention_exp, default_fids)
ests.append(est)

tables = Stargazer(ests)

tables.significance_levels([0.05, 0.01, 0.001])

# Display labels for the model's term names
renaming_dict = {'AcademicAgeAtRetraction': 'Academic Age',
                "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]": "High Attention  ($>20$ Altmetric score)",
                "C(GenderizeGender, Treatment(reference='male'))[T.female]":"Female",
                "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]": "Author order: Middle Author",
                "C(MAGJournalType, Treatment(reference='conference'))[T.journal]":"Venue: Journal",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]":"Reason: Plagiarism",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]":"Reason: Other",
                "Intercept":"Constant",
                "LogMAGCumCitationsAtRetraction": "Log(Citations)",
                "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
                "MAGCumPapersAtRetraction": "Papers",
                "RetractionYear": "Retraction Year",
                "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
                "SJRQuartileRetractedPaperYear": "Journal/Conference Rank",
                "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper"
                }
                    
tables.rename_covariates(renaming_dict)

# Rows of the rendered table, top to bottom; the Field_* terms are not listed here
order = ["C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]",
         "AcademicAgeAtRetraction",
         "MAGCumPapersAtRetraction",
         "LogMAGCumCitationsAtRetraction",
         "LogMAGCumCollaboratorsAtRetraction",
         "C(GenderizeGender, Treatment(reference='male'))[T.female]",
         "MAGRetractionYearAffRankOrdinal",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
         "NumAuthorsInRetractedPaper",
         "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
         "RetractionYear",
         "C(MAGJournalType, Treatment(reference='conference'))[T.journal]",
         "SJRQuartileRetractedPaperYear",
         "Intercept"]

tables.covariate_order(order)

tables

AttritionNew


0,1
,
,Dependent variable:AttritionNew
,
,(1)
,
High Attention ($>20$ Altmetric score),0.038
,(0.023)
Academic Age,0.002***
,(0.001)
Papers,0.000***


## OLS with SJR and 2 attention categories

In [48]:
# These fields are the ones that are used in all the models
default_fids = ["yearofretraction", "gender", "contributionrank", "venue", 
             "reasons", "field", "affrank", "numauthors", "impactfactor",]

# These are the fields that are highly correlated and will be used one at a time
alternating_fids = ["age", "papers", "logcitations", "collaborators"]

HIGH_THRESHOLD = 21

# Let us first discretize attention based on the threshold.
# The label depends on a single column, so it is computed with Series.apply rather
# than a row-wise DataFrame.apply; assign() returns a new frame instead of writing
# into a frame derived from a slice, so re-running the cell is safe.
df = df.assign(
    DiscretizedAltmetricScoreAtRetraction=df['AltmetricScoreAtRetraction'].apply(
        discretize_attention_binary, threshold_high=HIGH_THRESHOLD)
)

# Defining the expression for attention ('low attention' is the reference level)
attention_exp = "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))"


# Saving all the models: one per experience measure, in the order of alternating_fids
ests = []
for alt_fid in alternating_fids:
    # Finally regressing
    est = regress_all_together('ols-wSJR', df, attention_exp, default_fids + [alt_fid])
    ests.append(est)

tables = Stargazer(ests)

tables.significance_levels([0.05, 0.01, 0.001])

# Display labels for the model's term names
renaming_dict = {'AcademicAgeAtRetraction': 'Academic Age',
                "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]": "High Attention  ($>20$ Altmetric score)",
                "C(GenderizeGender, Treatment(reference='male'))[T.female]":"Female",
                "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]": "Author order: Middle Author",
                "C(MAGJournalType, Treatment(reference='conference'))[T.journal]":"Venue: Journal",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]":"Reason: Plagiarism",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]":"Reason: Other",
                "Intercept":"Constant",
                "LogMAGCumCitationsAtRetraction": "Log(Citations)",
                "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
                "MAGCumPapersAtRetraction": "Papers",
                "RetractionYear": "Retraction Year",
                "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
                "SJRQuartileRetractedPaperYear": "Journal/Conference Rank",
                "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper"
                }
                    
tables.rename_covariates(renaming_dict)

# Rows of the rendered table, top to bottom; the Field_* terms are not listed here
order = ["C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]",
         "AcademicAgeAtRetraction",
         "MAGCumPapersAtRetraction",
         "LogMAGCumCitationsAtRetraction",
         "LogMAGCumCollaboratorsAtRetraction",
         "C(GenderizeGender, Treatment(reference='male'))[T.female]",
         "MAGRetractionYearAffRankOrdinal",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
         "NumAuthorsInRetractedPaper",
         "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
         "RetractionYear",
         "C(MAGJournalType, Treatment(reference='conference'))[T.journal]",
         "SJRQuartileRetractedPaperYear",
         "Intercept"]

tables.covariate_order(order)

tables

AttritionNew
AttritionNew
AttritionNew
AttritionNew


0,1,2,3,4
,,,,
,Dependent variable:AttritionNew,Dependent variable:AttritionNew,Dependent variable:AttritionNew,Dependent variable:AttritionNew
,,,,
,(1),(2),(3),(4)
,,,,
High Attention ($>20$ Altmetric score),0.032,0.021,0.062*,0.035
,(0.026),(0.026),(0.027),(0.023)
Academic Age,-0.008***,,,
,(0.000),,,
Papers,,-0.001***,,


In [56]:
# Now we shall hard-code some things for pretty-fying the latex table.
# NOTE: this cell renders whatever Stargazer object is currently bound to `tables`,
# i.e. the one built by the most recently executed table cell. Run it directly after
# the cell that builds the intended table.

caption = ("\\textbf{Complete linear probability models of attrition.} "
           "Models differ in how authors' experience is measured using "
           "(1) academic age, (2) number of papers by the time of retraction, "
           "(3) logged number of citations by the time of retraction, and "
           "(4) logged number of collaborators by the time of retraction, respectively. "
           "Controls for author's scientific discipline are included as categorical variables, but are not shown. ")

label = 'supplementarytab:ols_wSJR_2attention'

# Literal find -> replace pairs applied to the rendered LaTeX, in this order.
# NOTE(review): str.replace is a silent no-op when a key does not occur in the
# rendered text -- verify each key against the output of tables.render_latex().
formatting_dict = {'\\textit{Note:} & \\multicolumn{4}{r}{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001} \\':
                 '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}',
                  
                  '\\begin{table}[!htbp] \\centering': 
                  '\\begin{table}[H]\n{\\fontsize{10.0}{10.0}\\selectfont{\n\\caption{'+caption+'}\n'+\
                   '\\label{'+label+'}\n\\begin{center}',
                  '\\end{tabular}\n':'\\end{tabular}\n\\end{center}\n}}\n',
                  '\\multicolumn{4}{c}{\\textit{Dependent variable:}} \\\n\\cr \\cline{4-5}\n':
                  '\\multicolumn{4}{c}{\\textit{Dependent variable: Attrition}} \\\n\\cr \\cline{2-5}\\\\[-1.8ex]\n',
                   '\\begin{tabular}{@{\\extracolsep{5pt}}lcccc}\n':
                   '\\begin{tabular}{@{\\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}\n',
                   '\\\\[-1.8ex] & (1) & (2) & (3) & (4) \\':
                   '& \\multicolumn{1}{c}{\\hspace{10pt}(1)} & \\multicolumn{1}{c}{\\hspace{10pt}(2)} & '\
                       '\\multicolumn{1}{c}{\\hspace{10pt}(3)} & \\multicolumn{1}{c}{\\hspace{10pt}(4)} \\'
                  }


latex = tables.render_latex()


for to_replace, replace_with in formatting_dict.items():
    latex = latex.replace(to_replace, replace_with)

# define the regular expression to match the dollar signs ($) in exponent notation
regex_remove_dollar = r"\$(\^\{.*?\})\$"

# replace all matches of the regular expression with the matched exponent notation, without the dollar signs
latex = re.sub(regex_remove_dollar, r"\1", latex)

# define the regular expression to match the pattern '(df = some_number; some_number)'
regex_remove_df = r'\(df = [\d\.]+; [\d\.]+\)'

latex = re.sub(regex_remove_df, '', latex)

# Removing Residual errors
regex_remove_resErr = r'Residual Std. Error.*\\\\\n'

latex = re.sub(regex_remove_resErr, '', latex)


# Replacing the p-value line: the dollar-stripping step above also hit the
# significance legend, so its math delimiters are put back here
to_replace = '\\textit{^{*}p$<$0.05; ^{**}p$<$0.01; ^{***}p$<$0.001}'
replace_with = '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}'
latex = latex.replace(to_replace, replace_with)

# Order observation column
regex_center_numObs = r'(Observations & )(\d+ )(\& \d+ )(\& \d+ )(\& \d+)'


def _center_observation_counts(match):
    """Wrap each of the four observation counts of one match in a centred \\multicolumn cell."""
    cell_open = "\\multicolumn{1}{c}{\\hspace{10pt}"
    cells = [
        cell_open + match.group(2) + "}",
        cell_open + match.group(3).strip("& ") + "}",
        cell_open + match.group(4).strip("& ") + "}",
        cell_open + match.group(5).strip("& \\") + "}",
    ]
    return match.group(1) + " & ".join(cells)


# A replacement function formats every match from its own groups; a single
# precomputed replacement string would stamp the last match's numbers onto all matches.
latex = re.sub(regex_center_numObs, _center_observation_counts, latex)

print(latex)




\begin{table}[H]
{\fontsize{10.0}{10.0}\selectfont{
\caption{\textbf{Complete linear probability models of attrition.} Models differ in how authors' experience is measured using (1) academic age, (2) number of papers by the time of retraction, (3) logged number of citations by the the time of retraction, and (4) logged number of collaborators by the time of retraction, respectively. Controls for author's scientific discipline are included as categorical variables, but are not shown. }
\label{supplementarytab:ols_wSJR_2attention}
\begin{center}
\begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{4}{c}{\textit{Dependent variable: Attrition}} \
\cr \cline{2-5}\\[-1.8ex]
& \multicolumn{1}{c}{\hspace{10pt}(1)} & \multicolumn{1}{c}{\hspace{10pt}(2)} & \multicolumn{1}{c}{\hspace{10pt}(3)} & \multicolumn{1}{c}{\hspace{10pt}(4)} \\
\hline \\[-1.8ex]
 High Attention  ($>20$ Altmetric score) & 0.067^{*} & 0.054^{*

## OLS with SJR with 2 attention categories (post 2005)

In [57]:
# These fields are the ones that are used in all the models
default_fids = ["yearofretraction", "gender", "contributionrank", "venue", 
             "reasons", "field", "affrank", "numauthors", "impactfactor",]

# These are the fields that are highly correlated and will be used one at a time
alternating_fids = ["age", "papers", "logcitations", "collaborators"]

HIGH_THRESHOLD = 21

# Let us first discretize attention based on the threshold.
# The label depends on a single column, so it is computed with Series.apply rather
# than a row-wise DataFrame.apply; assign() returns a new frame instead of writing
# into a frame derived from a slice, so re-running the cell is safe.
df = df.assign(
    DiscretizedAltmetricScoreAtRetraction=df['AltmetricScoreAtRetraction'].apply(
        discretize_attention_binary, threshold_high=HIGH_THRESHOLD)
)

# Restrict to retractions from 2005 onwards (2005 included) and report the number of distinct authors
dftemp = df[df.RetractionYear.ge(2005)]
print(dftemp['MAGAID'].nunique())

# Defining the expression for attention ('low attention' is the reference level)
attention_exp = "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))"


# Saving all the models: one per experience measure, in the order of alternating_fids
ests = []
for alt_fid in alternating_fids:
    # Finally regressing
    est = regress_all_together('ols-wSJR', dftemp, attention_exp, default_fids + [alt_fid])
    ests.append(est)

tables = Stargazer(ests)

tables.significance_levels([0.05, 0.01, 0.001])

# Display labels for the model's term names
renaming_dict = {'AcademicAgeAtRetraction': 'Academic Age',
                "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]": "High Attention  ($>20$ Altmetric score)",
                "C(GenderizeGender, Treatment(reference='male'))[T.female]":"Female",
                "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]": "Author order: Middle Author",
                "C(MAGJournalType, Treatment(reference='conference'))[T.journal]":"Venue: Journal",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]":"Reason: Plagiarism",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]":"Reason: Other",
                "Intercept":"Constant",
                "LogMAGCumCitationsAtRetraction": "Log(Citations)",
                "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
                "MAGCumPapersAtRetraction": "Papers",
                "RetractionYear": "Retraction Year",
                "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
                "SJRQuartileRetractedPaperYear": "Journal/Conference Rank",
                "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper"
                }
                    
tables.rename_covariates(renaming_dict)

# Rows of the rendered table, top to bottom; the Field_* terms are not listed here
order = ["C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]",
         "AcademicAgeAtRetraction",
         "MAGCumPapersAtRetraction",
         "LogMAGCumCitationsAtRetraction",
         "LogMAGCumCollaboratorsAtRetraction",
         "C(GenderizeGender, Treatment(reference='male'))[T.female]",
         "MAGRetractionYearAffRankOrdinal",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
         "NumAuthorsInRetractedPaper",
         "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
         "RetractionYear",
         "C(MAGJournalType, Treatment(reference='conference'))[T.journal]",
         "SJRQuartileRetractedPaperYear",
         "Intercept"]

tables.covariate_order(order)

tables

13382
Attrition
Attrition
Attrition
Attrition


0,1,2,3,4
,,,,
,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition
,,,,
,(1),(2),(3),(4)
,,,,
High Attention ($>20$ Altmetric score),0.067*,0.053,0.099***,0.063**
,(0.027),(0.027),(0.027),(0.024)
Academic Age,-0.008***,,,
,(0.000),,,
Papers,,-0.001***,,


In [58]:
# Now we shall hard-code some things for pretty-fying the latex table

# Caption and label of the 2005-onwards table. They are defined once here: an earlier
# draft caption that was immediately overwritten, and a verbatim repeat of the whole
# caption/label block, have been removed.
caption = ("\\textbf{Linear probability models of attrition for retractions between 2005-2015.} "
           "Models differ in how authors' experience is measured using "
           "(1) academic age, (2) number of papers by the time of retraction, "
           "(3) logged number of citations by the time of retraction, and "
           "(4) logged number of collaborators by the time of retraction, respectively. "
           "Controls for author's scientific discipline are included as categorical variables, but are not shown. ")

label = 'supplementarytab:ols_wSJR_2attention_2005on'

formatting_dict = {'\\textit{Note:} & \\multicolumn{4}{r}{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001} \\':
                 '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}',
                  
                  '\\begin{table}[!htbp] \\centering': 
                  '\\begin{table}[H]\n{\\fontsize{10.0}{10.0}\\selectfont{\n\\caption{'+caption+'}\n'+\
                   '\\label{'+label+'}\n\\begin{center}',
                  '\\end{tabular}\n':'\\end{tabular}\n\\end{center}\n}}\n',
                  '\\multicolumn{4}{c}{\\textit{Dependent variable:}} \\\n\\cr \\cline{4-5}\n':
                  '\\multicolumn{4}{c}{\\textit{Dependent variable: Attrition}} \\\n\\cr \\cline{2-5}\\\[-1.8ex]\n',
                   '\\begin{tabular}{@{\\extracolsep{5pt}}lcccc}\n':
                   '\\begin{tabular}{@{\\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}\n',
                   '\\\[-1.8ex] & (1) & (2) & (3) & (4) \\':
                   '& \\multicolumn{1}{c}{\\hspace{10pt}(1)} & \\multicolumn{1}{c}{\\hspace{10pt}(2)} & '\
                       '\\multicolumn{1}{c}{\\hspace{10pt}(3)} & \\multicolumn{1}{c}{\\hspace{10pt}(4)} \\'
                  }


latex = tables.render_latex()


for to_replace, replace_with in formatting_dict.items():
    latex = latex.replace(to_replace, replace_with)

# define the regular expression to match the dollar signs ($) in exponent notation
regex_remove_dollar = r"\$(\^\{.*?\})\$"

# replace all matches of the regular expression with the matched exponent notation, without the dollar signs
latex = re.sub(regex_remove_dollar, r"\1", latex)

# define the regular expression to match the pattern '(df = some_number)'
regex_remove_df = r'\(df = [\d\.]+; [\d\.]+\)'

latex = re.sub(regex_remove_df, '', latex)

# Removing Residual errors
regex_remove_resErr = r'Residual Std. Error.*\\\\\n'

latex = re.sub(regex_remove_resErr, '', latex)


# Replacing the p-value line
to_replace = '\\textit{^{*}p$<$0.05; ^{**}p$<$0.01; ^{***}p$<$0.001}'
replace_with = '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}'
latex = latex.replace(to_replace, replace_with)

# Order observation column
regex_center_numObs = r'(Observations & )(\d+ )(\& \d+ )(\& \d+ )(\& \d+)'

matches = re.findall(regex_center_numObs, latex)
replacement_numObs = ''

# iterate through each match and replace the numbers with the required format
for match in matches:
    a = "\\\multicolumn{1}{c}{\\\hspace{10pt}" + match[1] + "} & "
    b = "\\\multicolumn{1}{c}{\\\hspace{10pt}" + match[2].strip("& ") + "} & "
    c = "\\\multicolumn{1}{c}{\\\hspace{10pt}" + match[3].strip("& ") + "} & "
    d = "\\\multicolumn{1}{c}{\\\hspace{10pt}" + match[4].strip("& \\\\") + "}"
    replacement_numObs = match[0] + a + b + c + d

# print(replacement_numObs)
    
latex = re.sub(regex_center_numObs, replacement_numObs, latex)    
    
print(latex)

\begin{table}[H]
{\fontsize{10.0}{10.0}\selectfont{
\caption{\textbf{Linear probability models of attrition for retractions between 2005-2015.} Models differ in how authors' experience is measured using (1) academic age, (2) number of papers by the time of retraction, (3) logged number of citations by the the time of retraction, and (4) logged number of collaborators by the time of retraction, respectively. Controls for author's scientific discipline are included as categorical variables, but are not shown. }
\label{supplementarytab:ols_wSJR_2attention_2005on}
\begin{center}
\begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{4}{c}{\textit{Dependent variable: Attrition}} \
\cr \cline{2-5}\\[-1.8ex]
& \multicolumn{1}{c}{\hspace{10pt}(1)} & \multicolumn{1}{c}{\hspace{10pt}(2)} & \multicolumn{1}{c}{\hspace{10pt}(3)} & \multicolumn{1}{c}{\hspace{10pt}(4)} \\
\hline \\[-1.8ex]
 High Attention  ($>20$ Altmetr

## Logit with SJR and 2 attention categories

In [59]:
# Controls used in every model of this section
# ("impactfactor" is presumably the SJR journal/conference rank control -- confirm in regress_all_together)
default_fids = ["yearofretraction", "gender", "contributionrank", "venue",
                "reasons", "field", "affrank", "numauthors", "impactfactor"]

# These experience measures are highly correlated, so each model uses exactly one of them
alternating_fids = ["age", "papers", "logcitations", "collaborators"]

# NOTE(review): the covariate label below reads ">20" while the threshold is 21 --
# confirm how discretize_attention_binary compares the score against threshold_high.
HIGH_THRESHOLD = 21

# Discretize attention based on the threshold. Only the score column is needed, so the
# helper is applied to that column directly instead of building a row object per record.
df['DiscretizedAltmetricScoreAtRetraction'] = df['AltmetricScoreAtRetraction'].apply(
    lambda score: discretize_attention_binary(score, threshold_high=HIGH_THRESHOLD))

# Formula term for attention, with 'low attention' as the reference category
attention_exp = "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))"

# Fit one model per experience measure and keep them in model order (1)-(4)
ests = []
for alt_fid in alternating_fids:
    est = regress_all_together('logit-wSJR', df, attention_exp, default_fids + [alt_fid])
    ests.append(est)

tables = Stargazer(ests)

# Stars at p < 0.05, 0.01 and 0.001
tables.significance_levels([0.05, 0.01, 0.001])

# Human-readable names for the covariates shown in the table
renaming_dict = {
    'AcademicAgeAtRetraction': 'Academic Age',
    "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]":
        "High Attention  ($>20$ Altmetric score)",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]": "Female",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]":
        "Author order: Middle Author",
    "C(MAGJournalType, Treatment(reference='conference'))[T.journal]": "Venue: Journal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]": "Reason: Plagiarism",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]": "Reason: Other",
    "Intercept": "Constant",
    "LogMAGCumCitationsAtRetraction": "Log(Citations)",
    "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
    "MAGCumPapersAtRetraction": "Papers",
    "RetractionYear": "Retraction Year",
    "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
    "SJRQuartileRetractedPaperYear": "Journal/Conference Rank",
    "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper",
}

tables.rename_covariates(renaming_dict)

# Row order of the table: attention first, then the experience measures, then the
# remaining listed controls, with the intercept last
order = [
    "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]",
    "AcademicAgeAtRetraction",
    "MAGCumPapersAtRetraction",
    "LogMAGCumCitationsAtRetraction",
    "LogMAGCumCollaboratorsAtRetraction",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]",
    "MAGRetractionYearAffRankOrdinal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
    "NumAuthorsInRetractedPaper",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
    "RetractionYear",
    "C(MAGJournalType, Treatment(reference='conference'))[T.journal]",
    "SJRQuartileRetractedPaperYear",
    "Intercept",
]

tables.covariate_order(order)

tables

Attrition
Optimization terminated successfully.
         Current function value: 0.408946
         Iterations 8
Attrition
Optimization terminated successfully.
         Current function value: 0.380238
         Iterations 10
Attrition
Optimization terminated successfully.
         Current function value: 0.393389
         Iterations 7
Attrition
Optimization terminated successfully.
         Current function value: 0.353924
         Iterations 8


0,1,2,3,4
,,,,
,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition
,,,,
,(1),(2),(3),(4)
,,,,
High Attention ($>20$ Altmetric score),0.623**,0.524**,0.947***,0.678***
,(0.193),(0.192),(0.194),(0.191)
Academic Age,-0.112***,,,
,(0.008),,,
Papers,,-0.064***,,


In [77]:
# Report the log-likelihood (.llf) of the four fitted models, in model order (1)-(4)
log_likelihoods = [ests[model_idx].llf for model_idx in range(4)]
print("Log-Likelihood", *log_likelihoods)

Log-Likelihood -4374.899178689412 -4067.782336516665 -4208.473078942602 -3786.2758099226457


In [14]:
# Post-process the Stargazer LaTeX for the logit (with SJR) table: the caption, label and
# literal substitutions below are hard-coded for this specific four-model table.

# NOTE(review): the caption text contains "the the" (citations clause); left unchanged here.
caption = ("\\textbf{Logistic regression models of attrition.} "
           "Models differ in how authors' experience is measured using "
           "(1) academic age, (2) number of papers by the time of retraction, "
           "(3) logged number of citations by the the time of retraction, and "
           "(4) logged number of collaborators by the time of retraction, respectively. "
           "Controls for author's scientific discipline are included as categorical variables, but are not shown. ")

label = 'supplementarytab:logit_wSJR_2attention'

# Literal "find -> replace" pairs, applied to the rendered table in insertion order.
formatting_dict = {
    # Significance note: drop the "Note:" cell and the multicolumn wrapper
    '\\textit{Note:} & \\multicolumn{4}{r}{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001} \\':
        '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}',

    # Table opening: [H] placement, font size, caption, label and a center environment
    '\\begin{table}[!htbp] \\centering':
        ('\\begin{table}[H]\n{\\fontsize{10.0}{10.0}\\selectfont{\n\\caption{' + caption + '}\n' +
         '\\label{' + label + '}\n\\begin{center}'),

    # Close the center environment and the two font-size braces opened above
    '\\end{tabular}\n': '\\end{tabular}\n\\end{center}\n}}\n',

    # Dependent-variable header line
    '\\multicolumn{4}{c}{\\textit{Dependent variable:}} \\\n\\cr \\cline{4-5}\n':
        '\\multicolumn{4}{c}{\\textit{Dependent variable: Attrition}} \\\n\\cr \\cline{2-5}\\\\[-1.8ex]\n',

    # Column specification: D{.}{.}{-3} columns instead of plain centred ones
    '\\begin{tabular}{@{\\extracolsep{5pt}}lcccc}\n':
        '\\begin{tabular}{@{\\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}\n',

    # Model-number header row: centre each "(k)" with a multicolumn cell
    '\\\\[-1.8ex] & (1) & (2) & (3) & (4) \\':
        ('& \\multicolumn{1}{c}{\\hspace{10pt}(1)} & \\multicolumn{1}{c}{\\hspace{10pt}(2)} & '
         '\\multicolumn{1}{c}{\\hspace{10pt}(3)} & \\multicolumn{1}{c}{\\hspace{10pt}(4)} \\'),
}

latex = tables.render_latex()

for to_replace, replace_with in formatting_dict.items():
    latex = latex.replace(to_replace, replace_with)

# Strip the dollar signs wrapped around superscripts, e.g. $^{**}$ -> ^{**}
# (presumably because the D columns already typeset their content in math mode -- confirm)
regex_remove_dollar = r"\$(\^\{.*?\})\$"
latex = re.sub(regex_remove_dollar, r"\1", latex)

# Drop the "(df = a; b)" annotations
regex_remove_df = r'\(df = [\d\.]+; [\d\.]+\)'
latex = re.sub(regex_remove_df, '', latex)

# Drop the whole "Residual Std. Error" row
regex_remove_resErr = r'Residual Std. Error.*\\\\\n'
latex = re.sub(regex_remove_resErr, '', latex)

# Drop the whole "F Statistic" row
regex_remove_Fstat = r'F Statistic.*\\\\\n'
latex = re.sub(regex_remove_Fstat, '', latex)

# The dollar-stripping step above also hits the significance note, so restore its dollar signs
to_replace = '\\textit{^{*}p$<$0.05; ^{**}p$<$0.01; ^{***}p$<$0.001}'
replace_with = '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}'
latex = latex.replace(to_replace, replace_with)

# Centre the per-model observation counts
regex_center_numObs = r'(Observations & )(\d+ )(\& \d+ )(\& \d+ )(\& \d+)'


def _center_observation_counts(obs_match):
    """Return the Observations row with each of the four counts in a centred multicolumn cell.

    `obs_match` is a match of `regex_center_numObs`. The function is used as the
    replacement callable of `re.sub`, so every matching row is rebuilt from its own
    numbers and the returned text is inserted literally (no backslash processing).
    """
    row_label, count_1, count_2, count_3, count_4 = obs_match.groups()
    counts = [count_1,  # keeps the trailing space captured by the regex
              count_2.strip("& "),
              count_3.strip("& "),
              count_4.strip("& \\\\")]
    cells = ["\\multicolumn{1}{c}{\\hspace{10pt}" + count + "}" for count in counts]
    return row_label + " & ".join(cells)


latex = re.sub(regex_center_numObs, _center_observation_counts, latex)

print(latex)



\begin{table}[H]
{\fontsize{10.0}{10.0}\selectfont{
\caption{\textbf{Logistic regression models of attrition.} Models differ in how authors' experience is measured using (1) academic age, (2) number of papers by the time of retraction, (3) logged number of citations by the the time of retraction, and (4) logged number of collaborators by the time of retraction, respectively. Controls for author's scientific discipline are included as categorical variables, but are not shown. }
\label{supplementarytab:logit_wSJR_2attention}
\begin{center}
\begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{4}{c}{\textit{Dependent variable: Attrition}} \
\cr \cline{2-5}\\[-1.8ex]
& \multicolumn{1}{c}{\hspace{10pt}(1)} & \multicolumn{1}{c}{\hspace{10pt}(2)} & \multicolumn{1}{c}{\hspace{10pt}(3)} & \multicolumn{1}{c}{\hspace{10pt}(4)} \\
\hline \\[-1.8ex]
 High Attention  ($>20$ Altmetric score) & 0.623^{**} & 0.524^{**} & 

## OLS w/o SJR and 2 attention categories

In [78]:
# Controls used in every model of this section ("impactfactor" is not in this list)
default_fids = ["yearofretraction", "gender", "contributionrank", "venue",
                "reasons", "field", "affrank", "numauthors"]

# These experience measures are highly correlated, so each model uses exactly one of them
alternating_fids = ["age", "papers", "logcitations", "collaborators"]

# NOTE(review): the covariate label below reads ">20" while the threshold is 21 --
# confirm how discretize_attention_binary compares the score against threshold_high.
HIGH_THRESHOLD = 21

# Discretize attention based on the threshold. Only the score column is needed, so the
# helper is applied to that column directly instead of building a row object per record.
df['DiscretizedAltmetricScoreAtRetraction'] = df['AltmetricScoreAtRetraction'].apply(
    lambda score: discretize_attention_binary(score, threshold_high=HIGH_THRESHOLD))

# Formula term for attention, with 'low attention' as the reference category
attention_exp = "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))"

# Fit one model per experience measure and keep them in model order (1)-(4)
ests = []
for alt_fid in alternating_fids:
    est = regress_all_together('ols-noSJR', df, attention_exp, default_fids + [alt_fid])
    ests.append(est)

tables = Stargazer(ests)

# Stars at p < 0.05, 0.01 and 0.001
tables.significance_levels([0.05, 0.01, 0.001])

# Human-readable names for the covariates shown in the table
renaming_dict = {
    'AcademicAgeAtRetraction': 'Academic Age',
    "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]":
        "High Attention  ($>20$ Altmetric score)",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]": "Female",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]":
        "Author order: Middle Author",
    "C(MAGJournalType, Treatment(reference='conference'))[T.journal]": "Venue: Journal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]": "Reason: Plagiarism",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]": "Reason: Other",
    "Intercept": "Constant",
    "LogMAGCumCitationsAtRetraction": "Log(Citations)",
    "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
    "MAGCumPapersAtRetraction": "Papers",
    "RetractionYear": "Retraction Year",
    "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
    "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper",
}

tables.rename_covariates(renaming_dict)

# Row order of the table: attention first, then the experience measures, then the
# remaining listed controls, with the intercept last
order = [
    "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]",
    "AcademicAgeAtRetraction",
    "MAGCumPapersAtRetraction",
    "LogMAGCumCitationsAtRetraction",
    "LogMAGCumCollaboratorsAtRetraction",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]",
    "MAGRetractionYearAffRankOrdinal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
    "NumAuthorsInRetractedPaper",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
    "RetractionYear",
    "C(MAGJournalType, Treatment(reference='conference'))[T.journal]",
    "Intercept",
]

tables.covariate_order(order)

tables

Attrition
Attrition
Attrition
Attrition


0,1,2,3,4
,,,,
,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition
,,,,
,(1),(2),(3),(4)
,,,,
High Attention ($>20$ Altmetric score),0.052*,0.035,0.088***,0.050*
,(0.025),(0.026),(0.025),(0.023)
Academic Age,-0.008***,,,
,(0.000),,,
Papers,,-0.001***,,


In [79]:
# Post-process the Stargazer LaTeX for the OLS (without SJR) table: the caption, label and
# literal substitutions below are hard-coded for this specific four-model table.

# NOTE(review): the caption text contains "the the" (citations clause); left unchanged here.
caption = ("\\textbf{Linear probability models of attrition without controlling for journal/conference rank.} "
           "Models differ in how authors' experience is measured using "
           "(1) academic age, (2) number of papers by the time of retraction, "
           "(3) logged number of citations by the the time of retraction, and "
           "(4) logged number of collaborators by the time of retraction, respectively. "
           "Controls for author's scientific discipline are included as categorical variables, but are not shown. ")

label = 'supplementarytab:ols_noSJR_2attention'

# Literal "find -> replace" pairs, applied to the rendered table in insertion order.
formatting_dict = {
    # Significance note: drop the "Note:" cell and the multicolumn wrapper
    '\\textit{Note:} & \\multicolumn{4}{r}{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001} \\':
        '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}',

    # Table opening: [H] placement, font size, caption, label and a center environment
    '\\begin{table}[!htbp] \\centering':
        ('\\begin{table}[H]\n{\\fontsize{10.0}{10.0}\\selectfont{\n\\caption{' + caption + '}\n' +
         '\\label{' + label + '}\n\\begin{center}'),

    # Close the center environment and the two font-size braces opened above
    '\\end{tabular}\n': '\\end{tabular}\n\\end{center}\n}}\n',

    # Dependent-variable header line
    '\\multicolumn{4}{c}{\\textit{Dependent variable:}} \\\n\\cr \\cline{4-5}\n':
        '\\multicolumn{4}{c}{\\textit{Dependent variable: Attrition}} \\\n\\cr \\cline{2-5}\\\\[-1.8ex]\n',

    # Column specification: D{.}{.}{-3} columns instead of plain centred ones
    '\\begin{tabular}{@{\\extracolsep{5pt}}lcccc}\n':
        '\\begin{tabular}{@{\\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}\n',

    # Model-number header row: centre each "(k)" with a multicolumn cell
    '\\\\[-1.8ex] & (1) & (2) & (3) & (4) \\':
        ('& \\multicolumn{1}{c}{\\hspace{10pt}(1)} & \\multicolumn{1}{c}{\\hspace{10pt}(2)} & '
         '\\multicolumn{1}{c}{\\hspace{10pt}(3)} & \\multicolumn{1}{c}{\\hspace{10pt}(4)} \\'),
}

latex = tables.render_latex()

for to_replace, replace_with in formatting_dict.items():
    latex = latex.replace(to_replace, replace_with)

# Strip the dollar signs wrapped around superscripts, e.g. $^{**}$ -> ^{**}
# (presumably because the D columns already typeset their content in math mode -- confirm)
regex_remove_dollar = r"\$(\^\{.*?\})\$"
latex = re.sub(regex_remove_dollar, r"\1", latex)

# Drop the "(df = a; b)" annotations
regex_remove_df = r'\(df = [\d\.]+; [\d\.]+\)'
latex = re.sub(regex_remove_df, '', latex)

# Drop the whole "Residual Std. Error" row
regex_remove_resErr = r'Residual Std. Error.*\\\\\n'
latex = re.sub(regex_remove_resErr, '', latex)

# Note: the "F Statistic" row is not removed in this cell.

# The dollar-stripping step above also hits the significance note, so restore its dollar signs
to_replace = '\\textit{^{*}p$<$0.05; ^{**}p$<$0.01; ^{***}p$<$0.001}'
replace_with = '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}'
latex = latex.replace(to_replace, replace_with)

# Centre the per-model observation counts
regex_center_numObs = r'(Observations & )(\d+ )(\& \d+ )(\& \d+ )(\& \d+)'


def _center_observation_counts(obs_match):
    """Return the Observations row with each of the four counts in a centred multicolumn cell.

    `obs_match` is a match of `regex_center_numObs`. The function is used as the
    replacement callable of `re.sub`, so every matching row is rebuilt from its own
    numbers and the returned text is inserted literally (no backslash processing).
    """
    row_label, count_1, count_2, count_3, count_4 = obs_match.groups()
    counts = [count_1,  # keeps the trailing space captured by the regex
              count_2.strip("& "),
              count_3.strip("& "),
              count_4.strip("& \\\\")]
    cells = ["\\multicolumn{1}{c}{\\hspace{10pt}" + count + "}" for count in counts]
    return row_label + " & ".join(cells)


latex = re.sub(regex_center_numObs, _center_observation_counts, latex)

print(latex)



\begin{table}[H]
{\fontsize{10.0}{10.0}\selectfont{
\caption{\textbf{Linear probability models of attrition without controlling for journal/conference rank.} Models differ in how authors' experience is measured using (1) academic age, (2) number of papers by the time of retraction, (3) logged number of citations by the the time of retraction, and (4) logged number of collaborators by the time of retraction, respectively. Controls for author's scientific discipline are included as categorical variables, but are not shown. }
\label{supplementarytab:ols_noSJR_2attention}
\begin{center}
\begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{4}{c}{\textit{Dependent variable: Attrition}} \
\cr \cline{2-5}\\[-1.8ex]
& \multicolumn{1}{c}{\hspace{10pt}(1)} & \multicolumn{1}{c}{\hspace{10pt}(2)} & \multicolumn{1}{c}{\hspace{10pt}(3)} & \multicolumn{1}{c}{\hspace{10pt}(4)} \\
\hline \\[-1.8ex]
 High Attention  ($>20$

## Logit w/o SJR and 2 attention categories

In [80]:
# Controls used in every model of this section ("impactfactor" is not in this list)
default_fids = ["yearofretraction", "gender", "contributionrank", "venue",
                "reasons", "field", "affrank", "numauthors"]

# These experience measures are highly correlated, so each model uses exactly one of them
alternating_fids = ["age", "papers", "logcitations", "collaborators"]

# NOTE(review): the covariate label below reads ">20" while the threshold is 21 --
# confirm how discretize_attention_binary compares the score against threshold_high.
HIGH_THRESHOLD = 21

# Discretize attention based on the threshold. Only the score column is needed, so the
# helper is applied to that column directly instead of building a row object per record.
df['DiscretizedAltmetricScoreAtRetraction'] = df['AltmetricScoreAtRetraction'].apply(
    lambda score: discretize_attention_binary(score, threshold_high=HIGH_THRESHOLD))

# Formula term for attention, with 'low attention' as the reference category
attention_exp = "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))"

# Fit one model per experience measure and keep them in model order (1)-(4)
ests = []
for alt_fid in alternating_fids:
    est = regress_all_together('logit-noSJR', df, attention_exp, default_fids + [alt_fid])
    ests.append(est)

tables = Stargazer(ests)

# Stars at p < 0.05, 0.01 and 0.001
tables.significance_levels([0.05, 0.01, 0.001])

# Human-readable names for the covariates shown in the table
renaming_dict = {
    'AcademicAgeAtRetraction': 'Academic Age',
    "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]":
        "High Attention  ($>20$ Altmetric score)",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]": "Female",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]":
        "Author order: Middle Author",
    "C(MAGJournalType, Treatment(reference='conference'))[T.journal]": "Venue: Journal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]": "Reason: Plagiarism",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]": "Reason: Other",
    "Intercept": "Constant",
    "LogMAGCumCitationsAtRetraction": "Log(Citations)",
    "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
    "MAGCumPapersAtRetraction": "Papers",
    "RetractionYear": "Retraction Year",
    "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
    "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper",
}

tables.rename_covariates(renaming_dict)

# Row order of the table: attention first, then the experience measures, then the
# remaining listed controls, with the intercept last
order = [
    "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]",
    "AcademicAgeAtRetraction",
    "MAGCumPapersAtRetraction",
    "LogMAGCumCitationsAtRetraction",
    "LogMAGCumCollaboratorsAtRetraction",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]",
    "MAGRetractionYearAffRankOrdinal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
    "NumAuthorsInRetractedPaper",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
    "RetractionYear",
    "C(MAGJournalType, Treatment(reference='conference'))[T.journal]",
    "Intercept",
]

tables.covariate_order(order)

tables

Attrition
Optimization terminated successfully.
         Current function value: 0.410580
         Iterations 8
Attrition
Optimization terminated successfully.
         Current function value: 0.382639
         Iterations 10
Attrition
Optimization terminated successfully.
         Current function value: 0.395190
         Iterations 7
Attrition
Optimization terminated successfully.
         Current function value: 0.355804
         Iterations 8


0,1,2,3,4
,,,,
,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition
,,,,
,(1),(2),(3),(4)
,,,,
High Attention ($>20$ Altmetric score),0.536**,0.430*,0.884***,0.564**
,(0.192),(0.192),(0.196),(0.189)
Academic Age,-0.115***,,,
,(0.006),,,
Papers,,-0.064***,,


In [81]:
# Now we shall hard-code some things for pretty-fying the latex table


caption = '\\textbf{Logistic regression coefficients with 2 Altmetric attention categories: '\
        'low ($score < 20$), '\
        'and high ($score \geq 20$) '\
        'using low attention as reference category}, and '\
        'attrition as a dependent variable for different '\
        'author experience fields: model (1) using academic age, '\
        '(2) using number of papers by the time of retraction, '\
        '(3) using logged number of citations by the the time of retraction, and '\
        '(4) using logged number of collaborators by the time of retraction. '\
        'Controls for discipline of study are included as binary variables, but are not shown. '\
        '\\textbf{Journal/Conference rank is not included as a confounder}.'

# Caption that is actually used for this table (this assignment replaces any earlier `caption`).
caption = (
    "\\textbf{Logistic regression models of attrition without controlling for journal/conference rank.} "
    "Models differ in how authors' experience is measured using "
    "(1) academic age, (2) number of papers by the time of retraction, "
    "(3) logged number of citations by the time of retraction, and "
    "(4) logged number of collaborators by the time of retraction, respectively. "
    "Controls for author's scientific discipline are included as categorical variables, but are not shown. "
)

label = 'supplementarytab:logit_noSJR_2attention'

# Literal find -> replace pairs applied to the rendered LaTeX, in insertion order.
# Every backslash is written as an explicit '\\' so that no entry relies on an
# unrecognised escape sequence such as '\['.
formatting_dict = {
    # Replace the "Note:" row by the bare significance legend.
    '\\textit{Note:} & \\multicolumn{4}{r}{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001} \\':
        '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}',

    # Table opener: [H] placement, fixed font size, caption, label, and a center environment.
    '\\begin{table}[!htbp] \\centering':
        '\\begin{table}[H]\n{\\fontsize{10.0}{10.0}\\selectfont{\n\\caption{' + caption + '}\n'
        + '\\label{' + label + '}\n\\begin{center}',

    # Close the center environment and the font-size group opened above.
    '\\end{tabular}\n': '\\end{tabular}\n\\end{center}\n}}\n',

    # Name the dependent variable and draw the rule under columns 2-5.
    '\\multicolumn{4}{c}{\\textit{Dependent variable:}} \\\n\\cr \\cline{4-5}\n':
        '\\multicolumn{4}{c}{\\textit{Dependent variable: Attrition}} \\\n\\cr \\cline{2-5}\\\\[-1.8ex]\n',

    # Swap the four centred columns for D{.}{.}{-3} columns
    # (presumably the dcolumn package's decimal-aligned column type - confirm in the preamble).
    '\\begin{tabular}{@{\\extracolsep{5pt}}lcccc}\n':
        '\\begin{tabular}{@{\\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}\n',

    # Centre the model numbers. Key and value both end in a single backslash, so
    # whatever follows the key in the rendered text is left in place.
    '\\\\[-1.8ex] & (1) & (2) & (3) & (4) \\':
        '& \\multicolumn{1}{c}{\\hspace{10pt}(1)} & \\multicolumn{1}{c}{\\hspace{10pt}(2)} & '
        '\\multicolumn{1}{c}{\\hspace{10pt}(3)} & \\multicolumn{1}{c}{\\hspace{10pt}(4)} \\',
}


latex = tables.render_latex()

# Apply the literal substitutions from formatting_dict, in insertion order.
for to_replace, replace_with in formatting_dict.items():
    latex = latex.replace(to_replace, replace_with)

# Strip the math-mode dollars around superscripts: $^{...}$ -> ^{...}
regex_remove_dollar = r"\$(\^\{.*?\})\$"
latex = re.sub(regex_remove_dollar, r"\1", latex)

# Remove the '(df = a; b)' annotations
regex_remove_df = r'\(df = [\d\.]+; [\d\.]+\)'
latex = re.sub(regex_remove_df, '', latex)

# Remove the 'Residual Std. Error' row, up to and including its '\\' + newline
regex_remove_resErr = r'Residual Std. Error.*\\\\\n'
latex = re.sub(regex_remove_resErr, '', latex)

# Remove the 'F Statistic' row in the same way
regex_remove_Fstat = r'F Statistic.*\\\\\n'
latex = re.sub(regex_remove_Fstat, '', latex)

# The dollar-stripping step above also hits the significance legend; restore its dollars.
to_replace = '\\textit{^{*}p$<$0.05; ^{**}p$<$0.01; ^{***}p$<$0.001}'
replace_with = '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}'
latex = latex.replace(to_replace, replace_with)

# Centre the four observation counts by wrapping each one in \multicolumn{1}{c}{...}.
# Groups 2-5 capture the counts (group 2 keeps its trailing space); the replacement is a
# re.sub template, so '\\' yields one literal backslash and '\N' inserts group N.
regex_center_numObs = r'(Observations & )(\d+ )\& (\d+) \& (\d+) \& (\d+)'
centered_cell = r'\\multicolumn{1}{c}{\\hspace{10pt}%s}'
replacement_numObs = r'\1' + ' & '.join(
    centered_cell % group_ref for group_ref in (r'\2', r'\3', r'\4', r'\5')
)
latex = re.sub(regex_center_numObs, replacement_numObs, latex)

print(latex)



\begin{table}[H]
{\fontsize{10.0}{10.0}\selectfont{
\caption{\textbf{Logistic regression models of attrition without controlling for journal/conference rank.} Models differ in how authors' experience is measured using (1) academic age, (2) number of papers by the time of retraction, (3) logged number of citations by the the time of retraction, and (4) logged number of collaborators by the time of retraction, respectively. Controls for author's scientific discipline are included as categorical variables, but are not shown. }
\label{supplementarytab:logit_noSJR_2attention}
\begin{center}
\begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{4}{c}{\textit{Dependent variable: Attrition}} \
\cr \cline{2-5}\\[-1.8ex]
& \multicolumn{1}{c}{\hspace{10pt}(1)} & \multicolumn{1}{c}{\hspace{10pt}(2)} & \multicolumn{1}{c}{\hspace{10pt}(3)} & \multicolumn{1}{c}{\hspace{10pt}(4)} \\
\hline \\[-1.8ex]
 High Attention  ($>

## Robust OLS w/ SJR and 2 attention categories

In [82]:
# Reading the regression file
df_main = pd.read_csv("../../../../data/h4_altmetric/regression/RW_Authors_forRegression_rematching.csv")

# Dependent variable: here attrition is taken from the 'AttritedClassRobust' column
df_main['Attrition'] = df_main['AttritedClassRobust']

Yi = 'Attrition'

# Removing class that's neither attrited nor non-attrited (i.e. those that are attrited due to something else)
df_main = df_main[df_main.Attrition.isin([0,1])]

attention_agg_cols = ['AltmetricScoreAtRetraction',
                     'AggregateSocialMediaMentionsAtRetraction',
                     'AggregateNewsMediaMentionsAtRetraction',
                     'AggregateBlogsMentionsAtRetraction',
                     'AggregateKnowledgeRepositoriesMentionsAtRetraction']

# Discipline indicator columns, selected by name
field_cols = list(df_main.filter(regex=("Field_.*")).columns)

# Columns needed by the regressions, in the order they appear in `df`
regression_cols = (
    ['Record ID', 'MAGAID']
    + [Yi, 'AttritedClassRobust']
    + ['GenderizeGender',
       'AcademicAgeAtRetraction',
       'MAGCumPapersAtRetraction',
       'LogMAGCumCitationsAtRetraction',
       'LogMAGCumCollaboratorsAtRetraction']
    + field_cols
    + ['MAGAIDRankTypeInRetractedPaper',
       'RetractionYear',
       'ReasonPropagatedMajorityOfMajority',
       'MAGJournalType',
       'SJRQuartileRetractedPaperYear',
       'MAGRetractionYearAffRankOrdinal',
       'NumAuthorsInRetractedPaper']
    + attention_agg_cols
)

# Creating the relevant dataframe. The explicit .copy() makes `df` an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
df = df_main[regression_cols].drop_duplicates().copy()

df

Unnamed: 0,Record ID,MAGAID,Attrition,AttritedClassRobust,GenderizeGender,AcademicAgeAtRetraction,MAGCumPapersAtRetraction,LogMAGCumCitationsAtRetraction,LogMAGCumCollaboratorsAtRetraction,Field_ART,...,ReasonPropagatedMajorityOfMajority,MAGJournalType,SJRQuartileRetractedPaperYear,MAGRetractionYearAffRankOrdinal,NumAuthorsInRetractedPaper,AltmetricScoreAtRetraction,AggregateSocialMediaMentionsAtRetraction,AggregateNewsMediaMentionsAtRetraction,AggregateBlogsMentionsAtRetraction,AggregateKnowledgeRepositoriesMentionsAtRetraction
0,3031,2.111744e+09,0,0,male,35.0,166.0,8.491670,4.962845,0,...,mistake,journal,,13.0,2.0,0.00,0.0,0.0,0.0,0.0
1,3031,2.245003e+09,0,0,male,2.0,4.0,2.197225,0.693147,0,...,mistake,journal,,13.0,2.0,0.00,0.0,0.0,0.0,0.0
2,1082,2.120727e+09,0,0,male,5.0,32.0,3.258097,3.367296,0,...,mistake,journal,,350.0,7.0,0.00,0.0,0.0,0.0,0.0
3,1082,2.151686e+09,0,0,male,18.0,246.0,6.505784,5.347108,0,...,mistake,journal,,350.0,7.0,0.00,0.0,0.0,0.0,0.0
4,1082,2.552715e+09,0,0,male,6.0,41.0,4.317488,3.737670,0,...,mistake,journal,,350.0,7.0,0.00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34708,8314,1.979824e+09,0,0,male,4.0,10.0,3.871201,1.386294,0,...,mistake,journal,3.0,550.0,1.0,0.00,0.0,0.0,0.0,0.0
34710,2835,1.972149e+09,1,1,male,1.0,3.0,1.945910,1.791759,0,...,misconduct,journal,2.0,850.0,4.0,0.00,0.0,0.0,0.0,0.0
34715,16836,2.650217e+09,1,1,female,0.0,1.0,0.000000,0.693147,0,...,other,journal,1.0,175.0,2.0,2.75,11.0,0.0,0.0,0.0
34716,16836,2.690000e+09,1,1,female,0.0,1.0,0.000000,0.693147,0,...,other,journal,1.0,175.0,2.0,2.75,11.0,0.0,0.0,0.0


In [83]:
# These fields are the ones that are used in all the models
default_fids = ["yearofretraction", "gender", "contributionrank", "venue", 
             "reasons", "field", "affrank", "numauthors", "impactfactor"]

# These are the fields that are highly correlated and will be used one at a time
alternating_fids = ["age", "papers", "logcitations", "collaborators"]

HIGH_THRESHOLD = 21

# Let us first discretize attention based on the threshold.
# Only the Altmetric score is needed, so the helper is mapped over that single
# column instead of iterating over whole rows.
df['DiscretizedAltmetricScoreAtRetraction'] = df['AltmetricScoreAtRetraction'].apply(
    lambda score: discretize_attention_binary(score, threshold_high=HIGH_THRESHOLD)
)

# Defining the expression for attention ('low attention' is the reference level)
attention_exp = "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))"


# Fit one model per alternating experience field and keep them in model order
ests = []
for alt_fid in alternating_fids:
    est = regress_all_together('ols-wSJR', df, attention_exp, default_fids + [alt_fid])
    ests.append(est)

tables = Stargazer(ests)

# Star thresholds: * p<0.05, ** p<0.01, *** p<0.001
tables.significance_levels([0.05, 0.01, 0.001])

# Model term -> label shown in the table
renaming_dict = {
    'AcademicAgeAtRetraction': 'Academic Age',
    "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]": "High Attention  ($>20$ Altmetric score)",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]": "Female",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]": "Author order: Middle Author",
    "C(MAGJournalType, Treatment(reference='conference'))[T.journal]": "Venue: Journal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]": "Reason: Plagiarism",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]": "Reason: Other",
    "Intercept": "Constant",
    "LogMAGCumCitationsAtRetraction": "Log(Citations)",
    "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
    "MAGCumPapersAtRetraction": "Papers",
    "RetractionYear": "Retraction Year",
    "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
    "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper",
    "SJRQuartileRetractedPaperYear": "Journal/Conference Rank",
}

tables.rename_covariates(renaming_dict)

# Row order of the table: attention first, then the experience measures,
# then the remaining controls, intercept last.
order = [
    "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]",
    "AcademicAgeAtRetraction",
    "MAGCumPapersAtRetraction",
    "LogMAGCumCitationsAtRetraction",
    "LogMAGCumCollaboratorsAtRetraction",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]",
    "MAGRetractionYearAffRankOrdinal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
    "NumAuthorsInRetractedPaper",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
    "RetractionYear",
    "C(MAGJournalType, Treatment(reference='conference'))[T.journal]",
    "SJRQuartileRetractedPaperYear",
    "Intercept",
]

tables.covariate_order(order)

tables

Attrition
Attrition
Attrition
Attrition


0,1,2,3,4
,,,,
,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition
,,,,
,(1),(2),(3),(4)
,,,,
High Attention ($>20$ Altemetric score),0.052*,0.037,0.087***,0.047*
,(0.026),(0.027),(0.027),(0.024)
Academic Age,-0.009***,,,
,(0.000),,,
Papers,,-0.001***,,


In [84]:
# Now we shall hard-code some things for pretty-fying the latex table

caption = (
    "\\textbf{Linear probability models of attrition by classifying authors as attrited "
    "who left scientific publishing in the years -1, 0, and 1.} "
    "Models differ in how authors' experience is measured using "
    "(1) academic age, (2) number of papers by the time of retraction, "
    "(3) logged number of citations by the time of retraction, and "
    "(4) logged number of collaborators by the time of retraction, respectively. "
    "Controls for author's scientific discipline are included as categorical variables, but are not shown. "
)

label = 'supplementarytab:robust_ols_wSJR_2attention'

# Literal find -> replace pairs applied to the rendered LaTeX, in insertion order.
# Every backslash is written as an explicit '\\' so that no entry relies on an
# unrecognised escape sequence such as '\['.
formatting_dict = {
    # Replace the "Note:" row by the bare significance legend.
    '\\textit{Note:} & \\multicolumn{4}{r}{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001} \\':
        '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}',

    # Table opener: [H] placement, fixed font size, caption, label, and a center environment.
    '\\begin{table}[!htbp] \\centering':
        '\\begin{table}[H]\n{\\fontsize{10.0}{10.0}\\selectfont{\n\\caption{' + caption + '}\n'
        + '\\label{' + label + '}\n\\begin{center}',

    # Close the center environment and the font-size group opened above.
    '\\end{tabular}\n': '\\end{tabular}\n\\end{center}\n}}\n',

    # Name the dependent variable and draw the rule under columns 2-5.
    '\\multicolumn{4}{c}{\\textit{Dependent variable:}} \\\n\\cr \\cline{4-5}\n':
        '\\multicolumn{4}{c}{\\textit{Dependent variable: Attrition}} \\\n\\cr \\cline{2-5}\\\\[-1.8ex]\n',

    # Swap the four centred columns for D{.}{.}{-3} columns
    # (presumably the dcolumn package's decimal-aligned column type - confirm in the preamble).
    '\\begin{tabular}{@{\\extracolsep{5pt}}lcccc}\n':
        '\\begin{tabular}{@{\\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}\n',

    # Centre the model numbers. Key and value both end in a single backslash, so
    # whatever follows the key in the rendered text is left in place.
    '\\\\[-1.8ex] & (1) & (2) & (3) & (4) \\':
        '& \\multicolumn{1}{c}{\\hspace{10pt}(1)} & \\multicolumn{1}{c}{\\hspace{10pt}(2)} & '
        '\\multicolumn{1}{c}{\\hspace{10pt}(3)} & \\multicolumn{1}{c}{\\hspace{10pt}(4)} \\',
}


latex = tables.render_latex()

# Apply the literal substitutions from formatting_dict, in insertion order.
for to_replace, replace_with in formatting_dict.items():
    latex = latex.replace(to_replace, replace_with)

# Strip the math-mode dollars around superscripts: $^{...}$ -> ^{...}
regex_remove_dollar = r"\$(\^\{.*?\})\$"
latex = re.sub(regex_remove_dollar, r"\1", latex)

# Remove the '(df = a; b)' annotations
regex_remove_df = r'\(df = [\d\.]+; [\d\.]+\)'
latex = re.sub(regex_remove_df, '', latex)

# Remove the 'Residual Std. Error' row, up to and including its '\\' + newline
regex_remove_resErr = r'Residual Std. Error.*\\\\\n'
latex = re.sub(regex_remove_resErr, '', latex)

# The 'F Statistic' row is not stripped from this table.

# The dollar-stripping step above also hits the significance legend; restore its dollars.
to_replace = '\\textit{^{*}p$<$0.05; ^{**}p$<$0.01; ^{***}p$<$0.001}'
replace_with = '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}'
latex = latex.replace(to_replace, replace_with)

# Centre the four observation counts by wrapping each one in \multicolumn{1}{c}{...}.
# Groups 2-5 capture the counts (group 2 keeps its trailing space); the replacement is a
# re.sub template, so '\\' yields one literal backslash and '\N' inserts group N.
regex_center_numObs = r'(Observations & )(\d+ )\& (\d+) \& (\d+) \& (\d+)'
centered_cell = r'\\multicolumn{1}{c}{\\hspace{10pt}%s}'
replacement_numObs = r'\1' + ' & '.join(
    centered_cell % group_ref for group_ref in (r'\2', r'\3', r'\4', r'\5')
)
latex = re.sub(regex_center_numObs, replacement_numObs, latex)

print(latex)

\begin{table}[H]
{\fontsize{10.0}{10.0}\selectfont{
\caption{\textbf{Linear probability models of attrition by classifying authors as attrited who left scientific publishing in the years -1, 0, and 1.} Models differ in how authors' experience is measured using (1) academic age, (2) number of papers by the time of retraction, (3) logged number of citations by the the time of retraction, and (4) logged number of collaborators by the time of retraction, respectively. Controls for author's scientific discipline are included as categorical variables, but are not shown. }
\label{supplementarytab:robust_ols_wSJR_2attention}
\begin{center}
\begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{4}{c}{\textit{Dependent variable: Attrition}} \
\cr \cline{2-5}\\[-1.8ex]
& \multicolumn{1}{c}{\hspace{10pt}(1)} & \multicolumn{1}{c}{\hspace{10pt}(2)} & \multicolumn{1}{c}{\hspace{10pt}(3)} & \multicolumn{1}{c}{\hspace{10pt

## Robust Logit w/ SJR and 2 attention categories

In [85]:
# These fields are the ones that are used in all the models
default_fids = ["yearofretraction", "gender", "contributionrank", "venue", 
             "reasons", "field", "affrank", "numauthors", "impactfactor",]

# These are the fields that are highly correlated and will be used one at a time
alternating_fids = ["age", "papers", "logcitations", "collaborators"]

HIGH_THRESHOLD = 21

# Let us first discretize attention based on the threshold.
# Only the Altmetric score is needed, so the helper is mapped over that single
# column instead of iterating over whole rows.
df['DiscretizedAltmetricScoreAtRetraction'] = df['AltmetricScoreAtRetraction'].apply(
    lambda score: discretize_attention_binary(score, threshold_high=HIGH_THRESHOLD)
)

# Defining the expression for attention ('low attention' is the reference level)
attention_exp = "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))"


# Fit one model per alternating experience field and keep them in model order
ests = []
for alt_fid in alternating_fids:
    est = regress_all_together('logit-wSJR', df, attention_exp, default_fids + [alt_fid])
    ests.append(est)

tables = Stargazer(ests)

# Star thresholds: * p<0.05, ** p<0.01, *** p<0.001
tables.significance_levels([0.05, 0.01, 0.001])

# Model term -> label shown in the table
renaming_dict = {
    'AcademicAgeAtRetraction': 'Academic Age',
    "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]": "High Attention  ($>20$ Altmetric score)",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]": "Female",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]": "Author order: Middle Author",
    "C(MAGJournalType, Treatment(reference='conference'))[T.journal]": "Venue: Journal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]": "Reason: Plagiarism",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]": "Reason: Other",
    "Intercept": "Constant",
    "LogMAGCumCitationsAtRetraction": "Log(Citations)",
    "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
    "MAGCumPapersAtRetraction": "Papers",
    "RetractionYear": "Retraction Year",
    "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
    "SJRQuartileRetractedPaperYear": "Journal/Conference Rank",
    "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper",
}

tables.rename_covariates(renaming_dict)

# Row order of the table: attention first, then the experience measures,
# then the remaining controls, intercept last.
order = [
    "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]",
    "AcademicAgeAtRetraction",
    "MAGCumPapersAtRetraction",
    "LogMAGCumCitationsAtRetraction",
    "LogMAGCumCollaboratorsAtRetraction",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]",
    "MAGRetractionYearAffRankOrdinal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
    "NumAuthorsInRetractedPaper",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
    "RetractionYear",
    "C(MAGJournalType, Treatment(reference='conference'))[T.journal]",
    "SJRQuartileRetractedPaperYear",
    "Intercept",
]

tables.covariate_order(order)

tables

Attrition
Optimization terminated successfully.
         Current function value: 0.445254
         Iterations 7
Attrition
Optimization terminated successfully.
         Current function value: 0.420290
         Iterations 9
Attrition
Optimization terminated successfully.
         Current function value: 0.427325
         Iterations 7
Attrition
Optimization terminated successfully.
         Current function value: 0.386423
         Iterations 8


0,1,2,3,4
,,,,
,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition
,,,,
,(1),(2),(3),(4)
,,,,
High Attention ($>20$ Altemetric score),0.445*,0.324,0.754***,0.474**
,(0.175),(0.175),(0.179),(0.179)
Academic Age,-0.104***,,,
,(0.007),,,
Papers,,-0.048***,,


In [86]:
# Now we shall hard-code some things for pretty-fying the latex table

caption = (
    "\\textbf{Logistic regression models of attrition by classifying authors as attrited "
    "who left scientific publishing in the years -1, 0, and 1.} "
    "Models differ in how authors' experience is measured using "
    "(1) academic age, (2) number of papers by the time of retraction, "
    "(3) logged number of citations by the time of retraction, and "
    "(4) logged number of collaborators by the time of retraction, respectively. "
    "Controls for author's scientific discipline are included as categorical variables, but are not shown. "
)

label = 'supplementarytab:robust_logit_wSJR_2attention'

# Literal find -> replace pairs applied to the rendered LaTeX, in insertion order.
# Every backslash is written as an explicit '\\' so that no entry relies on an
# unrecognised escape sequence such as '\['.
formatting_dict = {
    # Replace the "Note:" row by the bare significance legend.
    '\\textit{Note:} & \\multicolumn{4}{r}{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001} \\':
        '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}',

    # Table opener: [H] placement, fixed font size, caption, label, and a center environment.
    '\\begin{table}[!htbp] \\centering':
        '\\begin{table}[H]\n{\\fontsize{10.0}{10.0}\\selectfont{\n\\caption{' + caption + '}\n'
        + '\\label{' + label + '}\n\\begin{center}',

    # Close the center environment and the font-size group opened above.
    '\\end{tabular}\n': '\\end{tabular}\n\\end{center}\n}}\n',

    # Name the dependent variable and draw the rule under columns 2-5.
    '\\multicolumn{4}{c}{\\textit{Dependent variable:}} \\\n\\cr \\cline{4-5}\n':
        '\\multicolumn{4}{c}{\\textit{Dependent variable: Attrition}} \\\n\\cr \\cline{2-5}\\\\[-1.8ex]\n',

    # Swap the four centred columns for D{.}{.}{-3} columns
    # (presumably the dcolumn package's decimal-aligned column type - confirm in the preamble).
    '\\begin{tabular}{@{\\extracolsep{5pt}}lcccc}\n':
        '\\begin{tabular}{@{\\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}\n',

    # Centre the model numbers. Key and value both end in a single backslash, so
    # whatever follows the key in the rendered text is left in place.
    '\\\\[-1.8ex] & (1) & (2) & (3) & (4) \\':
        '& \\multicolumn{1}{c}{\\hspace{10pt}(1)} & \\multicolumn{1}{c}{\\hspace{10pt}(2)} & '
        '\\multicolumn{1}{c}{\\hspace{10pt}(3)} & \\multicolumn{1}{c}{\\hspace{10pt}(4)} \\',
}


latex = tables.render_latex()

# Apply the literal substitutions from formatting_dict, in insertion order.
for to_replace, replace_with in formatting_dict.items():
    latex = latex.replace(to_replace, replace_with)

# Strip the math-mode dollars around superscripts: $^{...}$ -> ^{...}
regex_remove_dollar = r"\$(\^\{.*?\})\$"
latex = re.sub(regex_remove_dollar, r"\1", latex)

# Remove the '(df = a; b)' annotations
regex_remove_df = r'\(df = [\d\.]+; [\d\.]+\)'
latex = re.sub(regex_remove_df, '', latex)

# Remove the 'Residual Std. Error' row, up to and including its '\\' + newline
regex_remove_resErr = r'Residual Std. Error.*\\\\\n'
latex = re.sub(regex_remove_resErr, '', latex)

# Remove the 'F Statistic' row in the same way
regex_remove_Fstat = r'F Statistic.*\\\\\n'
latex = re.sub(regex_remove_Fstat, '', latex)

# The dollar-stripping step above also hits the significance legend; restore its dollars.
to_replace = '\\textit{^{*}p$<$0.05; ^{**}p$<$0.01; ^{***}p$<$0.001}'
replace_with = '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}'
latex = latex.replace(to_replace, replace_with)

# Centre the four observation counts by wrapping each one in \multicolumn{1}{c}{...}.
# Groups 2-5 capture the counts (group 2 keeps its trailing space); the replacement is a
# re.sub template, so '\\' yields one literal backslash and '\N' inserts group N.
regex_center_numObs = r'(Observations & )(\d+ )\& (\d+) \& (\d+) \& (\d+)'
centered_cell = r'\\multicolumn{1}{c}{\\hspace{10pt}%s}'
replacement_numObs = r'\1' + ' & '.join(
    centered_cell % group_ref for group_ref in (r'\2', r'\3', r'\4', r'\5')
)
latex = re.sub(regex_center_numObs, replacement_numObs, latex)

print(latex)


\begin{table}[H]
{\fontsize{10.0}{10.0}\selectfont{
\caption{\textbf{Logistic regression models of attrition by classifying authors as attrited who left scientific publishing in the years -1, 0, and 1.} Models differ in how authors' experience is measured using (1) academic age, (2) number of papers by the time of retraction, (3) logged number of citations by the the time of retraction, and (4) logged number of collaborators by the time of retraction, respectively. Controls for author's scientific discipline are included as categorical variables, but are not shown. }
\label{supplementarytab:robust_logit_wSJR_2attention}
\begin{center}
\begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{4}{c}{\textit{Dependent variable: Attrition}} \
\cr \cline{2-5}\\[-1.8ex]
& \multicolumn{1}{c}{\hspace{10pt}(1)} & \multicolumn{1}{c}{\hspace{10pt}(2)} & \multicolumn{1}{c}{\hspace{10pt}(3)} & \multicolumn{1}{c}{\hspace{1

In [87]:
# Log-likelihood (`llf`) of the four fitted models, in model order (1)-(4)
log_likelihoods = [ests[model_idx].llf for model_idx in range(4)]
print("Log-Likelihood", *log_likelihoods)


Log-Likelihood -4763.33192407921 -4496.266609057627 -4571.52164779121 -4133.948953555913


## Smaller OLS w/ SJR and 2 attention categories and "Retractor" category

In [24]:
# Reading the regression file
df_main = pd.read_csv("../../../../data/h4_altmetric/regression/RW_Authors_forRegression_rematching.csv")

# Dependent variable: here attrition is taken from the 'AttritedClass' column
df_main['Attrition'] = df_main['AttritedClass']

Yi = 'Attrition'

# Removing class that's neither attrited nor non-attrited (i.e. those that are attrited due to something else)
df_main = df_main[df_main.Attrition.isin([0,1])]

attention_agg_cols = ['AltmetricScoreAtRetraction',
                     'AggregateSocialMediaMentionsAtRetraction',
                     'AggregateNewsMediaMentionsAtRetraction',
                     'AggregateBlogsMentionsAtRetraction',
                     'AggregateKnowledgeRepositoriesMentionsAtRetraction']

# Discipline indicator columns, selected by name
field_cols = list(df_main.filter(regex=("Field_.*")).columns)

# Columns needed by the regressions, in the order they appear in `df`
# (this variant additionally carries 'RetractorMajority')
regression_cols = (
    ['Record ID', 'MAGAID']
    + [Yi, 'AttritedClassRobust']
    + ['GenderizeGender',
       'AcademicAgeAtRetraction',
       'MAGCumPapersAtRetraction',
       'LogMAGCumCitationsAtRetraction',
       'LogMAGCumCollaboratorsAtRetraction']
    + field_cols
    + ['MAGAIDRankTypeInRetractedPaper',
       'RetractionYear',
       'RetractorMajority',
       'ReasonPropagatedMajorityOfMajority',
       'MAGJournalType',
       'SJRQuartileRetractedPaperYear',
       'MAGRetractionYearAffRankOrdinal',
       'NumAuthorsInRetractedPaper']
    + attention_agg_cols
)

# Creating the relevant dataframe. The explicit .copy() makes `df` an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
df = df_main[regression_cols].drop_duplicates().copy()

df

Unnamed: 0,Record ID,MAGAID,Attrition,AttritedClassRobust,GenderizeGender,AcademicAgeAtRetraction,MAGCumPapersAtRetraction,LogMAGCumCitationsAtRetraction,LogMAGCumCollaboratorsAtRetraction,Field_ART,...,ReasonPropagatedMajorityOfMajority,MAGJournalType,SJRQuartileRetractedPaperYear,MAGRetractionYearAffRankOrdinal,NumAuthorsInRetractedPaper,AltmetricScoreAtRetraction,AggregateSocialMediaMentionsAtRetraction,AggregateNewsMediaMentionsAtRetraction,AggregateBlogsMentionsAtRetraction,AggregateKnowledgeRepositoriesMentionsAtRetraction
0,3031,2.111744e+09,0,0,male,35.0,166.0,8.491670,4.962845,0,...,mistake,journal,,13.0,2.0,0.00,0.0,0.0,0.0,0.0
1,3031,2.245003e+09,0,0,male,2.0,4.0,2.197225,0.693147,0,...,mistake,journal,,13.0,2.0,0.00,0.0,0.0,0.0,0.0
2,1082,2.120727e+09,0,0,male,5.0,32.0,3.258097,3.367296,0,...,mistake,journal,,350.0,7.0,0.00,0.0,0.0,0.0,0.0
3,1082,2.151686e+09,0,0,male,18.0,246.0,6.505784,5.347108,0,...,mistake,journal,,350.0,7.0,0.00,0.0,0.0,0.0,0.0
4,1082,2.552715e+09,0,0,male,6.0,41.0,4.317488,3.737670,0,...,mistake,journal,,350.0,7.0,0.00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34708,8314,1.979824e+09,0,0,male,4.0,10.0,3.871201,1.386294,0,...,mistake,journal,3.0,550.0,1.0,0.00,0.0,0.0,0.0,0.0
34710,2835,1.972149e+09,1,1,male,1.0,3.0,1.945910,1.791759,0,...,misconduct,journal,2.0,850.0,4.0,0.00,0.0,0.0,0.0,0.0
34715,16836,2.650217e+09,1,1,female,0.0,1.0,0.000000,0.693147,0,...,other,journal,1.0,175.0,2.0,2.75,11.0,0.0,0.0,0.0
34716,16836,2.690000e+09,1,1,female,0.0,1.0,0.000000,0.693147,0,...,other,journal,1.0,175.0,2.0,2.75,11.0,0.0,0.0,0.0


In [25]:
# These fields are the ones that are used in all the models
default_fids = ["yearofretraction", "gender", "contributionrank",
             "reasons", "field", "affrank", "numauthors", "impactfactor", "retractor"]

# These are the fields that are highly correlated and will be used one at a time
alternating_fids = ["age", "papers", "logcitations", "collaborators"]

HIGH_THRESHOLD = 21

# Let us first discretize attention based on the threshold.
# Only the Altmetric score is needed, so the helper is mapped over that single
# column instead of iterating over whole rows.
df['DiscretizedAltmetricScoreAtRetraction'] = df['AltmetricScoreAtRetraction'].apply(
    lambda score: discretize_attention_binary(score, threshold_high=HIGH_THRESHOLD)
)

# Defining the expression for attention ('low attention' is the reference level)
attention_exp = "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))"


# Fit one model per alternating experience field and keep them in model order
ests = []
for alt_fid in alternating_fids:
    est = regress_all_together('ols-wSJR', df, attention_exp, default_fids + [alt_fid])
    ests.append(est)

tables = Stargazer(ests)

# Star thresholds: * p<0.05, ** p<0.01, *** p<0.001
tables.significance_levels([0.05, 0.01, 0.001])

# Model term -> label shown in the table
renaming_dict = {
    'AcademicAgeAtRetraction': 'Academic Age',
    "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]": "High Attention  ($>20$ Altmetric score)",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]": "Female",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]": "Author order: Middle Author",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]": "Reason: Plagiarism",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]": "Reason: Other",
    "C(RetractorMajority, Treatment(reference='author'))[T.journal]": "Retracted by: Journal",
    "C(RetractorMajority, Treatment(reference='author'))[T.other]": "Retracted by: Other",
    "Intercept": "Constant",
    "LogMAGCumCitationsAtRetraction": "Log(Citations)",
    "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
    "MAGCumPapersAtRetraction": "Papers",
    "RetractionYear": "Retraction Year",
    "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
    "SJRQuartileRetractedPaperYear": "Journal/Conference Rank",
    "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper",
}

tables.rename_covariates(renaming_dict)

# Row order of the table: attention first, then the experience measures,
# then the remaining controls (including who retracted), intercept last.
order = [
    "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]",
    "AcademicAgeAtRetraction",
    "MAGCumPapersAtRetraction",
    "LogMAGCumCitationsAtRetraction",
    "LogMAGCumCollaboratorsAtRetraction",
    "C(GenderizeGender, Treatment(reference='male'))[T.female]",
    "MAGRetractionYearAffRankOrdinal",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
    "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
    "C(RetractorMajority, Treatment(reference='author'))[T.journal]",
    "C(RetractorMajority, Treatment(reference='author'))[T.other]",
    "NumAuthorsInRetractedPaper",
    "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
    "RetractionYear",
    "SJRQuartileRetractedPaperYear",
    "Intercept",
]

tables.covariate_order(order)

tables

Attrition
Attrition
Attrition
Attrition


0,1,2,3,4
,,,,
,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition
,,,,
,(1),(2),(3),(4)
,,,,
High Attention ($>20$ Altemetric score),0.117*,0.111*,0.164**,0.109*
,(0.054),(0.054),(0.056),(0.049)
Academic Age,-0.009***,,,
,(0.001),,,
Papers,,-0.001***,,


In [26]:
# Post-process the Stargazer LaTeX for the OLS "retractor" table so that it
# matches the hand-tuned layout: custom caption/label, D-type (decimal-aligned)
# columns, centred column numbers and centred observation counts.

caption = (
    "\\textbf{Linear probability models of attrition including a control for who led the retraction. } "
    "Models differ in how authors' experience is measured using "
    "(1) academic age, (2) number of papers by the time of retraction, "
    "(3) logged number of citations by the time of retraction, and "
    "(4) logged number of collaborators by the time of retraction, respectively. "
    "Controls for author's scientific discipline are included as categorical variables, but are not shown. "
    "Note that these regression models only include authors whose retraction notices were manually annotated."
)

label = 'supplementarytab:retractor_ols_wSJR_2attention'

# Literal find -> replace pairs, applied to the rendered LaTeX in this order.
# Every LaTeX backslash is spelled "\\" so that no literal depends on an
# invalid escape sequence (e.g. "\[" or "\m"), which newer Python versions warn about.
formatting_dict = {
    # Significance note: drop the "Note:" cell and its multicolumn wrapper
    '\\textit{Note:} & \\multicolumn{4}{r}{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001} \\':
        '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}',

    # Table preamble: fixed placement, font size, caption and label
    '\\begin{table}[!htbp] \\centering':
        '\\begin{table}[H]\n{\\fontsize{10.0}{10.0}\\selectfont{\n\\caption{' + caption + '}\n'
        + '\\label{' + label + '}\n\\begin{center}',

    # Close the center environment and the two font-size braces opened above
    '\\end{tabular}\n': '\\end{tabular}\n\\end{center}\n}}\n',

    # Dependent-variable header row
    '\\multicolumn{4}{c}{\\textit{Dependent variable:}} \\\n\\cr \\cline{4-5}\n':
        '\\multicolumn{4}{c}{\\textit{Dependent variable: Attrition}} \\\n\\cr \\cline{2-5}\\\\[-1.8ex]\n',

    # Column specification: decimal-aligned D columns instead of centred ones
    '\\begin{tabular}{@{\\extracolsep{5pt}}lcccc}\n':
        '\\begin{tabular}{@{\\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}\n',

    # Model-number row: centre "(1) ... (4)" over the D columns
    '\\\\[-1.8ex] & (1) & (2) & (3) & (4) \\':
        '& \\multicolumn{1}{c}{\\hspace{10pt}(1)} & \\multicolumn{1}{c}{\\hspace{10pt}(2)} & '
        '\\multicolumn{1}{c}{\\hspace{10pt}(3)} & \\multicolumn{1}{c}{\\hspace{10pt}(4)} \\',
}


latex = tables.render_latex()


for to_replace, replace_with in formatting_dict.items():
    latex = latex.replace(to_replace, replace_with)

# Strip the dollar signs around exponent notation such as $^{***}$
regex_remove_dollar = r"\$(\^\{.*?\})\$"
latex = re.sub(regex_remove_dollar, r"\1", latex)

# Remove the '(df = a; b)' annotations
regex_remove_df = r'\(df = [\d\.]+; [\d\.]+\)'
latex = re.sub(regex_remove_df, '', latex)

# Remove the residual standard error row
regex_remove_resErr = r'Residual Std. Error.*\\\\\n'
latex = re.sub(regex_remove_resErr, '', latex)

# The F-statistic row is left in this table (its removal is not applied here).

# The dollar-stripping step above also hit the significance note; restore it
to_replace = '\\textit{^{*}p$<$0.05; ^{**}p$<$0.01; ^{***}p$<$0.001}'
replace_with = '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}'
latex = latex.replace(to_replace, replace_with)

# Centre the four observation counts over the decimal-aligned columns
regex_center_numObs = r'(Observations & )(\d+ )(\& \d+ )(\& \d+ )(\& \d+)'


def _center_observation_counts(match):
    """Return the Observations row with each count wrapped in a centred multicolumn cell."""
    counts = [
        match.group(2),               # first count keeps its trailing space, as before
        match.group(3).strip("& "),
        match.group(4).strip("& "),
        match.group(5).strip("& \\"),
    ]
    cells = ["\\multicolumn{1}{c}{\\hspace{10pt}" + count + "}" for count in counts]
    return match.group(1) + " & ".join(cells)


# A callable replacement is inserted verbatim, so no template escaping is
# needed and the text is left untouched when the row is absent.
latex = re.sub(regex_center_numObs, _center_observation_counts, latex)

print(latex)


\begin{table}[H]
{\fontsize{10.0}{10.0}\selectfont{
\caption{\textbf{Linear probability models of attrition including a control for who led the retraction. } Models differ in how authors' experience is measured using (1) academic age, (2) number of papers by the time of retraction, (3) logged number of citations by the the time of retraction, and (4) logged number of collaborators by the time of retraction, respectively. Controls for author's scientific discipline are included as categorical variables, but are not shown. Note that these regression models only include authors whose retraction notices were manually annotated.}
\label{supplementarytab:retractor_ols_wSJR_2attention}
\begin{center}
\begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{4}{c}{\textit{Dependent variable: Attrition}} \
\cr \cline{2-5}\\[-1.8ex]
& \multicolumn{1}{c}{\hspace{10pt}(1)} & \multicolumn{1}{c}{\hspace{10pt}(2)} & \multic

## Smaller Logit w/ SJR and 2 attention categories and "Retractor" category

In [27]:
# Fit the four logit models (one per experience measure) with a binary
# attention indicator and the "retractor" control, then lay out the table.

# Controls that enter every model
default_fids = ["yearofretraction", "gender", "contributionrank",
                "reasons", "field", "affrank", "numauthors", "impactfactor", "retractor"]

# Experience measures: highly correlated, so exactly one is added per model
alternating_fids = ["age", "papers", "logcitations", "collaborators"]

HIGH_THRESHOLD = 21

# Discretize attention based on the threshold. Only the score column is
# needed, so map over that Series instead of building every row with axis=1.
df['DiscretizedAltmetricScoreAtRetraction'] = df['AltmetricScoreAtRetraction'].apply(
    lambda score: discretize_attention_binary(score, threshold_high=HIGH_THRESHOLD)
)

# Formula term for attention, with 'low attention' as the reference level
attention_exp = "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))"


# One fitted model per experience measure
ests = []
for alt_fid in alternating_fids:
    est = regress_all_together('logit-wSJR', df, attention_exp, default_fids + [alt_fid])
    ests.append(est)

tables = Stargazer(ests)

tables.significance_levels([0.05, 0.01, 0.001])

# Human-readable row labels (label typo "Altemetric" corrected to "Altmetric")
renaming_dict = {'AcademicAgeAtRetraction': 'Academic Age',
                "C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]": "High Attention  ($>20$ Altmetric score)",
                "C(GenderizeGender, Treatment(reference='male'))[T.female]": "Female",
                "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]": "Author order: Middle Author",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]": "Reason: Plagiarism",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]": "Reason: Other",
                "C(RetractorMajority, Treatment(reference='author'))[T.journal]": "Retracted by: Journal",
                "C(RetractorMajority, Treatment(reference='author'))[T.other]": "Retracted by: Other",
                "Intercept": "Constant",
                "LogMAGCumCitationsAtRetraction": "Log(Citations)",
                "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
                "MAGCumPapersAtRetraction": "Papers",
                "RetractionYear": "Retraction Year",
                "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
                "SJRQuartileRetractedPaperYear": "Journal/Conference Rank",
                "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper"
                }

tables.rename_covariates(renaming_dict)

# Rows to show and their order; covariates not listed here (e.g. the field
# controls) are left out of the rendered table.
order = ["C(DiscretizedAltmetricScoreAtRetraction, Treatment('low attention'))[T.high attention]",
         "AcademicAgeAtRetraction",
         "MAGCumPapersAtRetraction",
         "LogMAGCumCitationsAtRetraction",
         "LogMAGCumCollaboratorsAtRetraction",
         "C(GenderizeGender, Treatment(reference='male'))[T.female]",
         "MAGRetractionYearAffRankOrdinal",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
         "C(RetractorMajority, Treatment(reference='author'))[T.journal]",
         "C(RetractorMajority, Treatment(reference='author'))[T.other]",
         "NumAuthorsInRetractedPaper",
         "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
         "RetractionYear",
         "SJRQuartileRetractedPaperYear",
         "Intercept"]

tables.covariate_order(order)

tables

Attrition
Optimization terminated successfully.
         Current function value: 0.422662
         Iterations 8
Attrition
Optimization terminated successfully.
         Current function value: 0.388741
         Iterations 10
Attrition
Optimization terminated successfully.
         Current function value: 0.406180
         Iterations 7
Attrition
Optimization terminated successfully.
         Current function value: 0.363113
         Iterations 8


0,1,2,3,4
,,,,
,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition
,,,,
,(1),(2),(3),(4)
,,,,
High Attention ($>20$ Altemetric score),0.929**,0.847*,1.352***,0.958**
,(0.341),(0.341),(0.347),(0.331)
Academic Age,-0.131***,,,
,(0.014),,,
Papers,,-0.099***,,


In [28]:
# Post-process the Stargazer LaTeX for the logit "retractor" table so that it
# matches the hand-tuned layout: custom caption/label, D-type (decimal-aligned)
# columns, centred column numbers and centred observation counts.

caption = (
    "\\textbf{Logistic regression models of attrition including a control for who led the retraction. } "
    "Models differ in how authors' experience is measured using "
    "(1) academic age, (2) number of papers by the time of retraction, "
    "(3) logged number of citations by the time of retraction, and "
    "(4) logged number of collaborators by the time of retraction, respectively. "
    "Controls for author's scientific discipline are included as categorical variables, but are not shown. "
    "Note that these regression models only include authors whose retraction notices were manually annotated."
)

label = 'supplementarytab:retractor_logit_wSJR_2attention'

# Literal find -> replace pairs, applied to the rendered LaTeX in this order.
# Every LaTeX backslash is spelled "\\" so that no literal depends on an
# invalid escape sequence (e.g. "\[" or "\m"), which newer Python versions warn about.
formatting_dict = {
    # Significance note: drop the "Note:" cell and its multicolumn wrapper
    '\\textit{Note:} & \\multicolumn{4}{r}{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001} \\':
        '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}',

    # Table preamble: fixed placement, font size, caption and label
    '\\begin{table}[!htbp] \\centering':
        '\\begin{table}[H]\n{\\fontsize{10.0}{10.0}\\selectfont{\n\\caption{' + caption + '}\n'
        + '\\label{' + label + '}\n\\begin{center}',

    # Close the center environment and the two font-size braces opened above
    '\\end{tabular}\n': '\\end{tabular}\n\\end{center}\n}}\n',

    # Dependent-variable header row
    '\\multicolumn{4}{c}{\\textit{Dependent variable:}} \\\n\\cr \\cline{4-5}\n':
        '\\multicolumn{4}{c}{\\textit{Dependent variable: Attrition}} \\\n\\cr \\cline{2-5}\\\\[-1.8ex]\n',

    # Column specification: decimal-aligned D columns instead of centred ones
    '\\begin{tabular}{@{\\extracolsep{5pt}}lcccc}\n':
        '\\begin{tabular}{@{\\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}\n',

    # Model-number row: centre "(1) ... (4)" over the D columns
    '\\\\[-1.8ex] & (1) & (2) & (3) & (4) \\':
        '& \\multicolumn{1}{c}{\\hspace{10pt}(1)} & \\multicolumn{1}{c}{\\hspace{10pt}(2)} & '
        '\\multicolumn{1}{c}{\\hspace{10pt}(3)} & \\multicolumn{1}{c}{\\hspace{10pt}(4)} \\',
}


latex = tables.render_latex()


for to_replace, replace_with in formatting_dict.items():
    latex = latex.replace(to_replace, replace_with)

# Strip the dollar signs around exponent notation such as $^{***}$
regex_remove_dollar = r"\$(\^\{.*?\})\$"
latex = re.sub(regex_remove_dollar, r"\1", latex)

# Remove the '(df = a; b)' annotations
regex_remove_df = r'\(df = [\d\.]+; [\d\.]+\)'
latex = re.sub(regex_remove_df, '', latex)

# Remove the residual standard error row
regex_remove_resErr = r'Residual Std. Error.*\\\\\n'
latex = re.sub(regex_remove_resErr, '', latex)

# Remove the F-statistic row
regex_remove_Fstat = r'F Statistic.*\\\\\n'
latex = re.sub(regex_remove_Fstat, '', latex)

# The dollar-stripping step above also hit the significance note; restore it
to_replace = '\\textit{^{*}p$<$0.05; ^{**}p$<$0.01; ^{***}p$<$0.001}'
replace_with = '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}'
latex = latex.replace(to_replace, replace_with)

# Centre the four observation counts over the decimal-aligned columns
regex_center_numObs = r'(Observations & )(\d+ )(\& \d+ )(\& \d+ )(\& \d+)'


def _center_observation_counts(match):
    """Return the Observations row with each count wrapped in a centred multicolumn cell."""
    counts = [
        match.group(2),               # first count keeps its trailing space, as before
        match.group(3).strip("& "),
        match.group(4).strip("& "),
        match.group(5).strip("& \\"),
    ]
    cells = ["\\multicolumn{1}{c}{\\hspace{10pt}" + count + "}" for count in counts]
    return match.group(1) + " & ".join(cells)


# A callable replacement is inserted verbatim, so no template escaping is
# needed and the text is left untouched when the row is absent.
latex = re.sub(regex_center_numObs, _center_observation_counts, latex)

print(latex)

\begin{table}[H]
{\fontsize{10.0}{10.0}\selectfont{
\caption{\textbf{Logistic regression models of attrition including a control for who led the retraction. } Models differ in how authors' experience is measured using (1) academic age, (2) number of papers by the time of retraction, (3) logged number of citations by the the time of retraction, and (4) logged number of collaborators by the time of retraction, respectively. Controls for author's scientific discipline are included as categorical variables, but are not shown. Note that these regression models only include authors whose retraction notices were manually annotated.}
\label{supplementarytab:retractor_logit_wSJR_2attention}
\begin{center}
\begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{4}{c}{\textit{Dependent variable: Attrition}} \
\cr \cline{2-5}\\[-1.8ex]
& \multicolumn{1}{c}{\hspace{10pt}(1)} & \multicolumn{1}{c}{\hspace{10pt}(2)} & \mul

## OLS with SJR and 3 attention categories

In [29]:
# Reload the author-level regression table from disk.
# NOTE(review): unlike the first cell of the notebook, this reload does not merge
# the post-NHB 'AttritionNew' column and uses 'AttritedClass' as the outcome --
# confirm this is intended for the 3-attention-category models.
df_main = pd.read_csv("../../../../data/h4_altmetric/regression/RW_Authors_forRegression_rematching.csv")

# Dependent variable: 'Attrition' is a copy of 'AttritedClass'
df_main['Attrition'] = df_main['AttritedClass']

Yi = 'Attrition'

# Keep only rows classed as attrited (1) or non-attrited (0); any other class
# (i.e. attrited due to something else) is dropped
df_main = df_main[df_main['AttritedClass'].isin([0, 1])]

# Attention measures carried into the regression frame
attention_agg_cols = [
    'AltmetricScoreAtRetraction',
    'AggregateSocialMediaMentionsAtRetraction',
    'AggregateNewsMediaMentionsAtRetraction',
    'AggregateBlogsMentionsAtRetraction',
    'AggregateKnowledgeRepositoriesMentionsAtRetraction',
]

# Regression frame: identifiers, outcomes, author covariates, the Field_*
# columns, paper-level covariates and the attention measures; duplicate rows
# over this column subset are dropped.
df = df_main[[
    'Record ID', 'MAGAID',
    Yi, 'AttritedClassRobust',
    'GenderizeGender',
    'AcademicAgeAtRetraction',
    'MAGCumPapersAtRetraction',
    'LogMAGCumCitationsAtRetraction',
    'LogMAGCumCollaboratorsAtRetraction',
    *df_main.filter(regex="Field_.*").columns,
    'MAGAIDRankTypeInRetractedPaper',
    'RetractionYear',
    'ReasonPropagatedMajorityOfMajority',
    'MAGJournalType',
    'SJRQuartileRetractedPaperYear',
    'MAGRetractionYearAffRankOrdinal',
    'NumAuthorsInRetractedPaper',
    *attention_agg_cols,
]].drop_duplicates()

In [30]:
# Fit the four OLS models (one per experience measure) with a three-level
# attention variable, then lay out the table.

# Controls that enter every model
default_fids = ["yearofretraction", "gender", "contributionrank", "venue",
                "reasons", "field", "affrank", "numauthors", "impactfactor"]

# Experience measures: highly correlated, so exactly one is added per model
alternating_fids = ["age", "papers", "logcitations", "collaborators"]

LOW_MEDIUM_THRESHOLD = 31
MEDIUM_HIGH_THRESHOLD = 36

# Discretize attention based on the thresholds. Only the score column is
# needed, so map over that Series instead of building every row with axis=1.
df['DiscretizedAltmetricScoreAtRetraction'] = df['AltmetricScoreAtRetraction'].apply(
    lambda score: discretize_attention_tertiary(score,
                                                threshold_medium=LOW_MEDIUM_THRESHOLD,
                                                threshold_high=MEDIUM_HIGH_THRESHOLD)
)

# Formula term for attention, with 'medium attention' as the reference level
attention_exp = "C(DiscretizedAltmetricScoreAtRetraction, Treatment('medium attention'))"


# One fitted model per experience measure
ests = []
for alt_fid in alternating_fids:
    est = regress_all_together('ols-wSJR', df, attention_exp, default_fids + [alt_fid])
    ests.append(est)

tables = Stargazer(ests)

tables.significance_levels([0.05, 0.01, 0.001])

# Human-readable row labels. The LaTeX "\le" is written "\\le": "\l" is not a
# valid Python escape sequence and triggers a warning on recent versions.
renaming_dict = {'AcademicAgeAtRetraction': 'Academic Age',
                "C(DiscretizedAltmetricScoreAtRetraction, Treatment('medium attention'))[T.high attention]": "High Attention  ($>35$ Altmetric score)",
                "C(DiscretizedAltmetricScoreAtRetraction, Treatment('medium attention'))[T.low attention]": "Low Attention ($\\le30$ Altmetric score)",
                "C(GenderizeGender, Treatment(reference='male'))[T.female]": "Female",
                "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]": "Author order: Middle Author",
                "C(MAGJournalType, Treatment(reference='conference'))[T.journal]": "Venue: Journal",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]": "Reason: Plagiarism",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]": "Reason: Other",
                "Intercept": "Constant",
                "LogMAGCumCitationsAtRetraction": "Log(Citations)",
                "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
                "MAGCumPapersAtRetraction": "Papers",
                "RetractionYear": "Retraction Year",
                "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
                "SJRQuartileRetractedPaperYear": "Journal/Conference Rank",
                "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper"
                }

tables.rename_covariates(renaming_dict)

# Rows to show and their order; covariates not listed here (e.g. the field
# controls) are left out of the rendered table.
order = ["C(DiscretizedAltmetricScoreAtRetraction, Treatment('medium attention'))[T.high attention]",
         "C(DiscretizedAltmetricScoreAtRetraction, Treatment('medium attention'))[T.low attention]",
         "AcademicAgeAtRetraction",
         "MAGCumPapersAtRetraction",
         "LogMAGCumCitationsAtRetraction",
         "LogMAGCumCollaboratorsAtRetraction",
         "C(GenderizeGender, Treatment(reference='male'))[T.female]",
         "MAGRetractionYearAffRankOrdinal",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
         "NumAuthorsInRetractedPaper",
         "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
         "RetractionYear",
         "C(MAGJournalType, Treatment(reference='conference'))[T.journal]",
         "SJRQuartileRetractedPaperYear",
         "Intercept"]

tables.covariate_order(order)

tables

Attrition
Attrition
Attrition
Attrition


0,1,2,3,4
,,,,
,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition
,,,,
,(1),(2),(3),(4)
,,,,
High Attention ($>35$ Altmetric score),-0.212**,-0.191*,-0.201**,-0.174*
,(0.081),(0.084),(0.075),(0.068)
Low Attention ($\le30$ Altmetric score),-0.229**,-0.200**,-0.253***,-0.190**
,(0.073),(0.075),(0.065),(0.060)
Academic Age,-0.008***,,,


In [31]:
# Post-process the Stargazer LaTeX for the OLS three-attention-category table
# so that it matches the hand-tuned layout: custom caption/label, D-type
# (decimal-aligned) columns, centred column numbers and observation counts.

label = 'supplementarytab:ols_wSJR_3attention'

# (An earlier draft caption that was assigned here and immediately overwritten
# has been removed; only this caption ever reached the rendered table.)
caption = (
    "\\textbf{Linear probability models of attrition with low, medium, and high attention categories.} "
    "Models differ in how authors' experience is measured using "
    "(1) academic age, (2) number of papers by the time of retraction, "
    "(3) logged number of citations by the time of retraction, and "
    "(4) logged number of collaborators by the time of retraction, respectively. "
    "Controls for author's scientific discipline are included as categorical variables, but are not shown. "
)

# Literal find -> replace pairs, applied to the rendered LaTeX in this order.
# Every LaTeX backslash is spelled "\\" so that no literal depends on an
# invalid escape sequence (e.g. "\[" or "\m"), which newer Python versions warn about.
formatting_dict = {
    # Significance note: drop the "Note:" cell and its multicolumn wrapper
    '\\textit{Note:} & \\multicolumn{4}{r}{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001} \\':
        '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}',

    # Table preamble: fixed placement, font size, caption and label
    '\\begin{table}[!htbp] \\centering':
        '\\begin{table}[H]\n{\\fontsize{10.0}{10.0}\\selectfont{\n\\caption{' + caption + '}\n'
        + '\\label{' + label + '}\n\\begin{center}',

    # Close the center environment and the two font-size braces opened above
    '\\end{tabular}\n': '\\end{tabular}\n\\end{center}\n}}\n',

    # Dependent-variable header row
    '\\multicolumn{4}{c}{\\textit{Dependent variable:}} \\\n\\cr \\cline{4-5}\n':
        '\\multicolumn{4}{c}{\\textit{Dependent variable: Attrition}} \\\n\\cr \\cline{2-5}\\\\[-1.8ex]\n',

    # Column specification: decimal-aligned D columns instead of centred ones
    '\\begin{tabular}{@{\\extracolsep{5pt}}lcccc}\n':
        '\\begin{tabular}{@{\\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}\n',

    # Model-number row: centre "(1) ... (4)" over the D columns
    '\\\\[-1.8ex] & (1) & (2) & (3) & (4) \\':
        '& \\multicolumn{1}{c}{\\hspace{10pt}(1)} & \\multicolumn{1}{c}{\\hspace{10pt}(2)} & '
        '\\multicolumn{1}{c}{\\hspace{10pt}(3)} & \\multicolumn{1}{c}{\\hspace{10pt}(4)} \\',
}


latex = tables.render_latex()


for to_replace, replace_with in formatting_dict.items():
    latex = latex.replace(to_replace, replace_with)

# Strip the dollar signs around exponent notation such as $^{***}$
regex_remove_dollar = r"\$(\^\{.*?\})\$"
latex = re.sub(regex_remove_dollar, r"\1", latex)

# Remove the '(df = a; b)' annotations
regex_remove_df = r'\(df = [\d\.]+; [\d\.]+\)'
latex = re.sub(regex_remove_df, '', latex)

# Remove the residual standard error row
regex_remove_resErr = r'Residual Std. Error.*\\\\\n'
latex = re.sub(regex_remove_resErr, '', latex)


# The dollar-stripping step above also hit the significance note; restore it
to_replace = '\\textit{^{*}p$<$0.05; ^{**}p$<$0.01; ^{***}p$<$0.001}'
replace_with = '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}'
latex = latex.replace(to_replace, replace_with)

# Centre the four observation counts over the decimal-aligned columns
regex_center_numObs = r'(Observations & )(\d+ )(\& \d+ )(\& \d+ )(\& \d+)'


def _center_observation_counts(match):
    """Return the Observations row with each count wrapped in a centred multicolumn cell."""
    counts = [
        match.group(2),               # first count keeps its trailing space, as before
        match.group(3).strip("& "),
        match.group(4).strip("& "),
        match.group(5).strip("& \\"),
    ]
    cells = ["\\multicolumn{1}{c}{\\hspace{10pt}" + count + "}" for count in counts]
    return match.group(1) + " & ".join(cells)


# A callable replacement is inserted verbatim, so no template escaping is
# needed and the text is left untouched when the row is absent.
latex = re.sub(regex_center_numObs, _center_observation_counts, latex)

print(latex)



\begin{table}[H]
{\fontsize{10.0}{10.0}\selectfont{
\caption{\textbf{Linear probability models of attrition with low, medium, and high attention categories.} Models differ in how authors' experience is measured using (1) academic age, (2) number of papers by the time of retraction, (3) logged number of citations by the the time of retraction, and (4) logged number of collaborators by the time of retraction, respectively. Controls for author's scientific discipline are included as categorical variables, but are not shown. }
\label{supplementarytab:ols_wSJR_3attention}
\begin{center}
\begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{4}{c}{\textit{Dependent variable: Attrition}} \
\cr \cline{2-5}\\[-1.8ex]
& \multicolumn{1}{c}{\hspace{10pt}(1)} & \multicolumn{1}{c}{\hspace{10pt}(2)} & \multicolumn{1}{c}{\hspace{10pt}(3)} & \multicolumn{1}{c}{\hspace{10pt}(4)} \\
\hline \\[-1.8ex]
 High Attention  ($>35$ 

## Logit with SJR and 3 attention categories

In [32]:
# Fit the four logit models (one per experience measure) with a three-level
# attention variable, then lay out the table.

# Controls that enter every model
default_fids = ["yearofretraction", "gender", "contributionrank", "venue",
                "reasons", "field", "affrank", "numauthors", "impactfactor"]

# Experience measures: highly correlated, so exactly one is added per model
alternating_fids = ["age", "papers", "logcitations", "collaborators"]

LOW_MEDIUM_THRESHOLD = 31
MEDIUM_HIGH_THRESHOLD = 36

# Discretize attention based on the thresholds. Only the score column is
# needed, so map over that Series instead of building every row with axis=1.
df['DiscretizedAltmetricScoreAtRetraction'] = df['AltmetricScoreAtRetraction'].apply(
    lambda score: discretize_attention_tertiary(score,
                                                threshold_medium=LOW_MEDIUM_THRESHOLD,
                                                threshold_high=MEDIUM_HIGH_THRESHOLD)
)

# Formula term for attention, with 'medium attention' as the reference level
attention_exp = "C(DiscretizedAltmetricScoreAtRetraction, Treatment('medium attention'))"


# One fitted model per experience measure
ests = []
for alt_fid in alternating_fids:
    est = regress_all_together('logit-wSJR', df, attention_exp, default_fids + [alt_fid])
    ests.append(est)

tables = Stargazer(ests)

tables.significance_levels([0.05, 0.01, 0.001])

# Human-readable row labels. The LaTeX "\le" is written "\\le": "\l" is not a
# valid Python escape sequence and triggers a warning on recent versions.
renaming_dict = {'AcademicAgeAtRetraction': 'Academic Age',
                "C(DiscretizedAltmetricScoreAtRetraction, Treatment('medium attention'))[T.high attention]": "High Attention  ($>35$ Altmetric score)",
                "C(DiscretizedAltmetricScoreAtRetraction, Treatment('medium attention'))[T.low attention]": "Low Attention ($\\le30$ Altmetric score)",
                "C(GenderizeGender, Treatment(reference='male'))[T.female]": "Female",
                "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]": "Author order: Middle Author",
                "C(MAGJournalType, Treatment(reference='conference'))[T.journal]": "Venue: Journal",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]": "Reason: Misconduct",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]": "Reason: Plagiarism",
                "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]": "Reason: Other",
                "Intercept": "Constant",
                "LogMAGCumCitationsAtRetraction": "Log(Citations)",
                "LogMAGCumCollaboratorsAtRetraction": "Log(Collaborators)",
                "MAGCumPapersAtRetraction": "Papers",
                "RetractionYear": "Retraction Year",
                "MAGRetractionYearAffRankOrdinal": "Author Affiliation Rank",
                "SJRQuartileRetractedPaperYear": "Journal/Conference Rank",
                "NumAuthorsInRetractedPaper": "Coauthors on Retracted Paper"
                }

tables.rename_covariates(renaming_dict)

# Rows to show and their order; covariates not listed here (e.g. the field
# controls) are left out of the rendered table.
order = ["C(DiscretizedAltmetricScoreAtRetraction, Treatment('medium attention'))[T.high attention]",
         "C(DiscretizedAltmetricScoreAtRetraction, Treatment('medium attention'))[T.low attention]",
         "AcademicAgeAtRetraction",
         "MAGCumPapersAtRetraction",
         "LogMAGCumCitationsAtRetraction",
         "LogMAGCumCollaboratorsAtRetraction",
         "C(GenderizeGender, Treatment(reference='male'))[T.female]",
         "MAGRetractionYearAffRankOrdinal",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.misconduct]",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.plagiarism]",
         "C(ReasonPropagatedMajorityOfMajority, Treatment(reference='mistake'))[T.other]",
         "NumAuthorsInRetractedPaper",
         "C(MAGAIDRankTypeInRetractedPaper, Treatment(reference='First or Last or Only Author'))[T.Middle Author]",
         "RetractionYear",
         "C(MAGJournalType, Treatment(reference='conference'))[T.journal]",
         "SJRQuartileRetractedPaperYear",
         "Intercept"]

tables.covariate_order(order)

tables

Attrition
Optimization terminated successfully.
         Current function value: 0.408621
         Iterations 8
Attrition
Optimization terminated successfully.
         Current function value: 0.380066
         Iterations 10
Attrition
Optimization terminated successfully.
         Current function value: 0.393601
         Iterations 7
Attrition
Optimization terminated successfully.
         Current function value: 0.353943
         Iterations 8


0,1,2,3,4
,,,,
,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition,Dependent variable:Attrition
,,,,
,(1),(2),(3),(4)
,,,,
High Attention ($>35$ Altmetric score),-1.426**,-1.232*,-1.186*,-1.229**
,(0.489),(0.539),(0.475),(0.448)
Low Attention ($\le30$ Altmetric score),-1.628***,-1.369**,-1.803***,-1.494***
,(0.382),(0.449),(0.368),(0.356)
Academic Age,-0.112***,,,


In [33]:
# LaTeX \label used to cross-reference this table.
label = 'supplementarytab:logit_wSJR_3attention'

# LaTeX caption for the table. Parenthesised adjacent literals are joined at
# compile time, so no backslash line continuations are needed.
caption = (
    "\\textbf{Logistic regression models of attrition with low, medium, and high attention categories.} "
    "Models differ in how authors' experience is measured using "
    "(1) academic age, (2) number of papers by the time of retraction, "
    "(3) logged number of citations by the time of retraction, and "
    "(4) logged number of collaborators by the time of retraction, respectively. "
    "Controls for author's scientific discipline are included as categorical variables, but are not shown. "
)

# Literal (non-regex) substitutions applied, in insertion order, to the LaTeX
# rendered from the regression table: {text to find: text to put in its place}.
# Every backslash is written as a valid escape ('\\\\[' rather than '\\\[') so
# the cell does not depend on Python's deprecated handling of unknown escape
# sequences; the resulting strings are unchanged.
formatting_dict = {
    # Significance note: drop the "Note:" label and the 4-column span.
    '\\textit{Note:} & \\multicolumn{4}{r}{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001} \\':
        '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}',

    # Table preamble: [H] placement, 10pt font, caption, label, centered body.
    '\\begin{table}[!htbp] \\centering':
        '\\begin{table}[H]\n{\\fontsize{10.0}{10.0}\\selectfont{\n\\caption{' + caption + '}\n'
        + '\\label{' + label + '}\n\\begin{center}',

    # Close the center environment and the two font-size braces opened above.
    '\\end{tabular}\n':
        '\\end{tabular}\n\\end{center}\n}}\n',

    # Header: name the dependent variable and draw the rule under columns 2-5.
    '\\multicolumn{4}{c}{\\textit{Dependent variable:}} \\\n\\cr \\cline{4-5}\n':
        '\\multicolumn{4}{c}{\\textit{Dependent variable: Attrition}} \\\n\\cr \\cline{2-5}\\\\[-1.8ex]\n',

    # Column spec: four D{.}{.}{-3} columns instead of four centered columns.
    '\\begin{tabular}{@{\\extracolsep{5pt}}lcccc}\n':
        '\\begin{tabular}{@{\\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}\n',

    # Model-number row: wrap each (n) in a centered \multicolumn cell.
    '\\\\[-1.8ex] & (1) & (2) & (3) & (4) \\':
        '& \\multicolumn{1}{c}{\\hspace{10pt}(1)} & \\multicolumn{1}{c}{\\hspace{10pt}(2)} & '
        '\\multicolumn{1}{c}{\\hspace{10pt}(3)} & \\multicolumn{1}{c}{\\hspace{10pt}(4)} \\',
}


# `re` is not among the imports in the notebook's first cell; import it here so
# this cell still runs after Restart Kernel -> Run All.
import re

# Render the regression table to LaTeX, then apply the literal substitutions
# from `formatting_dict` in the order they were defined.
latex = tables.render_latex()

for to_replace, replace_with in formatting_dict.items():
    latex = latex.replace(to_replace, replace_with)

# Strip the math-mode dollars around exponent notation, e.g. $^{***}$ -> ^{***}.
# NOTE(review): presumably needed because the D{.}{.}{-3} columns installed by
# `formatting_dict` already typeset their cells in math mode - confirm.
regex_remove_dollar = r"\$(\^\{.*?\})\$"
latex = re.sub(regex_remove_dollar, r"\1", latex)

# Drop the '(df = a; b)' annotations.
regex_remove_df = r'\(df = [\d\.]+; [\d\.]+\)'
latex = re.sub(regex_remove_df, '', latex)

# Drop the whole 'Residual Std. Error' row (through its trailing '\\' + newline).
regex_remove_resErr = r'Residual Std. Error.*\\\\\n'
latex = re.sub(regex_remove_resErr, '', latex)

# Drop the whole 'F Statistic' row (through its trailing '\\' + newline).
regex_remove_Fstat = r'F Statistic.*\\\\\n'
latex = re.sub(regex_remove_Fstat, '', latex)

# The dollar-stripping step above also hits the significance note inserted via
# `formatting_dict`; put its math-mode dollars back.
to_replace = '\\textit{^{*}p$<$0.05; ^{**}p$<$0.01; ^{***}p$<$0.001}'
replace_with = '\\textit{$^{*}$p$<$0.05; $^{**}$p$<$0.01; $^{***}$p$<$0.001}'
latex = latex.replace(to_replace, replace_with)

# Center the four counts on the Observations row.
regex_center_numObs = r'(Observations & )(\d+ )(\& \d+ )(\& \d+ )(\& \d+)'


def _center_observations(match):
    """Wrap each of the four captured counts in a centered \\multicolumn cell.

    Passed to `re.sub` as a callable: each match is rewritten from its own
    groups, and the returned text is inserted literally, so the LaTeX
    backslashes need no extra escaping for the regex replacement template.
    """
    # Groups 2-5 carry the counts together with their '& ' separators and
    # surrounding spaces; keep only the digits.
    counts = [match.group(i).strip("& ") for i in range(2, 6)]
    cells = ["\\multicolumn{1}{c}{\\hspace{10pt}" + count + "}" for count in counts]
    return match.group(1) + " & ".join(cells)


latex = re.sub(regex_center_numObs, _center_observations, latex)

# Print (rather than rich-display) so the raw LaTeX source can be copied.
print(latex)

\begin{table}[H]
{\fontsize{10.0}{10.0}\selectfont{
\caption{\textbf{Logistic regression models of attrition with low, medium, and high attention categories.} Models differ in how authors' experience is measured using (1) academic age, (2) number of papers by the time of retraction, (3) logged number of citations by the the time of retraction, and (4) logged number of collaborators by the time of retraction, respectively. Controls for author's scientific discipline are included as categorical variables, but are not shown. }
\label{supplementarytab:logit_wSJR_3attention}
\begin{center}
\begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3}}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{4}{c}{\textit{Dependent variable: Attrition}} \
\cr \cline{2-5}\\[-1.8ex]
& \multicolumn{1}{c}{\hspace{10pt}(1)} & \multicolumn{1}{c}{\hspace{10pt}(2)} & \multicolumn{1}{c}{\hspace{10pt}(3)} & \multicolumn{1}{c}{\hspace{10pt}(4)} \\
\hline \\[-1.8ex]
 High Attention  ($>3