In [1]:
import os

import numpy as np
import pandas as pd
import sqlalchemy

In [2]:
#connect to database
# The connection URL embeds the database password, so it is read from the
# environment instead of being stored in the notebook.
db_url = os.environ.get('SCIENTOMETRICS_DB_URL')
if not db_url:
    raise RuntimeError(
        'Set the SCIENTOMETRICS_DB_URL environment variable, e.g. '
        'mysql://<user>:<password>@localhost:3306/scientometrics'
    )
engine = sqlalchemy.create_engine(db_url)
# Placeholders for the remaining data sets; none of them is loaded in this cell.
posts_data, own_comments_data, comments_data, education_data, languages_data, work_data, mentions_data = [None] * 7
# Load the whole FEATURES table. The preprocessing cells select columns by
# integer position, so the column order returned here matters.
scientometrics_data = pd.read_sql_query('''
                 SELECT * FROM FEATURES;
        ''', con=engine)

#log transformation and normalization
def log_normal(df):
    """Apply log(1 + x) to every column, then min-max scale each column to [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; every column is transformed independently.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns. The input is not modified.
        A column whose values are all equal has no range to scale by and is
        returned as all zeros instead of NaN (the result of 0 / 0).
    """
    def _min_max(col):
        # Scale one column; guard against a zero range (constant column).
        lo = col.min()
        span = col.max() - lo
        if span == 0:
            return col - lo
        return (col - lo) / span

    # log1p is the numerically accurate form of log(x + 1) for small x.
    logged = df.transform(np.log1p)
    return logged.transform(_min_max)

In [3]:
# Show the column labels of the loaded feature table. The preprocessing cells
# drop columns by integer position, so this listing is the reference for those
# indices.
print(scientometrics_data.columns)

Index(['ID', 'PAPER_COUNT', 'CITATION_COUNT', 'NUM_FIRST_POS',
       'NUM_SECOND_POS', 'NUM_THIRD_POS', 'NUM_HIGHER_POS',
       'NUM_YEARS_SINCE_FIRST_PUBLICATION',
       'NUM_YEARS_BETWEEN_FIRST_AND_LAST_PUBLICATION',
       'AVG_NUM_PUBLICATIONS_PER_YEAR', 'NUM_INSTITUTIONS',
       'NUM_TOP500_INSTITUTIONS', 'SHANGHAI_RANK', 'NTU_RANK', 'THE_RANK',
       'SHANGHAI_SCORE', 'NTU_SCORE', 'THE_SCORE', 'AVG_TITLE_LENGTH',
       'AVG_ABSTRACT_LENGTH', 'COLLAB_DEGREE_UNWEIGHTED',
       'COLLAB_DEGREE_WEIGHTED', 'COLLAB_CLOSENESS_UNWEIGHTED',
       'COLLAB_CLOSENESS_WEIGHTED', 'COLLAB_BETWEENNESS_UNWEIGHTED',
       'COLLAB_BETWEENNESS_WEIGHTED', 'COLLAB_PAGERANK_UNWEIGHTED',
       'COLLAB_PAGERANK_WEIGHTED', 'COLLAB_EIGENVECTOR_UNWEIGHTED',
       'COLLAB_EIGENVECTOR_WEIGHTED', 'CIT_INDEGREE_UNWEIGHTED',
       'CIT_INDEGREE_WEIGHTED', 'CIT_OUTDEGREE_UNWEIGHTED',
       'CIT_OUTDEGREE_WEIGHTED', 'CIT_CLOSENESS_UNWEIGHTED',
       'CIT_CLOSENESS_WEIGHTED', 'CIT_CLOSENESS_REV_UNWEIGH

In [3]:
#science_quantitative
# Build the "quantitative" data set: CITATION_COUNT becomes the 'score'
# column, a set of columns is dropped by position, and everything that
# remains is log-transformed and min-max scaled.

# Positions of the columns to drop (position 0 is the first column).
excludeFeatures_quantitative = (
    [0]
    # features directly related to number of citations
    + [30, 31, 36, 37, 40, 41, 42, 43]
    # exclude scientific indices
    + list(range(44, 56))
)

science_quantitative = (
    scientometrics_data
    .rename(columns={'CITATION_COUNT': 'score'})
    # resolve the positions to labels on the renamed frame, then drop them
    .pipe(lambda frame: frame.drop(frame.columns[excludeFeatures_quantitative], axis=1))
    #log and scale transformation
    .pipe(log_normal)
)
science_quantitative.to_pickle('../data/preprocessed_science_quantitative.pd')

In [4]:
#science-qualitative
def calculate_score_qualitative(row):
    """Return the qualitative score ``(citations + 1) / papers`` for one row.

    Parameters
    ----------
    row : pandas.Series or sequence
        Record whose element at position 1 is the paper count and whose
        element at position 2 is the citation count.

    Returns
    -------
    float or int
        ``(citation count + 1) / paper count``, or ``0`` when the paper count
        is zero.
    """
    # Integer keys on a label-indexed Series are deprecated as positional
    # lookups, so use .iloc when it exists; plain lists/tuples are indexed
    # directly as before.
    values = row.iloc if hasattr(row, 'iloc') else row
    citationCount = float(values[2])
    paperCount = float(values[1])
    if paperCount == 0:
        return 0
    return (citationCount + 1) / paperCount
    
# Build the "qualitative" data set: a per-row score is computed with
# calculate_score_qualitative and stored as 'score', a set of columns is
# dropped by position, and everything that remains is log-transformed and
# min-max scaled.

#drop related columns
excludeFeatures_qualitative = (
    [0]
    # features directly related to number of citations
    + [2,  30, 31, 36, 37, 40, 41, 42, 43]
    # exclude scientific indices
    + list(range(44, 56))
    # features directly related to number of publications
    + [1, 5, 6, 9]
)

science_qualitative = (
    scientometrics_data
    .assign(score=scientometrics_data.apply(calculate_score_qualitative, axis=1))
    # positions are resolved to labels after 'score' has been added
    .pipe(lambda frame: frame.drop(frame.columns[excludeFeatures_qualitative], axis=1))
    #log and scale transformation
    .pipe(log_normal)
)

print(science_qualitative.head(n=2))
science_qualitative.to_pickle('../data/preprocessed_science_qualitative.pd')

   NUM_FIRST_POS  NUM_SECOND_POS  NUM_YEARS_SINCE_FIRST_PUBLICATION  \
0       0.359146        0.122857                            0.38980   
1       0.359146        0.344903                            0.73781   

   NUM_YEARS_BETWEEN_FIRST_AND_LAST_PUBLICATION  NUM_INSTITUTIONS  \
0                                      0.148817          0.122342   
1                                      0.148817          0.122342   

   NUM_TOP500_INSTITUTIONS  SHANGHAI_RANK  NTU_RANK  THE_RANK  SHANGHAI_SCORE  \
0                 0.422107       0.981686  0.763297  0.576232        0.005625   
1                 0.155787       0.530203  0.236002  0.568291        0.450207   

     ...     CIT_CLOSENESS_UNWEIGHTED  CIT_CLOSENESS_WEIGHTED  \
0    ...                     0.769428                0.639431   
1    ...                     0.770456                0.629827   

   CIT_BETWEENNESS_UNWEIGHTED  CIT_BETWEENNESS_WEIGHTED  TOP_SIM_CORPUS  \
0                    0.000296              0.000000e+00        

In [5]:
#science_hindex
# Build the "h-index" data set: H_INDEX becomes the 'score' column, a set of
# columns is dropped by position, and everything that remains is
# log-transformed and min-max scaled.

# features directly related to number of citations
citation_positions = [2,  30, 31, 36, 37, 40, 41, 42, 43]
# exclude scientific indices
index_positions = list(range(45, 56))
# features directly related to number of publications
publication_positions = [1, 5, 6, 9]
excludeFeatures_hindex = [0] + citation_positions + index_positions + publication_positions
del citation_positions, index_positions, publication_positions

science_hindex = scientometrics_data.rename(columns={'H_INDEX': 'score'})
science_hindex = science_hindex.drop(science_hindex.columns[excludeFeatures_hindex], axis=1)
#log and scale transformation
science_hindex = log_normal(science_hindex)
science_hindex.to_pickle('../data/preprocessed_science_hindex.pd')