In [11]:
import pandas as pd

In [12]:
# Load CSV "publications.csv" data
data = pd.read_csv("publications_new.csv")

# Convert into DataFrame
df = pd.DataFrame(data)

# Show the first 5 rows
df.head()


Unnamed: 0,Professor Name,Interests,h-index,Citations,Year,Abstracts,Link,Home_page
0,Mayank Singh,"['NLP', 'Data Mining', 'Machine Learning']",11,470,2023,The multi-sentential long sequence textual dat...,https://scholar.google.com/citations?view_op=v...,http://mayank4490.github.io/
1,Mayank Singh,"['NLP', 'Data Mining', 'Machine Learning']",11,470,2022,Large language models (LLMs) have been shown t...,https://scholar.google.com/citations?view_op=v...,http://mayank4490.github.io/
2,Mayank Singh,"['NLP', 'Data Mining', 'Machine Learning']",11,470,2022,Scientific documents contain tables that list ...,https://scholar.google.com/citations?view_op=v...,http://mayank4490.github.io/
3,Mayank Singh,"['NLP', 'Data Mining', 'Machine Learning']",11,470,2022,Stock market investors debate and heavily disc...,https://scholar.google.com/citations?view_op=v...,http://mayank4490.github.io/
4,Mayank Singh,"['NLP', 'Data Mining', 'Machine Learning']",11,470,2022,"Tsunamis, power blackouts, and distribution sy...",https://scholar.google.com/citations?view_op=v...,http://mayank4490.github.io/


In [13]:

# List of borad CSE fields
keywords = {
    0: ['language', 'text', 'nlp', 'processing', 'sentiment', 'speech', 'linguistics', 'translation', 'retrieval',],
    1: ['vision', 'image', 'object', 'recognition', 'detection', 'segmentation', 'tracking', 'extraction'],
    2: ['algorithm', 'optimization', 'complexity','automata','cryptography'],
    3: ['neural', 'network', 'deep', 'learning'],
    4: ['data', 'big', 'analytics', 'mining', 'visualization', 'statistics', 'analytics', 'modeling',],
    5: ['robotics', 'control', 'autonomous', 'system'],
    6: ['database', 'query', 'sql', 'transaction', 'scheduling'],
    7: ['security', 'privacy', 'encryption', 'authentication', 'wireless','protocols','topology','routing','firewalls','architecture'],
    8: ['web', 'cloud', 'distributed','parallel','grid','scalability','tolerance','databases','middleware'],
    9: ['rendering','animation','virtual','shading','ray', 'tracing','texture'],
    10: ['compilers', 'interpreters', 'syntax'],
}

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [15]:
# create a dataframe of name of professors and topics named 0 to 10 as columns
professors = df['Professor Name'].tolist()
professors = [x for x in professors if str(x) != 'nan']
# unique professors
professors = list(set(professors))
professors_topic = pd.DataFrame(columns=['Professor Name']+list(range(0,11)))
professors_topic['Professor Name'] = professors

professors_topic.head()
print(professors_topic.shape)




(70, 12)


In [16]:
def topic_modelling(abstracts):
    topics = {}

    for abstract in abstracts:
        # Define the number of topics to generate
        n_topics = 10

        # Create Document-Term Matrix using CountVectorizer
        vectorizer = CountVectorizer(stop_words='english')

        # Fit and transform the vectorizer on the abstracts
        dtm = vectorizer.fit_transform([abstract])

        # Create the LDA model
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
        lda.fit(dtm)

        # print("Number of words: ", len(vectorizer.get_feature_names_out()))
        # print(vectorizer.get_feature_names_out())

        for keyword in vectorizer.get_feature_names_out():
            for key, value in keywords.items():
                if keyword.lower() in [x.lower() for x in value]:
                    # print(keyword.lower(), "->Topic: ", key)
                    # add the topic to the dictionary
                    
                    if key in topics:
                        topics[key] += 1
                    else:
                        topics[key] = 1

    #print topics in descending order of their frequency
    # for key, value in sorted(topics.items(), key=lambda item: item[1], reverse=True):
    #     print("%s: %s" % (key, value))
    return topics


In [18]:
from scipy.stats import zscore

for profs in professors:
    profs_df = df[df['Professor Name'] == profs]
    print("############################################################### ", profs)
    abstracts = profs_df['Abstracts'].tolist()
    abstracts = [x for x in abstracts if str(x) != 'nan']
    topics = topic_modelling(abstracts)
    for key, value in topics.items():
        professors_topic.loc[professors_topic['Professor Name'] == profs, key] = value

# replace nan with 0
professors_topic = professors_topic.fillna(0)
professors_topic.iloc[:,1:] = professors_topic.iloc[:,1:].apply(zscore, axis=1)

# print(professors_topic)

# find cosine similarity between each professor
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(professors_topic.iloc[:,1:])

cosine_sim = pd.DataFrame(cosine_sim)
cosine_sim.columns = professors_topic['Professor Name'].tolist()
cosine_sim.index = professors_topic['Professor Name'].tolist()
# convert into dataframe and save as csv
cosine_sim.to_csv('cosine_sim_LDA_new.csv')

# Add column of h-index for each professor from df
professors_topic['h-index'] = 0
for index, row in professors_topic.iterrows():
    professors_topic.loc[index, 'h-index'] = df.loc[df['Professor Name'] == row['Professor Name'], 'h-index'].iloc[0]


# add a column of total publications for each professor which contains the sum of elements in abstract 
professors_topic['Total Publications'] = 0
for index, row in professors_topic.iterrows():
    professors_topic.loc[index, 'Total Publications'] = df.loc[df['Professor Name'] == row['Professor Name'], 'Abstracts'].count()

# add a column of Link to Google Scholar for each professor
professors_topic['Home Page'] = 0
for index, row in professors_topic.iterrows():
    professors_topic.loc[index, 'Home Page'] = df.loc[df['Professor Name'] == row['Professor Name'], 'Home_page'].iloc[0]

# from df create a df with columns 'Professor Name', '2018, '2019', '2020', '2021', and '2022'. For each professors check the years and add the frequency
professors_topic['2018'] = 0
professors_topic['2019'] = 0
professors_topic['2020'] = 0
professors_topic['2021'] = 0
professors_topic['2022'] = 0
for prof in professors:
    years = df.loc[df['Professor Name'] == prof, 'Year'].tolist()
    for year in years:
        if year == 2018:
            professors_topic.loc[professors_topic['Professor Name'] == prof, '2018'] += 1
        elif year == 2019:
            professors_topic.loc[professors_topic['Professor Name'] == prof, '2019'] += 1
        elif year == 2020:
            professors_topic.loc[professors_topic['Professor Name'] == prof, '2020'] += 1
        elif year == 2021:
            professors_topic.loc[professors_topic['Professor Name'] == prof, '2021'] += 1
        elif year == 2022:
            professors_topic.loc[professors_topic['Professor Name'] == prof, '2022'] += 1

# add a column of Score by adding 40% of 2022, 30% of 2021, 20% of 2020, 5% of 2019, and 5% of 2018
professors_topic['Score'] = 0
for index, row in professors_topic.iterrows():
    professors_topic.loc[index, 'Score'] = (row['2022']*0.4) + (row['2021']*0.3) + (row['2020']*0.2) + (row['2019']*0.05) + (row['2018']*0.05)

# Scale score to range between 0 and 10
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 10))
professors_topic['Score'] = scaler.fit_transform(professors_topic[['Score']])

# save the dataframe to csv
professors_topic.to_csv('professors_topic_LDA_new.csv', index=False)

###############################################################  Subhashish Banerjee
###############################################################  Chester Rebeiro
###############################################################  Shweta Agrawal
###############################################################  Smruti R. Sarangi
###############################################################  Jayant Haritsa
###############################################################  Shirish Shevade
###############################################################  Surender Baswana
###############################################################  Uday Reddy Bondhugula
###############################################################  Rijurekha Sen
###############################################################  Ganesh Ramakrishnan
###############################################################  Gugan Thoppe | गुगन थोप्पे
###############################################################  Chetan Arora
#######



TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [17]:
from scipy.stats import zscore
from f2 import *

topics = {
    0: ['language', 'text', 'nlp', 'processing', 'sentiment', 'speech', 'linguistics', 'translation', 'retrieval',],
    1: ['vision', 'image', 'object', 'recognition', 'detection', 'segmentation', 'tracking', 'extraction'],
    2: ['algorithm', 'optimization', 'complexity','automata','cryptography'],
    3: ['neural', 'network', 'deep', 'learning'],
    4: ['data', 'big', 'analytics', 'mining', 'visualization', 'statistics', 'analytics', 'modeling',],
    5: ['robotics', 'control', 'autonomous', 'system'],
    6: ['database', 'query', 'sql', 'transaction', 'scheduling'],
    7: ['security', 'privacy', 'encryption', 'authentication', 'wireless','protocols','topology','routing','firewalls','architecture'],
    8: ['web', 'cloud', 'distributed','parallel','grid','scalability','tolerance','databases','middleware'],
    9: ['rendering','animation','virtual','shading','ray', 'tracing','texture'],
    10: ['compilers', 'interpreters', 'syntax'],
}


# print(professors_topic)
for profs in professors:
    profs_df = df[df['Professor Name'] == profs]
    print("############################################################### ", profs)
    abstracts = profs_df['Abstracts'].tolist()
    abstracts = [x for x in abstracts if str(x) != 'nan']
    doc_topic_prob = assign_topic_probabilities(abstracts, topics)
    # convert into dataframe
    doc_topic_prob = pd.DataFrame(doc_topic_prob)
    # get the sum along columns and normalize it
    doc_topic_prob = doc_topic_prob.sum(axis=0)
    doc_topic_prob = doc_topic_prob/doc_topic_prob.sum()
    for i in range(0, 11):
        professors_topic.loc[professors_topic['Professor Name'] == profs, i] = doc_topic_prob[i]

# replace nan with 0
professors_topic = professors_topic.fillna(0)

# find cosine similarity between each professor
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(professors_topic.iloc[:,1:], professors_topic.iloc[:,1:])
cosine_sim = pd.DataFrame(cosine_sim)
cosine_sim.columns = professors_topic['Professor Name'].tolist()
cosine_sim.index = professors_topic['Professor Name'].tolist()
print(cosine_sim)
# convert into dataframe and save as csv
cosine_sim.to_csv('cosine_sim_new.csv')

# Add column of h-index for each professor from df
professors_topic['h-index'] = 0
for index, row in professors_topic.iterrows():
    professors_topic.loc[index, 'h-index'] = df.loc[df['Professor Name'] == row['Professor Name'], 'h-index'].iloc[0]


# add a column of total publications for each professor which contains the sum of elements in abstract 
professors_topic['Total Publications'] = 0
for index, row in professors_topic.iterrows():
    professors_topic.loc[index, 'Total Publications'] = df.loc[df['Professor Name'] == row['Professor Name'], 'Abstracts'].count()

# add a column of Link to Google Scholar for each professor
professors_topic['Home Page'] = 0
for index, row in professors_topic.iterrows():
    professors_topic.loc[index, 'Home Page'] = df.loc[df['Professor Name'] == row['Professor Name'], 'Home_page'].iloc[0]

# from df create a df with columns 'Professor Name', '2018, '2019', '2020', '2021', and '2022'. For each professors check the years and add the frequency
professors_topic['2018'] = 0
professors_topic['2019'] = 0
professors_topic['2020'] = 0
professors_topic['2021'] = 0
professors_topic['2022'] = 0
for prof in professors:
    years = df.loc[df['Professor Name'] == prof, 'Year'].tolist()
    for year in years:
        if year == 2018:
            professors_topic.loc[professors_topic['Professor Name'] == prof, '2018'] += 1
        elif year == 2019:
            professors_topic.loc[professors_topic['Professor Name'] == prof, '2019'] += 1
        elif year == 2020:
            professors_topic.loc[professors_topic['Professor Name'] == prof, '2020'] += 1
        elif year == 2021:
            professors_topic.loc[professors_topic['Professor Name'] == prof, '2021'] += 1
        elif year == 2022:
            professors_topic.loc[professors_topic['Professor Name'] == prof, '2022'] += 1

# add a column of Score by adding 40% of 2022, 30% of 2021, 20% of 2020, 5% of 2019, and 5% of 2018
professors_topic['Score'] = 0
for index, row in professors_topic.iterrows():
    professors_topic.loc[index, 'Score'] = (row['2022']*0.4) + (row['2021']*0.3) + (row['2020']*0.2) + (row['2019']*0.05) + (row['2018']*0.05)

# Scale score to range between 0 and 10
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 10))
professors_topic['Score'] = scaler.fit_transform(professors_topic[['Score']])


# save the dataframe to csv
professors_topic.to_csv('professors_topic_new.csv', index=False)

###############################################################  Subhashish Banerjee
###############################################################  Chester Rebeiro
###############################################################  Shweta Agrawal
###############################################################  Smruti R. Sarangi
###############################################################  Jayant Haritsa
###############################################################  Shirish Shevade
###############################################################  Surender Baswana
###############################################################  Uday Reddy Bondhugula
###############################################################  Rijurekha Sen
###############################################################  Ganesh Ramakrishnan
###############################################################  Gugan Thoppe | गुगन थोप्पे
###############################################################  Chetan Arora
#######

  doc_topic_prob = doc_topic_prob / doc_topic_prob.sum(axis=1)[:, np.newaxis]


###############################################################  Arpita Patra
###############################################################  Shivaram Kalyanakrishnan
###############################################################  Prashanth L.A.
###############################################################  Abhishek Bichhawat
###############################################################  Sara Achour
###############################################################  Sukhendu Das
###############################################################  Nima Anari
###############################################################  Anirban Dasgupta
###############################################################  Shishir N. Y. Kolathaya
###############################################################  Phillip Isola
###############################################################  Naveen Garg
###############################################################  Mausam
######################################