In [1]:
import pandas as pd

In [2]:
# Load CSV "publications.csv" data
data = pd.read_csv("publications.csv")

# Convert into DataFrame
df = pd.DataFrame(data)

# Show the first 5 rows
df.head()


Unnamed: 0,Professor Name,Interests,h-index,Abstracts,Link,Home_page
0,Mayank Singh,"['NLP', 'Data Mining', 'Machine Learning']",11,Large language models (LLMs) have been shown t...,https://scholar.google.com/citations?view_op=v...,http://mayank4490.github.io/
1,Mayank Singh,"['NLP', 'Data Mining', 'Machine Learning']",11,Code-mixing is the phenomenon of using more th...,https://scholar.google.com/citations?view_op=v...,http://mayank4490.github.io/
2,Mayank Singh,"['NLP', 'Data Mining', 'Machine Learning']",11,Text generation is a highly active area of res...,https://scholar.google.com/citations?view_op=v...,http://mayank4490.github.io/
3,Mayank Singh,"['NLP', 'Data Mining', 'Machine Learning']",11,Projection of changes in extreme indices of cl...,https://scholar.google.com/citations?view_op=v...,http://mayank4490.github.io/
4,Mayank Singh,"['NLP', 'Data Mining', 'Machine Learning']",11,"Understanding the current research trends, pro...",https://scholar.google.com/citations?view_op=v...,http://mayank4490.github.io/


In [3]:

# List of borad CSE fields
keywords = {
    0: ['language', 'text', 'nlp', 'processing'],
    1: ['vision', 'image', 'object', 'recognition'],
    2: ['algorithm', 'optimization', 'learning', 'model'],
    3: ['neural', 'network', 'deep', 'learning'],
    4: ['data', 'big', 'analytics', 'mining'],
    5: ['robotics', 'control', 'autonomous', 'system'],
    6: ['database', 'query', 'sql', 'transaction'],
    7: ['security', 'privacy', 'encryption', 'authentication'],
    8: ['software', 'engineering', 'testing'],
    9: ['web', 'cloud', 'distributed'],
    10: ['architecture', 'computer', 'system', 'network']
}

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [6]:
# create a dataframe of name of professors and topics named 0 to 10 as columns
professors = df['Professor Name'].tolist()
professors = [x for x in professors if str(x) != 'nan']
# unique professors
professors = list(set(professors))
professors_topic = pd.DataFrame(columns=['Professor Name']+list(range(0,10)))
professors_topic['Professor Name'] = professors

# Add column of h-index for each professor from df
professors_topic['h-index'] = 0
for index, row in professors_topic.iterrows():
    professors_topic.loc[index, 'h-index'] = df.loc[df['Professor Name'] == row['Professor Name'], 'h-index'].iloc[0]


# add a column of total publications for each professor which contains the sum of elements in abstract 
professors_topic['Total Publications'] = 0
for index, row in professors_topic.iterrows():
    professors_topic.loc[index, 'Total Publications'] = df.loc[df['Professor Name'] == row['Professor Name'], 'Abstracts'].count()

# add a column named Score with value h-index+total publications
professors_topic['Score'] = professors_topic['h-index'] + professors_topic['Total Publications']

# Scale the score column to 0 to 10
professors_topic['Score'] = (professors_topic['Score'] - professors_topic['Score'].min()) / (professors_topic['Score'].max() - professors_topic['Score'].min()) * 10

# print(professors_topic[professors_topic['Professor Name'] == 'Mayank Singh'])
professors_topic.head()




Unnamed: 0,Professor Name,0,1,2,3,4,5,6,7,8,9,h-index,Total Publications,Score
0,Mainack Mondal,,,,,,,,,,,17,25,3.586957
1,Sandeep Shukla,,,,,,,,,,,39,7,4.021739
2,Vinay Ribeiro,,,,,,,,,,,22,19,3.478261
3,Uday Reddy Bondhugula,,,,,,,,,,,33,16,4.347826
4,Vinay P. Namboodiri,,,,,,,,,,,27,74,10.0


In [7]:
def topic_modelling(abstracts):
    topics = {}

    for abstract in abstracts:
        # Define the number of topics to generate
        n_topics = 10

        # Create Document-Term Matrix using CountVectorizer
        vectorizer = CountVectorizer(stop_words='english')

        # Fit and transform the vectorizer on the abstracts
        dtm = vectorizer.fit_transform([abstract])

        # Create the LDA model
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
        lda.fit(dtm)

        # print("Number of words: ", len(vectorizer.get_feature_names_out()))
        # print(vectorizer.get_feature_names_out())

        for keyword in vectorizer.get_feature_names_out():
            for key, value in keywords.items():
                if keyword.lower() in [x.lower() for x in value]:
                    # print(keyword.lower(), "->Topic: ", key)
                    # add the topic to the dictionary
                    
                    if key in topics:
                        topics[key] += 1
                    else:
                        topics[key] = 1

    #print topics in descending order of their frequency
    # for key, value in sorted(topics.items(), key=lambda item: item[1], reverse=True):
    #     print("%s: %s" % (key, value))
    return topics


In [8]:
from scipy.stats import zscore

for profs in professors:
    profs_df = df[df['Professor Name'] == profs]
    print("############################################################### ", profs)
    abstracts = profs_df['Abstracts'].tolist()
    abstracts = [x for x in abstracts if str(x) != 'nan']
    topics = topic_modelling(abstracts)
    for key, value in topics.items():
        professors_topic.loc[professors_topic['Professor Name'] == profs, key] = value

# replace nan with 0
professors_topic = professors_topic.fillna(0)
professors_topic.iloc[:,1:] = professors_topic.iloc[:,1:].apply(zscore, axis=1)
# print(professors_topic)

# find cosine similarity between each professor
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(professors_topic.iloc[:,1:])

cosine_sim = pd.DataFrame(cosine_sim)
cosine_sim.columns = professors_topic['Professor Name'].tolist()
cosine_sim.index = professors_topic['Professor Name'].tolist()
# convert into dataframe and save as csv
cosine_sim.to_csv('cosine_sim_LDA.csv')

# save the dataframe to csv
professors_topic.to_csv('professors_topic.csv', index=False)

###############################################################  Mainack Mondal
###############################################################  Sandeep Shukla
###############################################################  Vinay Ribeiro
###############################################################  Uday Reddy Bondhugula
###############################################################  Vinay P. Namboodiri
###############################################################  Anirban Dasgupta
###############################################################  Supratik Chakraborty
###############################################################  Gugan Thoppe | गुगन थोप्पे
###############################################################  Chester Rebeiro
###############################################################  Om P. Damani
###############################################################  Ajit Rajwade
###############################################################  Ashutosh Modi
#############

In [9]:
from scipy.stats import zscore
from f2 import *

topics = {
    0: ['language', 'text', 'nlp', 'processing'],
    1: ['vision', 'image', 'object', 'recognition'],
    2: ['algorithm', 'optimization', 'learning', 'model'],
    3: ['neural', 'network', 'deep', 'learning'],
    4: ['data', 'big', 'analytics', 'mining'],
    5: ['robotics', 'control', 'autonomous', 'system'],
    6: ['database', 'query', 'sql', 'transaction'],
    7: ['security', 'privacy', 'encryption', 'authentication'],
    8: ['software', 'engineering', 'testing'],
    9: ['web', 'cloud', 'distributed'],
}

# print(professors_topic)
for profs in professors:
    profs_df = df[df['Professor Name'] == profs]
    print("############################################################### ", profs)
    abstracts = profs_df['Abstracts'].tolist()
    abstracts = [x for x in abstracts if str(x) != 'nan']
    doc_topic_prob = assign_topic_probabilities(abstracts, topics)
    # convert into dataframe
    doc_topic_prob = pd.DataFrame(doc_topic_prob)
    # get the sum along columns and normalize it
    doc_topic_prob = doc_topic_prob.sum(axis=0)
    doc_topic_prob = doc_topic_prob/doc_topic_prob.sum()
    for i in range(0, 10):
        professors_topic.loc[professors_topic['Professor Name'] == profs, i] = doc_topic_prob[i]

# replace nan with 0
professors_topic = professors_topic.fillna(0)

# find cosine similarity between each professor
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(professors_topic.iloc[:,1:], professors_topic.iloc[:,1:])
cosine_sim = pd.DataFrame(cosine_sim)
cosine_sim.columns = professors_topic['Professor Name'].tolist()
cosine_sim.index = professors_topic['Professor Name'].tolist()
print(cosine_sim)
# convert into dataframe and save as csv
cosine_sim.to_csv('cosine_sim.csv')


# save the dataframe to csv
professors_topic.to_csv('professors_topic.csv', index=False)

###############################################################  Mainack Mondal
###############################################################  Sandeep Shukla
###############################################################  Vinay Ribeiro
###############################################################  Uday Reddy Bondhugula
###############################################################  Vinay P. Namboodiri
###############################################################  Anirban Dasgupta
###############################################################  Supratik Chakraborty
###############################################################  Gugan Thoppe | गुगन थोप्पे
###############################################################  Chester Rebeiro


  doc_topic_prob = doc_topic_prob / doc_topic_prob.sum(axis=1)[:, np.newaxis]


###############################################################  Om P. Damani
###############################################################  Ajit Rajwade
###############################################################  Ashutosh Modi
###############################################################  Pallab Dasgupta
###############################################################  Suyash P. Awate
###############################################################  Bivas Mitra
###############################################################  Sayan Ranu
###############################################################  Shanmuganathan Raman
###############################################################  Raghavan Komondoor
###############################################################  Partha Talukdar
###############################################################  Ganesh Ramakrishnan
###############################################################  abhilash jindal
#################################