In [1]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('main.db')

In [2]:
authors_df = pd.read_csv('authors.csv')
authors_df.to_sql('authors_raw', conn, if_exists = 'replace')
display(authors_df)

papers_df = pd.read_csv('papers.csv')
papers_df.to_sql('papers_raw', conn, if_exists = 'replace')
display(papers_df)


Unnamed: 0,source_id,first_name,last_name,institution
0,27,Alan,Murray,
1,27,Anthony,Smith,
2,27,Zoe,Butler,
3,63,Yaser,Abu-Mostafa,
4,60,Michael,Fleisher,
...,...,...,...,...
30232,8693,Joshua,Wang,Google
30233,2302,Ruho,Kondo,"Toyota Central R&D Labs., Inc."
30234,2302,Keisuke,Kawano,"Toyota Central R&D Labs., Inc"
30235,2302,Satoshi,Koide,Toyota Central R&D Labs.


Unnamed: 0,source_id,year,title,abstract,full_text
0,27,1987,Bit-Serial Neural Networks,,573 \n\nBIT - SERIAL NEURAL NETWORKS \n\nAlan...
1,63,1987,Connectivity Versus Entropy,,1 \n\nCONNECTIVITY VERSUS ENTROPY \n\nYaser S...
2,60,1987,The Hopfield Model with Multi-Level Neurons,,278 \n\nTHE HOPFIELD MODEL WITH MUL TI-LEVEL N...
3,59,1987,How Neural Nets Work,,442 \n\nAlan Lapedes \nRobert Farber \n\nThe...
4,69,1987,Spatial Organization of Neural Networks: A Pro...,,740 \n\nSPATIAL ORGANIZATION OF NEURAL NEn...
...,...,...,...,...,...
9675,5452,2019,Discrete Object Generation with Reversible Ind...,The success of generative modeling in continuo...,Discrete Object Generation\n\nwith Reversible ...
9676,4799,2019,Adaptively Aligned Image Captioning via Adapti...,Recent neural models for image captioning usua...,Adaptively Aligned Image Captioning via\n\nAda...
9677,1827,2019,Fully Dynamic Consistent Facility Location,We consider classic clustering problems in ful...,Fully Dynamic Consistent Facility Location\n\n...
9678,8693,2019,Efficient Rematerialization for Deep Networks,"When training complex neural networks, memory ...",Efﬁcient Rematerialization for Deep Networks\n...


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.corpus import stopwords

In [4]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yewsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Step 1: Load data from SQLite database
papers_df = pd.read_sql('SELECT * FROM papers_raw', conn)

In [6]:
# Step 2: Preprocess the text
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Combine 'title' and 'abstract', and preprocess
papers_df['processed_text'] = (papers_df['title'].fillna('') + ' ' + papers_df['abstract'].fillna('')).apply(preprocess_text)

In [7]:
# Step 3: Group by year
grouped = papers_df.groupby('year')

# Step 4: Calculate TF-IDF for each year and extract top terms
hot_topics = {}

for year, group in grouped:
    year_texts = group['processed_text'].tolist()  # Combine all texts for this year
    
    # Calculate TF-IDF
    vectorizer = TfidfVectorizer(max_features=20)  # Top 20 terms per year
    tfidf_matrix = vectorizer.fit_transform(year_texts)
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.sum(axis=0).A1  # Sum TF-IDF scores across all documents
    
    # Get top terms
    terms_scores = sorted(zip(feature_names, scores), key=lambda x: x[1], reverse=True)
    hot_topics[year] = terms_scores

In [8]:
# Step 5: Display hot topics for each year
for year, topics in hot_topics.items():
    print(f"Year {year}:")
    for term, score in topics:
        print(f"  {term}: {score:.2f}")
    print()

Year 1987:
  neural: 22.10
  networks: 19.47
  network: 9.57
  learning: 8.19
  associative: 5.90
  processing: 4.54
  neurons: 4.07
  memories: 3.92
  memory: 3.77
  analysis: 2.97
  based: 2.95
  cortex: 2.95
  classification: 2.85
  information: 2.71
  models: 2.56
  application: 2.48
  computational: 2.34
  connectionist: 2.06
  systems: 2.01
  artificial: 1.97

Year 1988:
  neural: 15.97
  network: 12.40
  networks: 11.95
  learning: 10.88
  analog: 7.06
  backpropagation: 4.30
  using: 4.11
  adaptive: 3.68
  model: 3.65
  distributed: 3.53
  training: 3.27
  modeling: 3.16
  systems: 3.11
  speech: 2.73
  control: 2.67
  temporal: 2.58
  connectionist: 2.46
  learn: 2.32
  representations: 2.01
  system: 1.98

Year 1989:
  neural: 16.66
  networks: 16.37
  network: 10.42
  learning: 9.02
  model: 5.76
  using: 5.59
  backpropagation: 4.56
  analog: 4.45
  recognition: 4.38
  connectionist: 4.26
  algorithm: 4.20
  speech: 3.57
  representations: 3.28
  computer: 3.28
  associati