In [1]:
!pip install contextualized-topic-models

Collecting contextualized-topic-models
  Downloading contextualized_topic_models-2.5.0-py2.py3-none-any.whl (36 kB)
Collecting gensim==4.2.0 (from contextualized-topic-models)
  Downloading gensim-4.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers>=2.1.1 (from contextualized-topic-models)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ipywidgets==7.5.1 (from contextualized-topic-models)
  Downloading ipywidgets-7.5.1-py2.py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ipython==8.10.0 (from co

## Import General Utility Libraries

In [2]:
import re
import urllib
import gzip
import io
import csv
import random
from collections import defaultdict
from tqdm import tqdm

Where to store the data file. If you want, you can adjust the path.

In [3]:
path_before_1990 = '/content/drive/My Drive/titles_before_1990.txt'
path_from_1990_to_2009 = '/content/drive/My Drive/titles_from_1990_to_2009.txt'
path_from_2010 = '/content/drive/My Drive/titles_from_2010.txt'

Execute the following cell only once to download the data and write it as a file to your google drive. Afterwards, skip this cell or comment it out.

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# to download the data manually or get more information, go to: https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html
url = 'https://dblp.uni-trier.de/xml/dblp.xml.gz'
# num_titles = 500000  # the (max)number of titles to load


def load_gzip_file(url):
    """Download Gzip-file."""
    response = urllib.request.urlopen(url)
    compressed_file = io.BytesIO(response.read())
    decompressed_file = gzip.GzipFile(fileobj=compressed_file)
    return decompressed_file

def extract_titles(input_file, max_num=40000):
    """Extract title and publication year of dblp papers, given as input file.

    Divide the papers into 3 time periods.

    Collect max max_num papers per time period.
    """
    pairs_before_1990 = []
    count_before_1990 = 0
    pairs_from_1990_to_2009 = []
    count_from_1990_to_2009 = 0
    pairs_from_2010 = []
    count_from_2010 = 0
    got_title = False
    for line in tqdm(input_file):
        line_str = line.decode('utf-8')
        if got_title:
            # we have a title and check for the corresponding year
            year_result = re.search(r'<year>(.*)</year>', line_str)
            if year_result:
                # we also have the year and thus save the title-year pair
                year = int(year_result.group(1))
                if year < 1990:
                    pairs_before_1990.append((title, year))
                    count_before_1990 += 1
                elif year < 2010:
                    pairs_from_1990_to_2009.append((title, year))
                    count_from_1990_to_2009 += 1
                else:
                    pairs_from_2010.append((title, year))
                    count_from_2010 += 1
                got_title = False
        else:
            # we have no title and search for title
            result = re.search(r'<title>(.*)</title>', line_str)
            if result:
                title = result.group(1)
                if len(title.split(' ')) < 3:
                    # only include titles with at least four words
                    continue
                got_title = True

        if count_before_1990 >= max_num and count_from_1990_to_2009 >= max_num and count_from_2010 >= max_num:
            return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

    return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

def save_data(pairs, file_path):
    with open(file_path, 'w') as fout:
        writer = csv.writer(fout)
        for pair in pairs:
            writer.writerow(pair)

in_file = load_gzip_file(url)
pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010 = extract_titles(in_file)
save_data(pairs_before_1990, path_before_1990)
save_data(pairs_from_1990_to_2009, path_from_1990_to_2009)
save_data(pairs_from_2010, path_from_2010)

Mounted at /content/drive


NameError: ignored

Mount your google drive (in case it is not yet mounted) so that the newly created files are available.

# LDA

In [5]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [6]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = text.lower()
    return text

### Before the 1990s:

In [None]:
with open(path_before_1990) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

Let's perform some simple preprocessing:

In [None]:
prepro_titles = [preprocess_text(title) for title in titles]

In [None]:
prepro_titles[:10]

['object model capabilities for distributed object management',
 'distributed object management technology',
 'muffin a distributed database machine',
 'algebraical optimization of ftaexpressions',
 'wissensrepraumlsentation und maschinelles lernen',
 'an algebraic characterization of stuf',
 'zur systemarchitektur von lilog',
 'mengenorientierte auswertung von anfragen in der logikprogrammiersprache prolog',
 'definite resolution over constraint languages',
 'dokumentation der syntax der liloggrammatik']

Now we turn the documents (or titles in this case) into a matrix feature representation.

##### Changed N gram range (n gram range of (1,3) seems to give more distinct topics)

In [None]:
num_features = 500
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: systems analysis problem using sets languages performance von set decision based architecture
Topic 1: computer data new application processing graphs structure switching detection development study class
Topic 2: logic algorithm programming optimal network circuits solution evaluation techniques programs research search
Topic 3: control information approach software memory machines dynamic computers boolean management stochastic function
Topic 4: networks theory linear parallel models digital problems applications finite binary number methods
Topic 5: note functions method algorithms recognition sequential pattern chemical machine synthesis adaptive technical
Topic 6: design model language distributed time der theorem implementation generation zur optimization simulation


In [None]:
num_features = 500
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,2))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: sets set machine nonlinear generalized management interactive graphics graph fuzzy feature editor
Topic 1: note model information application sequential languages development technical stochastic function technical note automata
Topic 2: problem optimal circuits der theorem von solution techniques programs adaptive und search
Topic 3: logic data programming functions using method processing language chemical structure synthesis detection
Topic 4: algorithm networks theory linear approach algorithms parallel models applications graphs computing structures
Topic 5: systems control analysis recognition distributed performance machines finite binary dynamic pattern evaluation
Topic 6: computer design new problems network digital software memory time simulation number methods


##### Did not remove stop words (Is not better)

In [None]:
num_features = 500
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: for systems computer by networks linear programming on of parallel digital distributed
Topic 1: of the on and of the on the logic analysis to for approach new
Topic 2: on note some information functions and method note on in recognition pattern von
Topic 3: the for to algorithm optimal for the of algorithm for from to the problem finite
Topic 4: of and with in systems models sequential performance applications problems machines languages
Topic 5: in an of system for the and theory in the network algorithms model
Topic 6: of control design and using in on data sets design of dynamic language


##### Changed max_df (No change)

In [None]:
num_features = 500
tf_vectorizer = CountVectorizer(max_df=0.98, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: systems analysis problem using sets languages performance von set decision based architecture
Topic 1: computer data new application processing graphs structure switching detection development study class
Topic 2: logic algorithm programming optimal network circuits solution evaluation techniques programs research search
Topic 3: control information approach software memory machines dynamic computers boolean management stochastic function
Topic 4: networks theory linear parallel models digital problems applications finite binary number methods
Topic 5: note functions method algorithms recognition sequential pattern chemical machine synthesis adaptive technical
Topic 6: design model language distributed time der theorem implementation generation zur optimization simulation


In [None]:
num_features = 500
tf_vectorizer = CountVectorizer(max_df=0.5, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: systems analysis problem using sets languages performance von set decision based architecture
Topic 1: computer data new application processing graphs structure switching detection development study class
Topic 2: logic algorithm programming optimal network circuits solution evaluation techniques programs research search
Topic 3: control information approach software memory machines dynamic computers boolean management stochastic function
Topic 4: networks theory linear parallel models digital problems applications finite binary number methods
Topic 5: note functions method algorithms recognition sequential pattern chemical machine synthesis adaptive technical
Topic 6: design model language distributed time der theorem implementation generation zur optimization simulation


In [None]:
num_features = 500
tf_vectorizer = CountVectorizer(max_df=0.1, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: systems analysis problem using sets languages performance von set decision based architecture
Topic 1: computer data new application processing graphs structure switching detection development study class
Topic 2: logic algorithm programming optimal network circuits solution evaluation techniques programs research search
Topic 3: control information approach software memory machines dynamic computers boolean management stochastic function
Topic 4: networks theory linear parallel models digital problems applications finite binary number methods
Topic 5: note functions method algorithms recognition sequential pattern chemical machine synthesis adaptive technical
Topic 6: design model language distributed time der theorem implementation generation zur optimization simulation


##### Changed num features (Does not seem to improve results too much)

In [None]:
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: model functions logic using application sequential systems graphs theorem programs development boolean
Topic 1: control analysis systems digital memory machines dynamic methods approach processing applications automatic
Topic 2: computer new information number chemical study random general technique logic science function
Topic 3: algorithm optimal method problem recognition time binary pattern solution techniques machine switching
Topic 4: design note network algorithms systems sets der distributed adaptive technical review implementation
Topic 5: systems linear programming parallel finite languages using structure language comments based architecture
Topic 6: networks theory problems computer data structures evaluation use program detection research nonlinear


In [None]:
num_features = 1500
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: analysis algorithm data sequential approach problems applications graphs computing techniques detection adaptive
Topic 1: application circuits theorem von set development estimation class management properties arithmetic trees
Topic 2: networks note functions optimal method sets finite solution program synthesis research computers
Topic 3: design logic systems models language programming processing languages distributed dynamic data implementation
Topic 4: computer systems control using theory parallel digital performance new model time simulation
Topic 5: linear systems machines binary number methods structures comments use decision review introduction
Topic 6: information network algorithms recognition memory problem der pattern und ii generalized zur


In [None]:
num_features = 5000
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: computer logic theory new control programs simulation study complexity finite modal technique
Topic 1: using parallel programming der set logic based und fault processing graphics processor
Topic 2: note information network algorithms number structures computers technical review introduction efficient interactive
Topic 3: networks systems model functions data computer digital application recognition distributed sequential time
Topic 4: analysis algorithm control optimal problems systems approach graphs machines languages dynamic theorem
Topic 5: design systems linear software binary methods implementation comments use models architecture language
Topic 6: problem circuits memory chemical von sets solution techniques synthesis detection nonlinear zur


In [None]:
num_features = 100
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: computer networks model optimal problems sequential circuits applications sets time der program
Topic 1: systems theory linear simulation number methods structures use decision nonlinear introduction random
Topic 2: algorithm data programming approach distributed graphs theorem structure set machine automatic management
Topic 3: control design information new machines finite dynamic evaluation research study letter sequential
Topic 4: using parallel digital language performance processing computing chemical based computers search image
Topic 5: analysis method network algorithms models application recognition binary pattern techniques development review
Topic 6: logic note problem functions software memory languages implementation solution programs architecture technical


##### Changed num of topics


In [None]:
num_features = 100
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 5
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: design analysis model language sequential performance sets graphs time der chemical implementation
Topic 1: systems linear network algorithms recognition software memory pattern methods decision architecture computers
Topic 2: logic data note problem functions method application distributed theorem computing structure solution
Topic 3: computer control algorithm networks information approach optimal new circuits machines finite binary
Topic 4: using theory programming problems parallel models digital processing applications simulation languages structures


In [None]:
num_features = 100
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: computer networks model optimal problems sequential circuits applications sets time der program
Topic 1: systems theory linear simulation number methods structures use decision nonlinear introduction random
Topic 2: algorithm data programming approach distributed graphs theorem structure set machine automatic management
Topic 3: control design information new machines finite dynamic evaluation research study letter sequential
Topic 4: using parallel digital language performance processing computing chemical based computers search image
Topic 5: analysis method network algorithms models application recognition binary pattern techniques development review
Topic 6: logic note problem functions software memory languages implementation solution programs architecture technical


In [None]:
num_features = 100
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 10
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: using problems parallel digital circuits applications simulation nonlinear image processing sequential computers
Topic 1: systems linear performance software memory methods decision architecture computers management database distributed
Topic 2: data note functions distributed graphs structure machine technical estimation search generalized stochastic
Topic 3: control information recognition time machines dynamic pattern general sequential stochastic automatic approach
Topic 4: model theory models sequential processing binary number structures use review introduction random
Topic 5: computer new finite based study class architecture networks approach automata evaluation performance
Topic 6: logic problem method der implementation set programs optimization processes automata solution ii
Topic 7: design optimal application sets theorem chemical development structure automatic software control memory
Topic 8: algorithm networks approach network algorithms computing solution evalu

In [None]:
num_features = 100
tf_vectorizer = CountVectorizer(max_df=0.5, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 15
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: using parallel performance software time pattern computers processes editor recognition image letter
Topic 1: systems linear estimation database decision distributed stochastic time performance control architecture optimal
Topic 2: data distributed graphs structure set machine management stochastic database model automata approach
Topic 3: control model sequential machines dynamic stochastic optimal automatic method processes circuits approach
Topic 4: models number methods structures use introduction generalized random chemical graphs theory properties
Topic 5: theory problems algorithms recognition binary decision architecture review ii efficient automata pattern
Topic 6: design memory evaluation programs research database optimization letter logic systems editor management
Topic 7: note information functions technical general technical note software problem chemical data decision estimation
Topic 8: network application processing solution development search image communicat

Topics:
0. Computer networks
1. Linear and non linear methods
2. Distributed algorithms and graph theorams
3. Design control and Finite machines
4. Parallel processing on images and chemicals
5. Pattern recognition techniques (perhaps using network algorithms)
6. Logic implementation ( perhaps using software languages and techincal architecture)


### From 1990 to 2009:

Add your code for topic modelling the period from 1990 to 2009 here...

In [None]:
with open(path_from_1990_to_2009) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

prepro_titles = [preprocess_text(title) for title in titles]



In [None]:
num_features = 100
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: approach adaptive applications dynamic power architecture programming logic services nonlinear new linear
Topic 1: method performance models robust problems evaluation parallel optimization case generalized equation stochastic
Topic 2: using control information methods modeling management fuzzy distributed theory structure selection technology
Topic 3: data study problem graphs software development identification classification case research use solution
Topic 4: analysis application estimation image scheme web recognition computer processing implementation search special
Topic 5: networks design algorithm linear nonlinear new network algorithms optimal equations neural efficient
Topic 6: systems model based learning knowledge realtime hybrid nonlinear stability linear control communication


Topics:
0. Non linear and linear approaches
1. Generalized and robust optimzation methods
2. Information theory and management
3. Identification and clasification (using data and graphs maybe)
4. Image recognition and web search
5. Neural network algorithms
6. Realtime knowledge learning systems


### From 2010 onwards:

Add your code for topic modelling the period from 2010 onwards here...

In [11]:
with open(path_from_2010) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

prepro_titles = [preprocess_text(title) for title in titles]

In [12]:
num_features = 100
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2, max_features=num_features, stop_words='english', ngram_range = (1,3))
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()



num_lda_topics = 7
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: optimization new application equations mobile applications modeling methods problem hybrid multiple development
Topic 1: model learning algorithm design adaptive efficient prediction stochastic machine improved smart distribution
Topic 2: systems control method estimation performance robust problems recognition evaluation images realtime segmentation
Topic 3: nonlinear information framework scheme management online scheduling service distributed systems energy adaptive
Topic 4: networks analysis network approach neural study dynamic wireless novel social case research
Topic 5: image linear time classification sensor computing tracking dynamics communication selection human digital
Topic 6: using based data detection deep distributed models optimal power energy fuzzy algorithms


Topics:
0. New mobile applications
1. Machine Learning and improved smart algorithms
2. Realtime image recognition and segmentation
3. Distributed systems
4. Neural networks
5. Image classification and human tracking
6. Data based models (possibly deep learning).




# Combined Topic Models

Method developed by [Bianchi et al. 2021](https://aclanthology.org/2021.acl-short.96/).

[A 6min presentation of the paper by one of the authors.](https://underline.io/lecture/25716-pre-training-is-a-hot-topic-contextualized-document-embeddings-improve-topic-coherence)

Code: [https://github.com/MilaNLProc/contextualized-topic-models](https://github.com/MilaNLProc/contextualized-topic-models)

Tutorial: [https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing](https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing)

Again, perform topic modelling for the three time periods - this time using the combined topic models (CTMs).

You can use and adapt the code from the tutorial linked above.

Use the available GPU for faster running times.

In [5]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
import nltk
from nltk.corpus import stopwords as stop_words

nltk.download('stopwords')
num_ctm_topics = 7  # you can also choose a higher number of topics

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Before the 1990s:

In [14]:


documents = [line.strip() for line in open(path_before_1990, encoding="utf-8").readlines()]

stopwords = list(stop_words.words("english"))

sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
tp = TopicModelDataPreparation("all-mpnet-base-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/198 [00:00<?, ?it/s]

In [16]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=10)
ctm.fit(training_dataset) # run the model

Epoch: [10/10]	 Seen Samples: [394880/395300]	Train Loss: 35.544203126063415	Time: 0:00:12.793239: : 10it [01:59, 11.97s/it]
100%|██████████| 618/618 [00:10<00:00, 57.39it/s]


In [17]:
for topic in ctm.get_topic_lists(10):
  print('Topic: ', topic)

Topic:  ['programming', 'recognition', 'using', 'based', 'language', 'approach', 'pattern', 'data', 'system', 'knowledge']
Topic:  ['algorithm', 'note', 'problem', 'problems', 'algorithms', 'technical', 'solution', 'two', 'parallel', 'method']
Topic:  ['systems', 'control', 'time', 'optimal', 'linear', 'model', 'models', 'decision', 'analysis', 'estimation']
Topic:  ['networks', 'design', 'computer', 'network', 'fault', 'performance', 'data', 'local', 'architecture', 'digital']
Topic:  ['logic', 'calculus', 'von', 'uuml', 'der', 'propositional', 'symbolic', 'und', 'de', 'zur']
Topic:  ['research', 'information', 'science', 'future', 'library', 'engineering', 'report', 'technology', 'education', 'ai']
Topic:  ['sub', 'sets', 'properties', 'automata', 'free', 'sup', 'grammars', 'arithmetic', 'types', 'algebras']


### From 1990 to 2009

In [10]:

documents = [line.strip() for line in open(path_from_1990_to_2009, encoding="utf-8").readlines()]

stopwords = list(stop_words.words("english"))

sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

In [11]:
tp = TopicModelDataPreparation("all-mpnet-base-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/1637 [00:00<?, ?it/s]

In [12]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=10)
ctm.fit(training_dataset) # run the model

Epoch: [10/10]	 Seen Samples: [3272320/3272720]	Train Loss: 41.95292527788007	Time: 0:01:35.353657: : 10it [17:04, 102.50s/it]
100%|██████████| 5114/5114 [01:25<00:00, 60.13it/s]


In [13]:
for topic in ctm.get_topic_lists(10):
  print('Topic: ', topic)

Topic:  ['information', 'system', 'study', 'development', 'knowledge', 'case', 'management', 'web', 'learning', 'software']
Topic:  ['using', 'data', 'image', 'recognition', 'analysis', 'based', 'images', 'classification', 'neural', 'detection']
Topic:  ['problems', 'equations', 'problem', 'method', 'order', 'solution', 'methods', 'finite', 'algorithm', 'solutions']
Topic:  ['special', 'issue', 'introduction', 'uuml', 'der', 'editorial', 'eacute', 'de', 'und', 'auml']
Topic:  ['systems', 'control', 'time', 'sub', 'sup', 'linear', 'nonlinear', 'robust', 'discrete', 'adaptive']
Topic:  ['networks', 'wireless', 'performance', 'power', 'high', 'low', 'routing', 'mobile', 'sensor', 'network']
Topic:  ['impulse', 'errors', 'simplified', 'estimator', 'covariance', 'failures', 'layered', 'angle', 'faults', 'feasibility']


### From 2010 onwards

In [6]:

documents = [line.strip() for line in open(path_from_2010, encoding="utf-8").readlines()]

stopwords = list(stop_words.words("english"))

sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()


In [7]:
tp = TopicModelDataPreparation("all-mpnet-base-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]



Batches:   0%|          | 0/4631 [00:00<?, ?it/s]

In [10]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=4)
ctm.fit(training_dataset) # run the model

  and should_run_async(code)
Epoch: [4/4]	 Seen Samples: [3704064/3704300]	Train Loss: 51.675873731032745	Time: 0:04:46.027343: : 4it [19:06, 286.62s/it]

  0%|          | 0/14470 [00:00<?, ?it/s][A
  0%|          | 1/14470 [00:00<1:44:27,  2.31it/s][A
  0%|          | 8/14470 [00:00<13:30, 17.85it/s]  [A
  0%|          | 14/14470 [00:00<08:48, 27.35it/s][A
  0%|          | 22/14470 [00:00<06:05, 39.58it/s][A
  0%|          | 29/14470 [00:00<05:08, 46.87it/s][A
  0%|          | 36/14470 [00:01<04:38, 51.88it/s][A
  0%|          | 43/14470 [00:01<04:14, 56.75it/s][A
  0%|          | 50/14470 [00:01<04:12, 57.05it/s][A
  0%|          | 57/14470 [00:01<03:58, 60.36it/s][A
  0%|          | 64/14470 [00:01<03:56, 60.94it/s][A
  0%|          | 71/14470 [00:01<04:01, 59.57it/s][A
  1%|          | 79/14470 [00:01<03:47, 63.15it/s][A
  1%|          | 87/14470 [00:01<03:44, 63.97it/s][A
  1%|          | 95/14470 [00:01<03:41, 64.82it/s][A
  1%|          | 103/14470 [00:02<03:36, 6

In [14]:
for topic in ctm.get_topic_lists(10):
  print('Topic: ', topic)

Topic:  ['learning', 'deep', 'network', 'neural', 'based', 'detection', 'machine', 'classification', 'recognition', 'using']
Topic:  ['review', 'technology', 'special', 'research', 'media', 'social', 'issue', 'health', 'challenges', 'digital']
Topic:  ['control', 'time', 'systems', 'sub', 'nonlinear', 'order', 'linear', 'finite', 'equations', 'differential']
Topic:  ['wireless', 'networks', 'energy', 'efficient', 'sensor', 'power', 'resource', 'allocation', 'computing', 'scheme']
Topic:  ['optimizer', 'simplified', 'cascade', 'weight', 'plants', 'weighting', 'metaheuristic', 'peak', 'obstacle', 'multilayer']
Topic:  ['optimization', 'analysis', 'algorithm', 'model', 'fuzzy', 'decision', 'approach', 'problem', 'multi', 'making']
Topic:  ['image', 'sensing', 'images', 'remote', 'estimation', 'resolution', 'hyperspectral', 'sar', 'sparse', 'imaging']


  and should_run_async(code)


In [None]:
4