In [1]:
import numpy as np
import pandas as pd
import textparser   # For potential use later
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Vectorization Using sklearn

### Read Data

In [2]:
# Load the project table and key it by project code in a single chained step.
projects = pd.read_csv('nrao_projects.csv').set_index('project_code')
# Preview the first rows to confirm the expected columns are present.
projects.head()

Unnamed: 0_level_0,project_title,project_abstract,fs_type,target,raw_text,standardized_text,no_sw_text,lemmatized_sw_text,lemmatized_no_sw_text
project_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018.1.01205.L,Fifty AU STudy of the chemistry in the disk/en...,The huge variety of planetary systems discover...,line,1,Fifty AU STudy of the chemistry in the disk/en...,fifty au study of the chemistry in the disk en...,fifty au study chemistry disk envelope system ...,fifty au study of the chemistry in the disk en...,fifty au study chemistry disk envelope system ...
2022.1.00316.L,COMPASS: Complex Organic Molecules in Protosta...,The emergence of complex organic molecules in ...,line,1,COMPASS: Complex Organic Molecules in Protosta...,compass complex organic molecules in protostar...,compass complex organic molecules protostars s...,compass complex organic molecule in protostars...,compass complex organic molecule protostars sp...
2017.1.00161.L,ALCHEMI: the ALMA Comprehensive High-resolutio...,A great variety in gas composition is observed...,line,1,ALCHEMI: the ALMA Comprehensive High-resolutio...,alchemi the comprehensive high resolution extr...,alchemi comprehensive high resolution extragal...,alchemi the comprehensive high resolution extr...,alchemi comprehensive high resolution extragal...
2021.1.01616.L,ALMA JELLY - Survey of Nearby Jellyfish and Ra...,We propose the first ever statistical survey o...,line,1,ALMA JELLY - Survey of Nearby Jellyfish and Ra...,jelly survey of nearby jellyfish and ram press...,jelly survey nearby jellyfish ram pressure str...,jelly survey of nearby jellyfish and ram press...,jelly survey nearby jellyfish ram pressure str...
2021.1.00869.L,Bulge symmetry or not? The hidden dynamics of ...,A radio survey of red giant SiO sources in the...,line,1,Bulge symmetry or not? The hidden dynamics of ...,bulge symmetry or not the hidden dynamics of t...,bulge symmetry hidden dynamics far side radio ...,bulge symmetry or not the hidden dynamic of th...,bulge symmetry hidden dynamic far side radio s...


### Select only line projects

In [3]:
# Keep only the rows whose fs_type is exactly 'line'.
line_projects = projects.loc[projects['fs_type'].eq('line')]
# (rows, columns) of the filtered table.
line_projects.shape

(3628, 9)

### Use lemmatized text with stopwords removed

In [4]:
# Narrow the table to the lemmatized, stopword-free text; the list selector
# keeps the result a one-column DataFrame rather than a Series.
line_projects = line_projects.loc[:, ['lemmatized_no_sw_text']]
line_projects.head()

Unnamed: 0_level_0,lemmatized_no_sw_text
project_code,Unnamed: 1_level_1
2018.1.01205.L,fifty au study chemistry disk envelope system ...
2022.1.00316.L,compass complex organic molecule protostars sp...
2017.1.00161.L,alchemi comprehensive high resolution extragal...
2021.1.01616.L,jelly survey nearby jellyfish ram pressure str...
2021.1.00869.L,bulge symmetry hidden dynamic far side radio s...


### Generate count vectorized data frame
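The result is a sparse document-term matrix (scikit-learn returns a SciPy sparse matrix rather than a pandas data frame): each row is a document (project title and abstract), each column is a term in the corpus, and each entry is that term's count within the document.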

In [5]:
# Learn the vocabulary from the lemmatized, stopword-free text and build the
# matrix of raw term counts in one pass (default CountVectorizer settings).
count_vectorizer = CountVectorizer()
cv_projects = count_vectorizer.fit_transform(line_projects['lemmatized_no_sw_text'])
# NOTE(review): the displayed vocabulary ends with both 'µm' and 'μm' --
# presumably the micro sign (U+00B5) and Greek mu (U+03BC) spelling the same
# unit. Confirm, and consider Unicode-normalizing the text upstream so the two
# collapse into a single feature.
count_vectorizer.get_feature_names_out()

array(['aa', 'aalto', 'ab', ..., 'µm', 'λcdm', 'μm'], dtype=object)

In [6]:
# Shape of the count matrix: (number of documents, number of vocabulary terms).
cv_projects.shape

(3628, 8641)

### Generate tf-idf vectorized data frame
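The result is a sparse document-term matrix (scikit-learn returns a SciPy sparse matrix rather than a pandas data frame): each row is a document (project title and abstract), each column is a term in the corpus, and each entry is that term's tf-idf weight within the document.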

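Note: this uses scikit-learn's default tf-idf settings, i.e. raw term frequency multiplied by the smoothed inverse document frequency $\mathrm{idf}(t) = \ln\frac{1 + n}{1 + \mathrm{df}(t)} + 1$, with each document vector then L2-normalized.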

In [7]:
# Learn the vocabulary from the lemmatized, stopword-free text and build the
# matrix of tf-idf weights in one pass (default TfidfVectorizer settings).
tfidf_vectorizer = TfidfVectorizer()
tfidf_projects = tfidf_vectorizer.fit_transform(line_projects['lemmatized_no_sw_text'])
# NOTE(review): the displayed vocabulary ends with both 'µm' and 'μm' --
# presumably the micro sign (U+00B5) and Greek mu (U+03BC) spelling the same
# unit. Confirm, and consider Unicode-normalizing the text upstream so the two
# collapse into a single feature.
tfidf_vectorizer.get_feature_names_out()

array(['aa', 'aalto', 'ab', ..., 'µm', 'λcdm', 'μm'], dtype=object)

In [8]:
# Shape of the tf-idf matrix: (number of documents, number of vocabulary terms).
tfidf_projects.shape

(3628, 8641)