In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import plotly_express as px
import textparser   # For potential use later
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from textparser import TextParser
from ast import literal_eval
from sklearn.naive_bayes import MultinomialNB
import sklearn
from sklearn.model_selection import train_test_split
import statistics
import math
import random

In [2]:
# Data cleaning
measurements = pd.read_csv('nrao_measurements.csv')
measurements = measurements.drop_duplicates()
measurements['freq_diff'] = measurements['high_freq'] - measurements['low_freq']
measurements = measurements[measurements.freq_diff < 5] # Adele mentioned that frequency range should never be greater than 4 so remove those that do not meet specifications
measurements['freq_med'] = (measurements['low_freq'] + (measurements['freq_diff']/2))
measurements = measurements.query('fs_type == "line"')
measurements['band'] = pd.to_numeric(measurements['band'], errors='coerce', downcast='integer')
valid_band_values = set(range(1, 11))
measurements = measurements[measurements['band'].isin(valid_band_values)] # Removing any rows with incorrect band formatting
measurements

  measurements = pd.read_csv('nrao_measurements.csv')


Unnamed: 0,project_code,project_title,project_abstract,fs_type,low_freq,high_freq,science_category,science_keyword,band,target,diff_freq,med_freq,raw_text,standardized_text,no_sw_text,lemmatized_sw_text,lemmatized_no_sw_text,freq_diff,freq_med
0,2011.0.00010.S,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,line,90.38,90.62,Active galaxies,"Active Galactic Nuclei (AGN)/Quasars (QSO), Me...",3.0,1,0.24,90.500,The Physics and Chemisty of Gas in Centaurus A...,the physics and chemisty of gas in centaurus a...,physics chemisty gas centaurus host v centauru...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus host v centaurus...,0.24,90.500
1,2011.0.00010.S,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,line,90.70,90.93,Active galaxies,"Active Galactic Nuclei (AGN)/Quasars (QSO), Me...",3.0,1,0.23,90.815,The Physics and Chemisty of Gas in Centaurus A...,the physics and chemisty of gas in centaurus a...,physics chemisty gas centaurus host v centauru...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus host v centaurus...,0.23,90.815
2,2011.0.00010.S,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,line,91.69,91.92,Active galaxies,"Active Galactic Nuclei (AGN)/Quasars (QSO), Me...",3.0,1,0.23,91.805,The Physics and Chemisty of Gas in Centaurus A...,the physics and chemisty of gas in centaurus a...,physics chemisty gas centaurus host v centauru...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus host v centaurus...,0.23,91.805
3,2011.0.00010.S,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,line,92.89,93.12,Active galaxies,"Active Galactic Nuclei (AGN)/Quasars (QSO), Me...",3.0,1,0.23,93.005,The Physics and Chemisty of Gas in Centaurus A...,the physics and chemisty of gas in centaurus a...,physics chemisty gas centaurus host v centauru...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus host v centaurus...,0.23,93.005
4,2011.0.00010.S,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,line,217.59,218.53,Active galaxies,"Active Galactic Nuclei (AGN)/Quasars (QSO), Me...",6.0,1,0.94,218.060,The Physics and Chemisty of Gas in Centaurus A...,the physics and chemisty of gas in centaurus a...,physics chemisty gas centaurus host v centauru...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus host v centaurus...,0.94,218.060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67434,2023.A.00009.S,Finding t=0: Tracing the Origins of Rocky Plan...,Recent JWST observations serendipitously revea...,line,345.20,345.67,Disks and planet formation,Disks around low-mass stars,7.0,1,0.47,345.435,Finding t=0: Tracing the Origins of Rocky Plan...,finding t tracing the origins of rocky planete...,finding tracing origins rocky planetesimals jw...,find t trace the origin of rocky planetesimal ...,find trace origin rocky planetesimal jwst rece...,0.47,345.435
67435,2023.A.00009.S,Finding t=0: Tracing the Origins of Rocky Plan...,Recent JWST observations serendipitously revea...,line,345.55,346.02,Disks and planet formation,Disks around low-mass stars,7.0,1,0.47,345.785,Finding t=0: Tracing the Origins of Rocky Plan...,finding t tracing the origins of rocky planete...,finding tracing origins rocky planetesimals jw...,find t trace the origin of rocky planetesimal ...,find trace origin rocky planetesimal jwst rece...,0.47,345.785
67436,2023.A.00009.S,Finding t=0: Tracing the Origins of Rocky Plan...,Recent JWST observations serendipitously revea...,line,346.02,346.96,Disks and planet formation,Disks around low-mass stars,7.0,1,0.94,346.490,Finding t=0: Tracing the Origins of Rocky Plan...,finding t tracing the origins of rocky planete...,finding tracing origins rocky planetesimals jw...,find t trace the origin of rocky planetesimal ...,find trace origin rocky planetesimal jwst rece...,0.94,346.490
67437,2023.A.00009.S,Finding t=0: Tracing the Origins of Rocky Plan...,Recent JWST observations serendipitously revea...,line,346.85,347.79,Disks and planet formation,Disks around low-mass stars,7.0,1,0.94,347.320,Finding t=0: Tracing the Origins of Rocky Plan...,finding t tracing the origins of rocky planete...,finding tracing origins rocky planetesimals jw...,find t trace the origin of rocky planetesimal ...,find trace origin rocky planetesimal jwst rece...,0.94,347.320


In [3]:
def adjust_frequency_ranges(df):
    new_rows = []
    grouped = df.groupby('project_code') # Group by project_code
    # Iterate through each group
    for _, group in grouped:

        group = group.sort_values(by='low_freq') # Sort by low_freq
        
        # Initialize variables for the new rows
        new_low = group.iloc[0]['low_freq']
        new_high = group.iloc[0]['high_freq']
        
        # Iterate through rows to find overlapping ranges
        for i in range(1, len(group)):
            if group.iloc[i]['low_freq'] <= new_high:
                # There's an overlap, adjust new_high
                new_high = max(new_high, group.iloc[i]['high_freq'])
            else:
                # No overlap, add the previous range as a new row
                if new_high - new_low <= 4:
                    new_rows.append({'project_code': group.iloc[i-1]['project_code'],
                                     'low_freq': new_low,
                                     'high_freq': new_high})
                else:
                    # Split the range into multiple rows
                    num_ranges = math.ceil((new_high - new_low) / 4)
                    freq_step = (new_high - new_low) / num_ranges
                    for j in range(num_ranges):
                        new_rows.append({'project_code': group.iloc[i-1]['project_code'],
                                         'low_freq': new_low + j * freq_step,
                                         'high_freq': min(new_low + (j+1) * freq_step, new_high)})
                new_low = group.iloc[i]['low_freq']
                new_high = group.iloc[i]['high_freq']
        
        # Add the last range as a new row
        if new_high - new_low <= 4:
            new_rows.append({'project_code': group.iloc[-1]['project_code'],
                             'low_freq': new_low,
                             'high_freq': new_high})
        else:
            # Split the range into multiple rows
            num_ranges = math.ceil((new_high - new_low) / 4)
            freq_step = (new_high - new_low) / num_ranges
            for j in range(num_ranges):
                new_rows.append({'project_code': group.iloc[-1]['project_code'],
                                 'low_freq': new_low + j * freq_step,
                                 'high_freq': min(new_low + (j+1) * freq_step, new_high)})
    
    # Create a DataFrame from the new rows and concatenate with the original DataFrame
    new_df = pd.DataFrame(new_rows)
    merged_measurements = pd.merge(new_df, df.drop(columns=['low_freq', 'high_freq', 'freq_diff', 'freq_med', 'fs_type', 'target']), on='project_code', how='left')
    
    return merged_measurements.drop_duplicates() # Drop duplicates and return the result

measurements = adjust_frequency_ranges(measurements) #merge overlapping frequencies in a project but make sure that new overlapping frequency does not exceed 4 to stay in line with specifications

In [4]:
measurements = measurements[measurements['band'] != 2] # remove measurements with band 2 because minimal in the whole dataset
measurements = measurements.reset_index()

band_counts = measurements.groupby(['project_code', 'band']).size() # group by project code and band to see how many times a project is in a given band
total_counts_per_project = measurements.groupby('project_code').size() # group by project_code and calculate the total count per project

# Divide each band count by the total count per project to get percentages
band_percentages = (band_counts / total_counts_per_project) * 100
band_percentages = band_percentages.reset_index()
band_percentages.columns = ['project_code','band','percent']
band_percentages['band'] = band_percentages['band'].astype('int64') 
band_percentages # outputs dataframe with project code, band, and the percent of times it falls in that band

Unnamed: 0,project_code,band,percent
0,2011.0.00010.S,3,80.000000
1,2011.0.00010.S,6,20.000000
2,2011.0.00017.S,3,100.000000
3,2011.0.00020.S,6,17.647059
4,2011.0.00020.S,7,82.352941
...,...,...,...
4583,2023.1.01710.S,3,100.000000
4584,2023.1.01720.S,6,100.000000
4585,2023.1.01721.S,7,100.000000
4586,2023.A.00003.S,6,100.000000


In [5]:
# Add in rows for all bands, even those that a given project does not fall in
possible_combinations = []
for project_code in band_percentages['project_code'].unique():
    for band in [1,3,4,5,6,7,8,9,10]:
        possible_combinations.append({'project_code': project_code, 'band': band})
all_combinations_df = pd.DataFrame(possible_combinations)

merged_df = pd.merge(all_combinations_df, band_percentages, on=['project_code', 'band'], how='left') # merge with original DataFrame to fill in percentages
merged_df['percent'].fillna(0, inplace=True) # fill missing percentages with zeros

In [6]:
pivoted_df = merged_df.pivot_table(index='project_code', columns='band', values='percent', fill_value=0) # re-orient merged data frame so each band has a column
pivoted_df.reset_index(inplace=True) # reset index to make project_code a regular column
pivoted_df.index.name = None # rename the index column
print(pivoted_df)

band    project_code  1      3    4    5           6           7    8    9  \
0     2011.0.00010.S  0   80.0  0.0  0.0   20.000000    0.000000  0.0  0.0   
1     2011.0.00017.S  0  100.0  0.0  0.0    0.000000    0.000000  0.0  0.0   
2     2011.0.00020.S  0    0.0  0.0  0.0   17.647059   82.352941  0.0  0.0   
3     2011.0.00028.S  0    0.0  0.0  0.0    0.000000  100.000000  0.0  0.0   
4     2011.0.00039.S  0    0.0  0.0  0.0    0.000000  100.000000  0.0  0.0   
...              ... ..    ...  ...  ...         ...         ...  ...  ...   
3622  2023.1.01710.S  0  100.0  0.0  0.0    0.000000    0.000000  0.0  0.0   
3623  2023.1.01720.S  0    0.0  0.0  0.0  100.000000    0.000000  0.0  0.0   
3624  2023.1.01721.S  0    0.0  0.0  0.0    0.000000  100.000000  0.0  0.0   
3625  2023.A.00003.S  0    0.0  0.0  0.0  100.000000    0.000000  0.0  0.0   
3626  2023.A.00009.S  0    0.0  0.0  0.0    0.000000  100.000000  0.0  0.0   

band   10  
0     0.0  
1     0.0  
2     0.0  
3     0.0  
4  

In [7]:
# Exploratory Data Analysis
unique_bands_per_project = measurements.groupby('project_code')['band'].nunique()
print(unique_bands_per_project.describe()) # most projects in only one band but can go high as 6 -> at least 2 bands should be accounted for

count    3627.000000
mean        1.264957
std         0.582512
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         6.000000
Name: band, dtype: float64


In [8]:
# There cannot be NAs for tfidf so fill with meaningless information
measurements['band'] = measurements['band'].fillna("-")
measurements['project_abstract'] = measurements['project_abstract'].fillna("-")

# Tfidf
tfidf_vectorizer = TfidfVectorizer()
tfidf_projects = tfidf_vectorizer.fit_transform(measurements.project_abstract) # vectorize text
tfidf_vectorizer.get_feature_names_out()

array(['00', '000', '0001', ..., 'µjy', 'µm', 'λcdm'], dtype=object)

In [9]:
# Create training/testing splits based on each project code
unique_project_codes = measurements['project_code'].unique() # get all unique project codes
band_group, freq_group = train_test_split(unique_project_codes, test_size=0.5, random_state=42) # split the unique project codes into two equal groups
train_band, test_band = train_test_split(band_group, test_size=0.3, random_state=42) # train/test split for band_group
freq_band, freq_band = train_test_split(freq_group, test_size=0.3, random_state=42) # train/test split for freq_group

In [10]:
# Create data frames based on training/testing split
train_band_df = measurements[measurements['project_code'].isin(train_band)] # gets only projects for the task of training the band classification with each frequency instance as a row
train_band_pivot = pivoted_df[pivoted_df['project_code'].isin(train_band)] # gets only projects for the task of training the band classification with each project as a row and each band as a column
train_band_index = train_band_df.index.tolist() # gets lists of indeces for project codes for later use
train_nonzero_counts = train_band_pivot.iloc[:, 1:].apply(lambda row: row.astype(bool).sum(), axis=1) # gets number of bands each project is in
train_nonzero_counts = train_nonzero_counts.reset_index(drop=True)

test_band_df = measurements[measurements['project_code'].isin(test_band)] # gets only projects for the task of testing the band classification with each frequency instance as a row
test_band_pivot = pivoted_df[pivoted_df['project_code'].isin(test_band)] # gets only projects for the task of testing the band classification with each project as a row and each band as a column
test_band_index = test_band_df.index.tolist() # gets lists of indeces for project codes for later use
test_nonzero_counts = test_band_pivot.iloc[:, 1:].apply(lambda row: row.astype(bool).sum(), axis=1) # gets number of bands each project is in
test_nonzero_counts = test_nonzero_counts.reset_index(drop=True)

In [11]:
# Create a dictionary to store the non-zero values for each project code
project_data = {}

# Iterate over each row in the DataFrame
for index, row in test_band_pivot.iterrows():
    non_zero_columns = []
    
    # Iterate over columns 2 to 10
    for col in [1,3,4,5,6,7,8,9,10]:
        if row[col] != 0:
            non_zero_columns.append(col)
    
    # If there are non-zero values, store them in the dictionary
    if len(non_zero_columns) > 0:
        project_data[row["project_code"]] = non_zero_columns

# Convert the dictionary to a list of lists
result = [[project_code, columns] for project_code, columns in project_data.items()] # this allows us to assess accuracy later on

In [12]:
# Define weights since this is an unbalanced dataset - the number of projects in each band is not uniformly distributed
weights = [0] * 9
band_count = pd.DataFrame(train_band_df['band'].value_counts().reset_index()).sort_values(by=['band'])
for i in range(len(band_count['count'])):
    weights[i] = 1/band_count['count'].iloc[i] * 100
print(weights)

[2.0408163265306123, 0.0036473720684247002, 0.012677484787018255, 0.021593608291945586, 0.001820598248584485, 0.004752174119659744, 0.02008435428800964, 0.036483035388544326, 0.1004016064257028]


In [13]:
# Call model: Multinomial Naive Bayes since it works well with text, use weights
clf = sklearn.naive_bayes.MultinomialNB(class_prior = weights)
clf.fit(tfidf_projects[train_band_index], train_band_df['band'])

In [14]:
# Obtain probabilities of each class for each project
prob_drop_dup_df = pd.DataFrame(clf.predict_proba(tfidf_projects[test_band_index])).drop_duplicates()
prob_drop_dup_array = np.array(prob_drop_dup_df)

In [15]:
# Sort to find which classes have the highest probalities
top_2_indices = np.argsort(clf.predict_proba(tfidf_projects[test_band_index]))
num_accurate = 0 # keep track of how many true positives
fp_count = 0

# Figuring out if correct bands are in top two probabilities
for i in range(len(test_band_df['band'])):
    if test_band_df['band'].iloc[i] > 2:
        if test_band_df['band'].iloc[i] == top_2_indices[i][8] + 2: # need to add 2 because we have bands 3-10 and the corresponding indeces are 1-8
            num_accurate += 1
        elif test_band_df['band'].iloc[i] == top_2_indices[i][7] + 2: # need to add 2 because we have bands 3-10 and the corresponding indeces are 1-8
            num_accurate += 1
        else: 
            fp_count += 1
    elif test_band_df['band'].iloc[i] == 1:
        if test_band_df['band'].iloc[i] == top_2_indices[i][8] + 1: # need to add 1 because we have band 1 and the corresponding index is 0
            num_accurate += 1
        elif test_band_df['band'].iloc[i] == top_2_indices[i][7] + 1: # need to add 1 because we have band 1 and the corresponding index is 0
            num_accurate += 1
        else: 
            fp_count += 1
        
print("TP - one of top two probabilities is correct")
print(num_accurate) #true positive
print("FP - neither of top two probabilities is correct")
print(fp_count)
print("percent that true band is one of top two probabilities")
print(num_accurate / len(test_band_df['band']))

TP - one of top two probabilities is correct
29680
FP - neither of top two probabilities is correct
32966
percent that true band is one of top two probabilities
0.47377326565143824


In [16]:
# Function to see how 2 lists intersect
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

# Create list of lists of which bands were predicted for each project
# Each inner list represents a project
# Outer list is list of all projects in test split for band classification
outer_list = []
for i in clf.predict_proba(tfidf_projects[test_band_index]):
    inner_list = []
    count = 2
    for j in i:
        if j > .3: # determined threshold based on guess & check -- if predicted probability of being in this class is > .3 -> place in that class
            inner_list.append(count)
        count += 1
    outer_list.append(inner_list)

# Initialize counter variables and empty lists for for loop
extra_count = 0
under_count = 0
count = 0
hooray = 0
intersection = 0
total_length = 0
over_pred = 0
pred_length = 0

for i in prob_drop_dup_df.index:
    # Below print statements allow for easy comparison of what is predicted vs. target
    #print("predicted")
    #print(outer_list[i])
    #print("true")
    #print(result[count][1])
    if len(outer_list[i]) > test_nonzero_counts.iloc[count]: # too many bands were predicted
        extra_count += 1
    elif len(outer_list[i]) < test_nonzero_counts.iloc[count]: # too few bands were predicted
        under_count += 1    
    elif outer_list[i] == result[count][1]: # the correct amount and correct bands were predicted
        hooray += 1
    intersection += len(set(outer_list[i]) & set(result[count][1]))
    total_length += len(result[count][1])
    over_pred += len(outer_list[i]) - len(set(outer_list[i]) & set(result[count][1]))
    pred_length += len(outer_list[i])
    count += 1

print("completely correct percent")
print(hooray/count * 100) #percent of projects that have identical predicted and target
print("percent of target predicted")
print(intersection/total_length * 100)
print("extra bands predicted percent")
print(extra_count/count * 100) #how many projects have at least one band predicted that is not in the target
print("under bands predicted percent")
print(under_count/count * 100) #how many projects have at least one band in the target that is not predicted
print("percent of predicted but not in target")
print(over_pred/pred_length * 100)                                        

completely correct percent
15.992647058823529
percent of target predicted
24.705882352941178
extra bands predicted percent
10.11029411764706
under bands predicted percent
27.941176470588236
percent of predicted but not in target
68.94639556377079
