In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import plotly_express as px
import textparser   # For potential use later
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from textparser import TextParser
from ast import literal_eval
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import sklearn
from sklearn.model_selection import train_test_split
import statistics
import math
import random
import plotly.express as px

### Clean Data

In [2]:
# Data cleaning
# Every step below builds a new frame (via .assign or a filter) instead of writing a
# column onto a filtered slice, which is what triggers pandas' SettingWithCopyWarning.
measurements = pd.read_csv('nrao_measurements.csv').drop_duplicates()

# Width of each observed frequency range.
measurements = measurements.assign(freq_diff=measurements['high_freq'] - measurements['low_freq'])

# Adele mentioned that the frequency range should never be greater than 4, so rows that
# do not meet the specification are removed.
# NOTE(review): the filter keeps every width below 5, i.e. widths in (4, 5) survive - confirm
# whether `<= 4` was intended.
measurements = measurements[measurements['freq_diff'] < 5]

# Midpoint of each range.
measurements = measurements.assign(freq_med=measurements['low_freq'] + (measurements['freq_diff'] / 2))

# Only spectral-line setups are kept.
measurements = measurements.query('fs_type == "line"')

# Coerce band to a number (unparseable values become NaN) and drop rows with incorrect band formatting.
measurements = measurements.assign(
    band=pd.to_numeric(measurements['band'], errors='coerce', downcast='integer')
)
valid_band_values = set(range(1, 11))
measurements = measurements[measurements['band'].isin(valid_band_values)]

# Keep only the projects that were observed in at most 2 distinct bands.
band_counts = measurements.groupby('project_code')['band'].nunique()
valid_project_codes = band_counts[band_counts <= 2].index
measurements = measurements[measurements['project_code'].isin(valid_project_codes)]

  measurements = pd.read_csv('nrao_measurements.csv')


In [3]:
def adjust_frequency_ranges(df, max_width=4):
    """Merge overlapping frequency ranges per project and cap the width of each range.

    Within every ``project_code`` the rows are sorted by ``low_freq`` and ranges whose
    ``low_freq`` does not exceed the running ``high_freq`` are merged. A merged range
    wider than ``max_width`` is cut into ``ceil(width / max_width)`` equally sized
    consecutive pieces.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``project_code``, ``low_freq``, ``high_freq``, ``freq_diff``,
        ``freq_med``, ``fs_type`` and ``target``; any further columns are carried over.
    max_width : float, default 4
        Largest allowed ``high_freq - low_freq`` of a returned range.

    Returns
    -------
    pandas.DataFrame
        One row per (adjusted range, distinct combination of the carried-over columns
        of that project), with exact duplicates removed. The columns ``freq_diff``,
        ``freq_med``, ``fs_type`` and ``target`` are not part of the result. An empty
        input yields an empty frame rather than an error.
    """
    def _split_range(project_code, low, high):
        """Return the row dicts covering [low, high] in pieces no wider than max_width."""
        width = high - low
        if width <= max_width:
            return [{'project_code': project_code, 'low_freq': low, 'high_freq': high}]
        num_ranges = math.ceil(width / max_width)
        freq_step = width / num_ranges
        return [{'project_code': project_code,
                 'low_freq': low + j * freq_step,
                 # min() guards against float round-off pushing the last edge past `high`
                 'high_freq': min(low + (j + 1) * freq_step, high)}
                for j in range(num_ranges)]

    new_rows = []
    for project_code, group in df.groupby('project_code'):
        group = group.sort_values(by='low_freq')
        # Pull the two columns out once instead of building a row Series per access.
        lows = group['low_freq'].to_numpy()
        highs = group['high_freq'].to_numpy()

        new_low, new_high = lows[0], highs[0]
        for low, high in zip(lows[1:], highs[1:]):
            if low <= new_high:
                # Overlap: extend the running range.
                new_high = max(new_high, high)
            else:
                # Gap: emit the finished range and start a new one.
                new_rows.extend(_split_range(project_code, new_low, new_high))
                new_low, new_high = low, high
        # Emit the last running range of the project.
        new_rows.extend(_split_range(project_code, new_low, new_high))

    # Naming the columns explicitly keeps the merge key present even when there are no rows.
    new_df = pd.DataFrame(new_rows, columns=['project_code', 'low_freq', 'high_freq'])
    carried_over = df.drop(columns=['low_freq', 'high_freq', 'freq_diff', 'freq_med', 'fs_type', 'target'])
    # Every adjusted range is paired with every original row of its project; the
    # drop_duplicates below collapses the pairs that end up identical.
    merged_measurements = pd.merge(new_df, carried_over, on='project_code', how='left')

    return merged_measurements.drop_duplicates()

# Merge overlapping frequencies within a project, making sure that no merged range
# exceeds a width of 4, to stay in line with specifications.
# NOTE: this rebinds `measurements` to the adjusted frame, which no longer has the
# freq_diff / freq_med / fs_type / target columns, so the cell cannot simply be re-run
# on its own output.
measurements = adjust_frequency_ranges(measurements)

In [4]:
# Band 2 is minimal in the whole dataset, so its measurements are left out.
# reset_index() without drop keeps the previous index as an extra 'index' column.
measurements = measurements.loc[measurements['band'] != 2].reset_index()

# Rows per (project, band) and rows per project overall.
band_counts = measurements.groupby(['project_code', 'band']).size()
total_counts_per_project = measurements.groupby('project_code').size()

# Share, in percent, of each project's rows that fall in each band.
band_percentages = ((band_counts / total_counts_per_project) * 100).reset_index()
band_percentages.columns = ['project_code', 'band', 'percent']
band_percentages = band_percentages.astype({'band': 'int64'})
band_percentages  # one row per project code and band, with the percent of times it falls in that band

Unnamed: 0,project_code,band,percent
0,2011.0.00010.S,3,80.000000
1,2011.0.00010.S,6,20.000000
2,2011.0.00017.S,3,100.000000
3,2011.0.00020.S,6,17.647059
4,2011.0.00020.S,7,82.352941
...,...,...,...
4060,2023.1.01710.S,3,100.000000
4061,2023.1.01720.S,6,100.000000
4062,2023.1.01721.S,7,100.000000
4063,2023.A.00003.S,6,100.000000


### Create Model

In [5]:
# TF-IDF cannot handle missing values, so fill them with a meaningless placeholder.
for column_name in ('band', 'project_abstract'):
    measurements[column_name] = measurements[column_name].fillna("-")

# Vectorize the abstracts; one TF-IDF row per row of `measurements`.
# NOTE(review): the vectorizer is fitted on every row, before any train/test split is made.
tfidf_vectorizer = TfidfVectorizer()
tfidf_projects = tfidf_vectorizer.fit_transform(measurements['project_abstract'])
tfidf_vectorizer.get_feature_names_out()

array(['00', '000', '0001', ..., 'µjy', 'µm', 'λcdm'], dtype=object)

In [6]:
# Split on project codes rather than on rows, so a project never straddles two sets.
unique_project_codes = measurements['project_code'].unique()

# First halve the projects into a band group and a frequency group ...
band_group, freq_group = train_test_split(unique_project_codes, test_size=0.5, random_state=42)

# ... then make a 70/30 train/test split inside each half.
(train_band, test_band), (train_freq, test_freq) = (
    train_test_split(project_group, test_size=0.3, random_state=42)
    for project_group in (band_group, freq_group)
)

In [7]:
# Build the band-classification train/test frames; every frequency instance is one row.
project_codes = measurements['project_code']

train_band_df = measurements.loc[project_codes.isin(train_band)]
test_band_df = measurements.loc[project_codes.isin(test_band)]

# Index labels of the selected rows. They are used later to pick rows out of
# tfidf_projects, which relies on `measurements` carrying a default 0..n-1 index.
train_band_index = train_band_df.index.tolist()
test_band_index = test_band_df.index.tolist()

In [8]:
# Define weights since this is an unbalanced dataset - the number of projects in each
# band is not uniformly distributed. Each weight is the fraction of training rows that
# belong to a band, listed in ascending band order.
# The list length follows the bands actually present in the training rows instead of a
# fixed 9, and the frame is built with explicit column names so it does not depend on
# how the installed pandas names the value_counts() output.
band_count = (
    train_band_df['band']
    .value_counts()
    .sort_index()
    .rename_axis('band')
    .reset_index(name='count')
)
weights = (band_count['count'] / len(train_band_df)).tolist()

In [9]:
# Multinomial Naive Bayes works well with text; the band weights are passed as class priors.
train_features = tfidf_projects[train_band_index]
train_labels = train_band_df['band']

clf = MultinomialNB(class_prior=weights)
clf.fit(train_features, train_labels)

In [10]:
# Class probabilities for every test row; identical probability rows are collapsed.
test_probabilities = clf.predict_proba(tfidf_projects[test_band_index])
prob_drop_dup_df = pd.DataFrame(test_probabilities).drop_duplicates()
prob_drop_dup_array = prob_drop_dup_df.to_numpy()

### Assess Model

In [11]:
# Rank the classes of every test row by predicted probability. Both arrays are sorted
# ascending along the class axis, so the last column is the most likely class and the
# one before it the runner-up. The probabilities are computed once and reused.
test_proba = clf.predict_proba(tfidf_projects[test_band_index])
top_2_indices = np.argsort(test_proba)
prob_sorted = np.sort(test_proba)

# At or above this top probability only the single best band is offered; below it the
# two best bands are offered.
CONFIDENCE_THRESHOLD = .83

# clf.classes_ gives the band that each probability column stands for, which replaces
# the hand-written "+1 / +2" index arithmetic and the assumption of exactly 9 classes.
band_labels = clf.classes_

num_accurate = 0  # rows whose true band is among the offered bands
fp_count = 0      # rows whose true band is not among the offered bands
count_save = 0    # rows where a single band was offered and it was correct

true_bands = test_band_df['band'].to_numpy()
for true_band, ranked, probs in zip(true_bands, top_2_indices, prob_sorted):
    best_band = band_labels[ranked[-1]]
    if probs[-1] >= CONFIDENCE_THRESHOLD:
        if true_band == best_band:
            num_accurate += 1
            count_save += 1
        else:
            fp_count += 1
    elif true_band in (best_band, band_labels[ranked[-2]]):
        num_accurate += 1
    else:
        fp_count += 1

print("TP - one of top two probabilities is correct")
print(num_accurate) #true positive
print("FP - neither of top two probabilities is correct")
print(fp_count)
print("percent that true band is one of top two probabilities")
print(num_accurate / len(test_band_df['band']) * 100)
print("count only 1 provided and correct")
print(count_save)

TP - one of top two probabilities is correct
28978
FP - neither of top two probabilities is correct
10471
percent that true band is one of top two probabilities
73.45686836168217
count only 1 provided and correct
4256


### Save Results

In [12]:
# Find bands for frequency fit assessment.
# Each entry is [project_code, best band] when the top probability reaches the
# confidence threshold, and [project_code, best band, second-best band] otherwise.
CONFIDENCE_THRESHOLD = .83

# clf.classes_ maps a probability column to the band it stands for, replacing the
# repeated "+1 / +2" index arithmetic. int() keeps the stored bands plain integers.
band_labels = clf.classes_

list_pred = []
for project_code, ranked, probs in zip(test_band_df['project_code'].to_numpy(), top_2_indices, prob_sorted):
    inner_pred = [project_code, int(band_labels[ranked[-1]])]
    is_confident = probs[-1] >= CONFIDENCE_THRESHOLD
    if not is_confident:
        inner_pred.append(int(band_labels[ranked[-2]]))
    list_pred.append(inner_pred)

In [18]:
# One row per distinct (project, first prediction, second prediction) combination.
# reindex guarantees three columns even when no row carries a second prediction.
predictions = pd.DataFrame(list_pred).reindex(columns=range(3)).drop_duplicates()
predictions.columns = ["project", "first_prediction", "second_prediction"]

# Rows with a single prediction have no second band; store 0 for them.
# Assigning the result back avoids an in-place fillna on a column selection, which
# pandas treats as chained assignment.
predictions["second_prediction"] = predictions["second_prediction"].fillna(0).astype(int)
predictions.to_csv('predictions.csv')

### Plots and Figures

In [13]:
# Organize predictions for precision calculation and confusion matrix.
# Every offered band becomes its own [project, band] row: one row when the top
# probability reaches the confidence threshold, otherwise one row for the best band
# followed by one for the runner-up.
CONFIDENCE_THRESHOLD = .83

# clf.classes_ maps a probability column to the band it stands for. Building each row
# in one place also removes a stray extra append that produced a three-element row in
# one branch, which does not fit the two column names assigned below.
band_labels = clf.classes_

final_pred = []
for project_code, ranked, probs in zip(test_band_df['project_code'].to_numpy(), top_2_indices, prob_sorted):
    final_pred.append([project_code, int(band_labels[ranked[-1]])])
    is_confident = probs[-1] >= CONFIDENCE_THRESHOLD
    if not is_confident:
        final_pred.append([project_code, int(band_labels[ranked[-2]])])

final_pred_df = pd.DataFrame(final_pred, columns=["project", "prediction"]).drop_duplicates()

# True (project, band) pairs of the test projects.
true_df = band_percentages[band_percentages['project_code'].isin(test_band)]
true_df = true_df.drop("percent", axis=1)

In [15]:
# Projects that appear in fewer than 2 rows get an extra placeholder row with band 0.
project_counts = true_df['project_code'].value_counts()
missing_projects = project_counts.loc[project_counts.lt(2)].index

new_rows = pd.DataFrame(
    [{'project_code': project_code, 'band': 0} for project_code in missing_projects]
)

# Add the placeholder rows and order everything by project.
true_df = pd.concat([true_df, new_rows]).sort_values(by='project_code')

Unnamed: 0,project_code,band
2,2011.0.00017.S,3
88,2011.0.00017.S,0
6,2011.0.00039.S,7
368,2011.0.00039.S,0
15,2011.0.00087.S,7
...,...,...
120,2023.1.01591.S,0
119,2023.1.01660.S,0
4053,2023.1.01660.S,3
4059,2023.1.01707.S,6


In [27]:
# Projects that appear in fewer than 2 prediction rows get an extra placeholder row
# with prediction 0.
project_counts = final_pred_df['project'].value_counts()
missing_projects = project_counts.loc[project_counts.lt(2)].index

new_rows = pd.DataFrame(
    [{'project': project_code, 'prediction': 0} for project_code in missing_projects]
)

# Add the placeholder rows and order everything by project.
final_pred_df = pd.concat([final_pred_df, new_rows]).sort_values(by='project')

<bound method Series.unique of 0        3
6        0
113      7
112      4
129      7
        ..
73187    6
73207    6
73208    3
73223    6
63       0
Name: prediction, Length: 1040, dtype: int64>

In [28]:
from sklearn.metrics import confusion_matrix, classification_report

# True and predicted bands as flat arrays; the metrics below pair them by position.
# NOTE(review): both frames were only sorted by project, so which true band is paired
# with which prediction inside a project is not pinned down - confirm this is intended.
test_sentiments = np.array(true_df['band'])
final_pred = np.array(final_pred_df['prediction'])

# Take the axis labels from the values that actually occur, so the table's index and
# columns always match the matrix shape (a fixed label list fails when a band is
# missing or an unexpected one shows up).
label_values = np.unique(np.concatenate([test_sentiments, final_pred]))
labels = [str(value) for value in label_values]

print(classification_report(test_sentiments, final_pred))
pd.DataFrame(
    confusion_matrix(test_sentiments, final_pred, labels=label_values),
    index=labels,
    columns=labels,
)

              precision    recall  f1-score   support

           0       0.48      0.07      0.13       431
           3       0.20      0.39      0.27       142
           4       0.07      0.07      0.07        41
           5       0.00      0.00      0.00        14
           6       0.25      0.47      0.33       236
           7       0.22      0.31      0.26       140
           8       0.00      0.00      0.00        21
           9       0.50      0.14      0.22        14
          10       0.00      0.00      0.00         1

    accuracy                           0.24      1040
   macro avg       0.19      0.16      0.14      1040
weighted avg       0.33      0.24      0.20      1040



Unnamed: 0,0,3,4,5,6,7,8,9,10
0,31,113,15,1,186,74,8,2,1
3,5,56,9,0,53,19,0,0,0
4,2,15,3,0,15,6,0,0,0
5,2,2,1,0,6,3,0,0,0
6,15,56,7,1,112,45,0,0,0
7,9,26,8,1,50,44,2,0,0
8,0,4,0,0,11,6,0,0,0
9,0,3,0,0,7,2,0,2,0
10,0,0,0,0,0,1,0,0,0


In [19]:
# Flatten the predictions into one list of bands: the first prediction always counts,
# the second only when it is a real band (0 marks "no second prediction").
predicted_bands = []
first_choices = predictions['first_prediction'].to_numpy()
second_choices = predictions['second_prediction'].to_numpy()
for first_band, second_band in zip(first_choices, second_choices):
    predicted_bands.append(first_band)
    if second_band != 0:
        predicted_bands.append(second_band)

In [20]:
# Single-column frame holding every predicted band.
predictbands_df = pd.DataFrame(predicted_bands, columns=["bands"])

In [None]:
# True (project, band) pairs of the test projects.
truebands_df = band_percentages[band_percentages['project_code'].isin(test_band)]
truebands_df = truebands_df.drop("percent", axis=1)

# How often each band occurs among the predictions and among the true pairs.
# The counts get their own names so `predictbands_df` stays a DataFrame and this cell
# can be re-run.
predicted_band_counts = predictbands_df['bands'].value_counts()
true_band_counts = truebands_df['band'].value_counts()

# Use the bands that actually occur on either side, in ascending order, so the label
# list and both count lists always have the same length. A band missing on one side
# gets a count of 0.
band_values = sorted(set(predicted_band_counts.index) | set(true_band_counts.index))
bands = [str(band_value) for band_value in band_values]

predictbands_count = predicted_band_counts.reindex(band_values, fill_value=0).tolist()
truebands_count = true_band_counts.reindex(band_values, fill_value=0).tolist()

In [None]:
# Long-format table for the grouped bar chart: one row per (band, type) with its count.
barplot_true, barplot_pred = (
    pd.DataFrame({'bands': bands, 'count': band_totals}).assign(type=series_label)
    for series_label, band_totals in (("true", truebands_count), ("predicted", predictbands_count))
)
barplot = pd.concat([barplot_true, barplot_pred])
barplot

In [None]:
# Grouped bars: true vs. predicted count per band.
layout_options = {
    "barmode": 'group',
    "title": "Total True vs. Predicted Band Count",
}
fig = px.bar(barplot, x='bands', y='count', color='type')
fig.update_layout(**layout_options)
fig.show()

In [None]:
# Total number of predicted bands, then total number of true bands.
for band_totals in (predictbands_count, truebands_count):
    print(sum(band_totals))