In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import plotly_express as px
import textparser   # For potential use later
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from textparser import TextParser
from ast import literal_eval
from sklearn.naive_bayes import MultinomialNB
import sklearn

In [2]:
measurements = pd.read_csv('nrao_measurements.csv')
measurements = measurements.drop_duplicates()
measurements['freq_diff'] = measurements['high_freq'] - measurements['low_freq']
measurements = measurements[measurements.freq_diff < 5] #Adele mentioned that frequency range should never be greater than 4
measurements['freq_med'] = (measurements['low_freq']+ (measurements['freq_diff']/2))
measurements = measurements.query('fs_type == "line"')
measurements.head()

Unnamed: 0,project_code,project_title,project_abstract,fs_type,low_freq,high_freq,target,freq_diff,freq_med
0,2011.0.00010.S,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,line,90.38,90.62,1,0.24,90.5
1,2011.0.00010.S,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,line,90.7,90.93,1,0.23,90.815
2,2011.0.00010.S,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,line,91.69,91.92,1,0.23,91.805
3,2011.0.00010.S,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,line,92.89,93.12,1,0.23,93.005
4,2011.0.00010.S,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,line,217.59,218.53,1,0.94,218.06


In [3]:
import pandas as pd
import math

def adjust_frequency_ranges(df):
    new_rows = []
    
    # Group by project_code
    grouped = df.groupby('project_code')
    
    # Iterate through each group
    for _, group in grouped:
        # Sort by low_freq
        group = group.sort_values(by='low_freq')
        
        # Initialize variables for the new rows
        new_low = group.iloc[0]['low_freq']
        new_high = group.iloc[0]['high_freq']
        
        # Iterate through rows to find overlapping ranges
        for i in range(1, len(group)):
            if group.iloc[i]['low_freq'] <= new_high:
                # There's an overlap, adjust new_high
                new_high = max(new_high, group.iloc[i]['high_freq'])
            else:
                # No overlap, add the previous range as a new row
                if new_high - new_low <= 4:
                    new_rows.append({'project_code': group.iloc[i-1]['project_code'],
                                     'low_freq': new_low,
                                     'high_freq': new_high})
                else:
                    # Split the range into multiple rows
                    num_ranges = math.ceil((new_high - new_low) / 4)
                    freq_step = (new_high - new_low) / num_ranges
                    for j in range(num_ranges):
                        new_rows.append({'project_code': group.iloc[i-1]['project_code'],
                                         'low_freq': new_low + j * freq_step,
                                         'high_freq': min(new_low + (j+1) * freq_step, new_high)})
                new_low = group.iloc[i]['low_freq']
                new_high = group.iloc[i]['high_freq']
        
        # Add the last range as a new row
        if new_high - new_low <= 4:
            new_rows.append({'project_code': group.iloc[-1]['project_code'],
                             'low_freq': new_low,
                             'high_freq': new_high})
        else:
            # Split the range into multiple rows
            num_ranges = math.ceil((new_high - new_low) / 4)
            freq_step = (new_high - new_low) / num_ranges
            for j in range(num_ranges):
                new_rows.append({'project_code': group.iloc[-1]['project_code'],
                                 'low_freq': new_low + j * freq_step,
                                 'high_freq': min(new_low + (j+1) * freq_step, new_high)})
    
    # Create a DataFrame from the new rows and concatenate with the original DataFrame
    new_df = pd.DataFrame(new_rows)
    #df = pd.concat([df, new_df])
    merged_measurements = pd.merge(new_df, df.drop(columns=['low_freq', 'high_freq', 'freq_diff', 'freq_med', 'fs_type', 'target']), on='project_code', how='left')
    
    # Drop duplicates and return the result
    return merged_measurements.drop_duplicates()

# Example usage:
# measurements = pd.read_csv('nrao_measurements.csv')
# measurements = adjust_frequency_ranges(measurements)


In [4]:
measurements = adjust_frequency_ranges(measurements)

In [5]:
measurements['Band'] = np.where(measurements['high_freq'] < 43.5, 1, 
                                np.where(measurements['high_freq'] < 97.5, 2,
                                np.where(measurements['high_freq'] < 101.4, 3,
                                np.where(measurements['high_freq'] < 145.7, 4,
                                np.where(measurements['high_freq'] < 170.9, 5,
                                np.where(measurements['high_freq'] < 216.51, 6,
                                np.where(measurements['high_freq'] < 356.231, 7,
                                np.where(measurements['high_freq'] < 479.51, 8,
                                np.where(measurements['high_freq'] < 674.1, 9,
                                10)))))))))

In [6]:
measurements

Unnamed: 0,project_code,low_freq,high_freq,project_title,project_abstract,Band
0,2011.0.00010.S,85.960000,86.190000,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,2
20,2011.0.00010.S,87.050000,87.290000,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,2
40,2011.0.00010.S,87.470000,87.880000,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,2
60,2011.0.00010.S,88.360000,88.590000,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,2
80,2011.0.00010.S,88.910000,89.150000,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,2
...,...,...,...,...,...,...
396663,2023.A.00003.S,260.134444,263.823333,[OIII] Confirmation for Intrinsically Luminous...,"Spectroscopic confirmation of the brightest, h...",7
396683,2023.A.00003.S,263.823333,267.512222,[OIII] Confirmation for Intrinsically Luminous...,"Spectroscopic confirmation of the brightest, h...",7
396703,2023.A.00003.S,267.512222,271.201111,[OIII] Confirmation for Intrinsically Luminous...,"Spectroscopic confirmation of the brightest, h...",7
396723,2023.A.00003.S,271.201111,274.890000,[OIII] Confirmation for Intrinsically Luminous...,"Spectroscopic confirmation of the brightest, h...",7


In [7]:
#convert to lowercase, strip and remove punctuations and remove ALMA, case insensitive
def preprocess(text):
    text = text.lower()
    text = text.strip()
    text = re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub(r'(?i)alma', '', text)
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text
 
# STOPWORD REMOVAL
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()
 
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

In [8]:
measurements['raw_text'] = str(measurements.project_title.str.strip() + '. ' + measurements.project_abstract.str.strip())
#print(type(measurements['raw_text']))
measurements['standardized_text'] = measurements.raw_text.apply(lambda x: preprocess(x))
measurements['no_sw_text'] = measurements.standardized_text.apply(lambda x: stopword(x))
measurements['lemmatized_sw_text'] = measurements.standardized_text.apply(lambda x: lemmatizer(x))
measurements['lemmatized_no_sw_text'] = measurements.no_sw_text.apply(lambda x: lemmatizer(x))
measurements

Unnamed: 0,project_code,low_freq,high_freq,project_title,project_abstract,Band,raw_text,standardized_text,no_sw_text,lemmatized_sw_text,lemmatized_no_sw_text
0,2011.0.00010.S,85.960000,86.190000,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,2,0 The Physics and Chemisty of Gas in C...,the physics and chemisty of gas in centaurus ...,physics chemisty gas centaurus physics chemist...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus physic chemisty ...
20,2011.0.00010.S,87.050000,87.290000,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,2,0 The Physics and Chemisty of Gas in C...,the physics and chemisty of gas in centaurus ...,physics chemisty gas centaurus physics chemist...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus physic chemisty ...
40,2011.0.00010.S,87.470000,87.880000,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,2,0 The Physics and Chemisty of Gas in C...,the physics and chemisty of gas in centaurus ...,physics chemisty gas centaurus physics chemist...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus physic chemisty ...
60,2011.0.00010.S,88.360000,88.590000,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,2,0 The Physics and Chemisty of Gas in C...,the physics and chemisty of gas in centaurus ...,physics chemisty gas centaurus physics chemist...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus physic chemisty ...
80,2011.0.00010.S,88.910000,89.150000,The Physics and Chemisty of Gas in Centaurus A...,Centaurus A with its host NGC5128 is the most ...,2,0 The Physics and Chemisty of Gas in C...,the physics and chemisty of gas in centaurus ...,physics chemisty gas centaurus physics chemist...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus physic chemisty ...
...,...,...,...,...,...,...,...,...,...,...,...
396663,2023.A.00003.S,260.134444,263.823333,[OIII] Confirmation for Intrinsically Luminous...,"Spectroscopic confirmation of the brightest, h...",7,0 The Physics and Chemisty of Gas in C...,the physics and chemisty of gas in centaurus ...,physics chemisty gas centaurus physics chemist...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus physic chemisty ...
396683,2023.A.00003.S,263.823333,267.512222,[OIII] Confirmation for Intrinsically Luminous...,"Spectroscopic confirmation of the brightest, h...",7,0 The Physics and Chemisty of Gas in C...,the physics and chemisty of gas in centaurus ...,physics chemisty gas centaurus physics chemist...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus physic chemisty ...
396703,2023.A.00003.S,267.512222,271.201111,[OIII] Confirmation for Intrinsically Luminous...,"Spectroscopic confirmation of the brightest, h...",7,0 The Physics and Chemisty of Gas in C...,the physics and chemisty of gas in centaurus ...,physics chemisty gas centaurus physics chemist...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus physic chemisty ...
396723,2023.A.00003.S,271.201111,274.890000,[OIII] Confirmation for Intrinsically Luminous...,"Spectroscopic confirmation of the brightest, h...",7,0 The Physics and Chemisty of Gas in C...,the physics and chemisty of gas in centaurus ...,physics chemisty gas centaurus physics chemist...,the physic and chemisty of gas in centaurus a ...,physic chemisty gas centaurus physic chemisty ...


In [9]:
final_measurements = measurements[['lemmatized_no_sw_text']]

In [10]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_projects = tfidf_vectorizer.fit_transform(final_measurements.lemmatized_no_sw_text)
tfidf_vectorizer.get_feature_names_out()

array(['centaurus', 'chemisty', 'confirmation', 'dtype', 'find', 'gas',
       'intrinsically', 'length', 'luminous', 'object', 'oiii', 'origin',
       'physic', 'plan', 'rocky', 'trace'], dtype=object)

In [11]:
import random

def split_list_into_random_groups(data, num_groups):
    # Shuffle the data randomly
    random.shuffle(data)
    
    # Calculate the size of each group
    group_size = len(data) // num_groups
    remainder = len(data) % num_groups

    groups = []
    start = 0
    for i in range(num_groups):
        if i < remainder:
            end = start + group_size + 1
        else:
            end = start + group_size
        groups.append(data[start:end])
        start = end

    return groups

# Example usage:
data = np.arange(18717)
num_groups = 5
groups = split_list_into_random_groups(data, num_groups)
for i, group in enumerate(groups):
    print(f"Group {i+1}: {group}")

Group 1: [ 8266 14159  7148 ...  2190 16481   360]
Group 2: [18022  4098 12538 ... 13222  3383  1810]
Group 3: [ 1256 15754 12933 ... 12811    13  6859]
Group 4: [15759  1985 11562 ...  9218  7166  8251]
Group 5: [16323  6664 10920 ... 14045 11170 10665]


In [12]:
train_band = np.concatenate((groups[0], groups[1]))
train_med = groups[2]
train_width = groups[3]
test = groups[4]

In [13]:
count_vectorizer = CountVectorizer()
cv_projects = count_vectorizer.fit_transform(final_measurements.lemmatized_no_sw_text)
#count_vectorizer.get_feature_names_out()
text_band = cv_projects[train_band]
text_test = cv_projects[test]

In [14]:
band_labels = measurements['Band'].reset_index(drop=True)
labels_train_band = band_labels.filter(items=train_band, axis=0)
labels_test_band = band_labels.filter(items=test, axis=0)
dict_train_band = labels_train_band.value_counts().to_dict()

In [15]:
dict_train_band

{7: 4239,
 4: 1010,
 2: 709,
 8: 339,
 6: 328,
 5: 277,
 3: 275,
 9: 170,
 10: 138,
 1: 3}

In [16]:
n_samples = len(measurements)
n_classes = 10
n_ind = [1, dict_train_band[2], dict_train_band[3], 
         dict_train_band[4], dict_train_band[5], dict_train_band[6], 
         dict_train_band[7], dict_train_band[8], dict_train_band[9], 
         dict_train_band[10]]

In [17]:
weights = [1/(n_classes * x) for x in n_ind]
weights = [x/sum(weights) for x in weights]
weights = [0, .1, .2, .1, .1, .1, 0, .1, .2, .1]

In [18]:
class_weight.compute_sample_weight('balanced', labels_train_band)

NameError: name 'class_weight' is not defined

In [20]:
text_band

<1x16 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [None]:
clf = sklearn.naive_bayes.MultinomialNB(class_prior = weights)
clf.fit(text_band, labels_train_band)

In [None]:
#always predicting the same class for all?
clf.predict(text_test)

## 