In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly_express as px
import plotly.figure_factory as ff
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import DBSCAN, HDBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

SEED = 42

In [2]:
projects = pd.read_csv("../data/nrao_projects.csv")
projects = projects.set_index('project_code')

In [3]:
projects = projects.query('fs_type == "line"')
projects.shape

(3628, 12)

In [4]:
measurements = pd.read_csv('../../nrao_measurements.csv')
measurements = measurements.set_index('project_code')
measurements = measurements[measurements.fs_type == 'line']

  measurements = pd.read_csv('../../nrao_measurements.csv')


## By hand band lower-bound cutoffs
To avoid possible conflicts, we simply call the cutoffs for band 1 and 2 to be 0 and 1GHz, respectively.

In [5]:
band_cutoffs = [0, 1, 84, 120, 163, 211, 275, 385, 602, 787]

## Remove outliers from projects and measurements

See 'Identifying_High_Measurement_Projects.ipynb' in 'data' folder

From this notebook, any project with > 26.5 measurements is an outlier

In [6]:
project_measurements = measurements.groupby(measurements.index)\
    .project_title.count()\
        .sort_values(ascending=False)\
        .to_frame()
project_measurements.columns = ['measurement_count']
project_measurements.head()

Unnamed: 0_level_0,measurement_count
project_code,Unnamed: 1_level_1
2017.1.00161.L,289
2017.1.00886.L,283
2021.2.00052.S,265
2023.1.00963.S,253
2022.1.00224.S,188


In [7]:
outliers = project_measurements[project_measurements.measurement_count > 26.5]

In [8]:
measurements = measurements.loc[~measurements.index.isin(outliers.index)]
projects = projects.loc[~projects.index.isin(outliers.index)]

### Remove measurements that have incorrectly formatted `band`

In [9]:
measurements['band'] = pd.to_numeric(measurements['band'], errors='coerce', downcast='integer')
valid_band_values = set(range(1, 11))
measurements = measurements[measurements['band'].isin(valid_band_values)] # Removing any rows with incorrect band formatting

### Make sure projects dataframe matches projects in measurement dataframes after drops

In [10]:
projects = projects.loc[measurements.index.unique()]

## Train-test split

In [11]:
train_texts, test_texts = train_test_split(projects.lemmatized_no_sw_text, random_state=SEED)

In [12]:
print(f'Number of train texts:{len(list(train_texts))}')
print(f'Number of test texts:{len(list(test_texts))}')

Number of train texts:2462
Number of test texts:821


In [13]:
train_texts

project_code
2022.1.01108.S    probe kinematics streamer understand star form...
2021.1.01495.S    revolutionary insight z gas dust physic althou...
2021.1.00055.S    comprehensive ism view pc scale sub l galaxy z...
2021.2.00056.S    panta rei mass energy flow parsec sub parsec s...
2018.A.00068.T    accretion burst event high mass yso g episodic...
                                        ...                        
2016.1.00615.S    probe dense gas physic extreme southern molecu...
2016.1.00744.S    investigate water deuteration young protostell...
2016.1.01344.S    multi wavelength image possibly planet induced...
2015.1.01235.S    core mass function far outer galaxy cloud dist...
2023.1.00367.S    conic cosmic noon ism condition survey cosmic ...
Name: lemmatized_no_sw_text, Length: 2462, dtype: object

### LDA class

In [14]:
class LDA_Model:
    def __init__(self, N_topics=3):
        self.N_topics = N_topics
        self.countVectorizer = CountVectorizer(stop_words='english')
        self.lda = LatentDirichletAllocation(n_components=self.N_topics, random_state=SEED)
    
    def fit(self, corpus):
        termFrequency = self.countVectorizer.fit_transform(corpus)
        self.lda.fit(termFrequency)
        return self.lda.transform(termFrequency)

    # Additional method to transform new data
    def transform(self, corpus):
        termFrequency = self.countVectorizer.transform(corpus)
        return self.lda.transform(termFrequency)

#### Initialize Model

In [15]:
lda_model = LDA_Model(N_topics=50)

#### Fit model on training set

In [16]:
train_topics = lda_model.fit(train_texts)

In [17]:
words = lda_model.countVectorizer.get_feature_names_out()

In [18]:
N = 10 #number of top words to show
topic_components = lda_model.lda.components_

for topic_idx, topic in enumerate(topic_components):
    print(f"Topic {topic_idx}:")
    # Get the indices of the top N words for this topic
    top_word_indices = topic.argsort()[-N:][::-1]
    # Print these words with their weights
    for word_idx in top_word_indices:
        print(f"{words[word_idx]} (weight: {topic[word_idx]:.2f})")
    print("\n")

Topic 0:
strip (weight: 50.65)
tail (weight: 43.06)
pressure (weight: 41.53)
gas (weight: 31.74)
ram (weight: 30.99)
molecular (weight: 23.63)
dense (weight: 23.21)
physical (weight: 14.94)
region (weight: 14.66)
nucleus (weight: 14.15)


Topic 1:
outflow (weight: 882.61)
jet (weight: 378.11)
wind (weight: 211.61)
disk (weight: 206.87)
high (weight: 196.17)
star (weight: 163.81)
molecular (weight: 158.93)
observation (weight: 152.68)
velocity (weight: 140.84)
scale (weight: 130.84)


Topic 2:
gas (weight: 351.46)
molecular (weight: 279.12)
galaxy (weight: 249.32)
cloud (weight: 147.67)
tracer (weight: 136.49)
dense (weight: 129.51)
study (weight: 114.30)
observation (weight: 107.05)
spiral (weight: 105.43)
star (weight: 101.85)


Topic 3:
molecular (weight: 91.55)
gas (weight: 61.83)
diffuse (weight: 49.20)
observation (weight: 40.88)
line (weight: 40.60)
ice (weight: 34.51)
propose (weight: 34.48)
absorption (weight: 31.04)
allow (weight: 30.79)
chemistry (weight: 28.31)


Topic 4:
di

In [19]:
train_doc_topic = pd.DataFrame(train_topics)
train_doc_topic = train_doc_topic.set_index(train_texts.index.values)
train_doc_topic.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
2022.1.01108.S,0.000235,0.000235,0.000235,0.000235,0.000235,0.000235,0.151758,0.000235,0.346698,0.000235,...,0.000235,0.000235,0.000235,0.000235,0.000235,0.000235,0.000235,0.000235,0.000235,0.000235
2021.1.01495.S,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,...,0.000171,0.000171,0.150745,0.000171,0.000171,0.219299,0.000171,0.000171,0.000171,0.000171
2021.1.00055.S,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.204172,...,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161
2021.2.00056.S,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,...,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194
2018.A.00068.T,0.000175,0.059577,0.000175,0.000175,0.000175,0.128074,0.200258,0.000175,0.000175,0.000175,...,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.145325


In [20]:
train_texts = pd.DataFrame(train_texts)

### Match test data into topics

In [21]:
test_topics = lda_model.transform(test_texts)

In [22]:
test_doc_topic= pd.DataFrame(test_topics.tolist())
test_doc_topic= test_doc_topic.set_index(test_texts.index.values)
test_doc_topic.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
2017.1.01260.S,0.000217,0.000217,0.000217,0.000217,0.000217,0.000217,0.000217,0.000217,0.000217,0.000217,...,0.000217,0.000217,0.000217,0.000217,0.139139,0.51756,0.000217,0.000217,0.000217,0.000217
2016.1.01372.S,0.000256,0.000256,0.029661,0.000256,0.000256,0.000256,0.000256,0.000256,0.000256,0.000256,...,0.000256,0.000256,0.000256,0.000256,0.000256,0.000256,0.000256,0.000256,0.000256,0.000256
2019.1.01398.S,0.000202,0.115943,0.000202,0.000202,0.000202,0.000202,0.000202,0.000202,0.000202,0.000202,...,0.000202,0.000202,0.000202,0.000202,0.063503,0.707017,0.000202,0.000202,0.000202,0.000202
2017.1.01230.S,0.000222,0.000222,0.000222,0.000222,0.000222,0.000222,0.000222,0.000222,0.000222,0.137161,...,0.000222,0.000222,0.000222,0.000222,0.000222,0.348337,0.000222,0.000222,0.000222,0.000222
2021.1.00045.S,0.000171,0.000171,0.000171,0.000171,0.000171,0.058735,0.000171,0.000171,0.000171,0.000171,...,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.293903


In [23]:
test_texts = pd.DataFrame(test_texts)

### Group documents to highest matching topic

Combine project topic vector frames

In [24]:
proj_topics = pd.concat([train_doc_topic, test_doc_topic])
proj_topics

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
2022.1.01108.S,0.000235,0.000235,0.000235,0.000235,0.000235,0.000235,0.151758,0.000235,0.346698,0.000235,...,0.000235,0.000235,0.000235,0.000235,0.000235,0.000235,0.000235,0.000235,0.000235,0.000235
2021.1.01495.S,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,...,0.000171,0.000171,0.150745,0.000171,0.000171,0.219299,0.000171,0.000171,0.000171,0.000171
2021.1.00055.S,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.204172,...,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161,0.000161
2021.2.00056.S,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,...,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194
2018.A.00068.T,0.000175,0.059577,0.000175,0.000175,0.000175,0.128074,0.200258,0.000175,0.000175,0.000175,...,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.145325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019.1.01461.S,0.000182,0.000182,0.000182,0.000182,0.000182,0.000182,0.000182,0.000182,0.030220,0.000182,...,0.000182,0.000182,0.000182,0.000182,0.000182,0.077800,0.000182,0.000182,0.000182,0.000182
2023.1.01206.S,0.000183,0.000183,0.000183,0.000183,0.015246,0.000183,0.000183,0.000183,0.000183,0.268889,...,0.090452,0.000183,0.000183,0.000183,0.000183,0.302937,0.000183,0.000183,0.000183,0.000183
2021.1.00726.S,0.000190,0.000190,0.000190,0.000190,0.000190,0.000190,0.000190,0.000190,0.000190,0.000190,...,0.000190,0.000190,0.000190,0.000190,0.000190,0.000190,0.000190,0.000190,0.000190,0.000190
2017.1.00261.S,0.000175,0.153244,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.076936,...,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.000175,0.083825,0.000175


Take highest matching topic for each project

In [25]:
proj_topics['max_topic'] = proj_topics.apply(lambda x: x.argmax(), axis=1)

Create data frame with project id and max topic

In [26]:
proj_max_topic = proj_topics['max_topic'].to_frame()
proj_max_topic

Unnamed: 0,max_topic
2022.1.01108.S,8
2021.1.01495.S,21
2021.1.00055.S,21
2021.2.00056.S,23
2018.A.00068.T,20
...,...
2019.1.01461.S,21
2023.1.01206.S,45
2021.1.00726.S,21
2017.1.00261.S,12


### Add `max_topic` to `measurements` frame to be able to group measurements by max topic

In [27]:
measurements = pd.merge(measurements, proj_max_topic, left_index=True, right_index=True)

In [28]:
proj_max_topic.value_counts().describe()

count     50.000000
mean      65.660000
std       87.445986
min        4.000000
25%       14.500000
50%       28.500000
75%       87.250000
max      419.000000
Name: count, dtype: float64

There are a few topics that match to a large number of documents. Perhaps we need a better topic model or to group documents by project_topic vector similarity.

Eyeball comparison of documents by max topic. This requires looking at the online explorer since printing out abstracts in here gets messy.

In [29]:
proj_max_topic[proj_max_topic.max_topic == 3].head()

Unnamed: 0,max_topic
2019.1.00799.S,3
2023.1.01573.S,3
2019.A.00023.S,3
2017.1.00575.S,3
2013.1.01194.S,3


### Generate test projects measurements
This will be useful for calculating hit rates to evaluate model performance.

**NOTE!!!**
You should not sort these, however tempting. We need to preserve the relationships of the entries to not lose measurement information.

In [30]:
test_proj_meas = measurements.loc[test_texts.index]
test_proj_meas = test_proj_meas.groupby(test_proj_meas.index)\
    .agg({
        'low_freq': lambda x: round(x, 4).tolist(),
        'high_freq': lambda x: round(x, 4).tolist(),
        'med_freq': lambda x: round(x, 4).tolist(),
        'diff_freq': lambda x: round(x, 4).tolist()
    })
test_proj_meas.head()

Unnamed: 0_level_0,low_freq,high_freq,med_freq,diff_freq
project_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011.0.00010.S,"[90.38, 90.7, 91.69, 92.89, 217.59, 218.67, 21...","[90.62, 90.93, 91.92, 93.12, 218.53, 219.6, 21...","[90.5, 90.815, 91.805, 93.005, 218.06, 219.135...","[0.24, 0.23, 0.23, 0.23, 0.94, 0.93, 0.94, 0.9..."
2011.0.00121.S,"[319.07, 320.48, 319.83, 319.36, 319.71, 316.59]","[320.94, 322.35, 321.71, 321.24, 321.58, 318.47]","[320.005, 321.415, 320.77, 320.3, 320.645, 317...","[1.87, 1.87, 1.88, 1.88, 1.87, 1.88]"
2011.0.00133.S,"[330.24, 332.19, 342.24, 344.24]","[332.12, 334.07, 344.12, 346.12]","[331.18, 333.13, 343.18, 345.18]","[1.88, 1.88, 1.88, 1.88]"
2011.0.00191.S,"[343.04, 344.91, 355.04, 356.91, 343.08, 344.9...","[344.91, 346.79, 356.91, 358.79, 344.96, 346.8...","[343.975, 345.85, 355.975, 357.85, 344.02, 345...","[1.87, 1.88, 1.87, 1.88, 1.88, 1.89, 1.88, 1.89]"
2011.0.00210.S,"[219.42, 219.81, 230.4, 231.18, 219.44, 219.82...","[219.66, 220.05, 230.64, 231.42, 219.67, 220.0...","[219.54, 219.93, 230.52, 231.3, 219.555, 219.9...","[0.24, 0.24, 0.24, 0.24, 0.23, 0.24, 0.24, 0.23]"


### Generate train topic measurements
We will use these to engineer 'areas of interest' among topics using DBSCAN

**NOTE!!!**
You should not sort these, however tempting. We need to preserve the relationships of the entries to not lose measurement information.

In [31]:
train_topic_freqs = measurements.loc[train_texts.index]\
    .groupby('max_topic')\
    .agg({
        'low_freq': lambda x: round(x, 4).tolist(),
        'high_freq': lambda x: round(x, 4).tolist(),
        'med_freq': lambda x: round(x, 4).tolist(),
        'diff_freq': lambda x: round(x, 4).tolist(),
        'band': lambda x: x.astype('int64').tolist()
    })
train_topic_freqs

Unnamed: 0_level_0,low_freq,high_freq,med_freq,diff_freq,band
max_topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"[212.19, 214.19, 226.19, 228.19, 225.52, 239.6...","[214.06, 216.06, 228.06, 230.06, 227.52, 241.6...","[213.125, 215.125, 227.125, 229.125, 226.52, 2...","[1.87, 1.87, 1.87, 1.87, 2.0, 2.0, 1.87, 1.87,...","[6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, ..."
1,"[477.19, 478.99, 489.19, 491.14, 477.18, 489.1...","[479.19, 480.99, 491.19, 493.14, 479.18, 491.1...","[478.19, 479.99, 490.19, 492.14, 478.18, 490.1...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ..."
2,"[100.22, 101.72, 112.46, 114.71, 95.72, 96.97,...","[102.1, 103.6, 114.33, 115.64, 97.6, 98.84, 10...","[101.16, 102.66, 113.395, 115.175, 96.66, 97.9...","[1.88, 1.88, 1.87, 0.93, 1.88, 1.87, 1.87, 1.8...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 8, 3, 3, ..."
3,"[186.31, 186.35, 186.61, 187.56, 187.86, 198.7...","[186.37, 186.4, 186.67, 187.61, 189.74, 198.78...","[186.34, 186.375, 186.64, 187.585, 188.8, 198....","[0.06, 0.05, 0.06, 0.05, 1.88, 0.06, 0.06, 0.0...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, ..."
4,"[330.56, 345.76, 330.55, 219.53, 220.37, 230.5...","[330.61, 345.82, 330.61, 219.59, 220.43, 230.5...","[330.585, 345.79, 330.58, 219.56, 220.4, 230.5...","[0.05, 0.06, 0.06, 0.06, 0.06, 0.05, 0.06, 0.1...","[7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ..."
5,"[94.56, 95.14, 107.67, 109.86, 216.09, 218.09,...","[96.43, 95.19, 109.54, 110.33, 217.96, 219.96,...","[95.495, 95.165, 108.605, 110.095, 217.025, 21...","[1.87, 0.05, 1.87, 0.47, 1.87, 1.87, 1.87, 1.8...","[3, 3, 3, 3, 6, 6, 6, 6, 3, 3, 3, 3, 7, 7, 7, ..."
6,"[343.33, 345.14, 114.66, 283.67, 285.55, 308.2...","[345.21, 347.01, 115.59, 285.54, 287.41, 310.1...","[344.27, 346.075, 115.125, 284.605, 286.48, 30...","[1.88, 1.87, 0.93, 1.87, 1.86, 1.86, 1.87, 1.8...","[7, 7, 3, 7, 7, 7, 7, 6, 6, 3, 3, 3, 6, 6, 6, ..."
7,"[95.81, 95.87, 95.92, 96.71, 98.47, 98.56, 107...","[95.86, 95.93, 95.98, 96.77, 98.53, 98.62, 109...","[95.835, 95.9, 95.95, 96.74, 98.5, 98.59, 108....","[0.05, 0.06, 0.06, 0.06, 0.06, 0.06, 1.87, 0.0...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
8,"[219.52, 219.91, 220.36, 230.5, 232.0, 219.52,...","[219.58, 219.97, 220.42, 230.56, 234.0, 219.59...","[219.55, 219.94, 220.39, 230.53, 233.0, 219.55...","[0.06, 0.06, 0.06, 0.06, 2.0, 0.07, 0.07, 0.07...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, ..."
9,"[254.36, 256.24, 269.54, 271.42, 228.43, 229.8...","[256.23, 258.11, 271.41, 273.29, 230.3, 231.67...","[255.295, 257.175, 270.475, 272.355, 229.365, ...","[1.87, 1.87, 1.87, 1.87, 1.87, 1.87, 1.88, 1.9...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ..."


### Use KNN to find $\epsilon$ for DBSCAN
The DBSCAN papers suggest using the elbow method with $k=2*\text{num dimensions}-1$ to find the optimal $\epsilon$

In our case since we're only using 1 dimension we use $k=2$

### Try mining a topic by hand

In [32]:
inspect_topic = 24

In [33]:
# db = DBSCAN(eps=0.25, min_samples=2).fit(list(zip(train_topic_freqs.loc[inspect_topic].med_freq)))
db = HDBSCAN(max_cluster_size=200, min_cluster_size=5).fit(list(zip(train_topic_freqs.loc[inspect_topic].med_freq)))

In [34]:
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_points = len(list(labels))
n_noise = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters)
print("Total number of measurements: %d" % n_points)
print("Estimated number of noise measurements: %d" % n_noise)
print(f"Noise percentage: {round(list(labels).count(-1)/labels.shape[0], 3)}")
print(f'Signal to noise ratio: {1-round(list(labels).count(-1)/labels.shape[0], 3)}')

Estimated number of clusters: 12
Total number of measurements: 112
Estimated number of noise measurements: 10
Noise percentage: 0.089
Signal to noise ratio: 0.911


### Assign labels to measurements for topic and find cluster centers

In [35]:
selected_topic = pd.DataFrame(train_topic_freqs.med_freq.loc[inspect_topic],
                              train_topic_freqs.band.loc[inspect_topic])\
    .reset_index()
selected_topic.columns = ['band', 'med_freq']
selected_topic['cluster_label'] = labels

In [36]:
selected_topic

Unnamed: 0,band,med_freq,cluster_label
0,7,346.500,10
1,7,346.630,10
2,7,347.710,11
3,7,348.090,11
4,7,348.360,-1
...,...,...,...
107,6,259.605,2
108,6,260.230,2
109,6,260.920,2
110,6,261.840,2


### Option 2 CURRENT IMPLEMENTATION: Take min and max of frequency median by cluster to generate 'areas of interest'.

In [37]:
for i in range(0,1):
    print(i)

0


In [66]:
topic_cluster = selected_topic.groupby('cluster_label').agg(
    mean_freq=('med_freq', 'mean'),
    min_freq=('med_freq', 'min'),
    max_freq=('med_freq', 'max'),
    count_freq=('med_freq', 'count'),
    band_min=('band', 'min'),
    band_max=('band', 'max')
).sort_values('count_freq', ascending=False)
topic_cluster.sort_index()


Unnamed: 0_level_0,mean_freq,min_freq,max_freq,count_freq,band_min,band_max
cluster_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-1,335.4455,224.71,349.795,10,6,7
0,333.651111,332.07,337.96,9,7,7
1,358.31125,354.5,360.3,8,7,7
2,260.0575,258.265,262.255,8,6,6
3,237.145,226.34,242.37,9,6,6
4,155.189,153.28,156.97,10,4,4
5,104.8375,104.475,105.035,6,3,3
6,92.8225,91.585,95.21,6,3,3
7,145.002222,141.435,147.3,9,4,4
8,134.945,132.96,136.15,8,4,4


In [68]:
# Code to check for clusters that span at least two bands
new_rows = []
bad_rows = []
for clst in topic_cluster.index:
    if topic_cluster.loc[clst].band_min != topic_cluster.loc[clst].band_max:
        olap_clst = clst
        olap_band_min = topic_cluster.loc[clst].band_min.astype('int64')
        olap_band_max = topic_cluster.loc[clst].band_max.astype('int64')
        olap_min_freq = topic_cluster.loc[clst].min_freq
        olap_max_freq = topic_cluster.loc[clst].max_freq

        # Check to see that the cluster doesn't span more than two bands
        # If it does to this the cluster is far too large and clusters need to be tuned better
        if olap_band_max - olap_band_min > 1:
            raise ValueError('Cluster spans more than 2 bands. Re-parameterize clusters.')
        
        # Otherwise, we split the cluster into two different clusters on the band boundaries
        new_row1_min_freq = olap_min_freq
        new_row1_max_freq = band_cutoffs[olap_band_max-1]
        new_row2_min_freq = band_cutoffs[olap_band_max-1]
        new_row2_max_freq = olap_max_freq
        reassign_measures = selected_topic[selected_topic.cluster_label == clst].med_freq\
            .sort_values()\
            .to_list()
        new_row1_measures = []
        new_row2_measures = []

        # Loop over reassign_measures and build lists for each new cluster
        for meas in reassign_measures:
            if meas <= new_row2_min_freq:
                new_row1_measures.append(meas)
            else:
                new_row2_measures.append(meas)
        
        # Generate column values for new rows (clusters)
        new_row1_count = len(new_row1_measures)
        new_row2_count = len(new_row2_measures)
        new_row1_mean = np.mean(new_row1_measures)
        new_row2_mean = np.mean(new_row2_measures)
        new_row1_band_min = olap_band_min
        new_row1_band_max = olap_band_min
        new_row2_band_min = olap_band_max
        new_row2_band_max = olap_band_max

        # Make new row lists to add to data frame
        new_row1 = [new_row1_mean,
                    new_row1_min_freq,
                    new_row1_max_freq,
                    new_row1_count,
                    new_row1_band_min,
                    new_row1_band_max
                    ]
        
        new_row2 = [new_row2_mean,
            new_row2_min_freq,
            new_row2_max_freq,
            new_row2_count,
            new_row2_band_min,
            new_row2_band_max
            ]
        
        # Add new rows (clusters) to list to ultimately alter topic_cluster
        # We don't want to alter the data frame we're looping over in the loop
        new_rows.append(new_row1)
        new_rows.append(new_row2)
        bad_rows.append(clst)

    # Drop and add affected rows
    if len(bad_rows) != 0:
        for br in range(len(bad_rows)):
            topic_cluster = topic_cluster.drop(bad_rows[br], axis=0)
        for nr in range(len(new_rows)):
            topic_cluster.loc[len(topic_cluster.index)] = new_rows[nr]
    
    topic_cluster = topic_cluster.reset_index()


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


ValueError: Must have equal len keys and value when setting with an iterable

In [69]:
topic_cluster.sort_index()

Unnamed: 0,index,cluster_label,mean_freq,min_freq,max_freq,count_freq,band_min,band_max
0,0,10,346.579167,346.5,346.66,12,7,7
2,2,4,155.189,153.28,156.97,10,4,4
3,3,9,344.665,344.2,345.34,10,7,7
4,4,0,333.651111,332.07,337.96,9,7,7
5,5,3,237.145,226.34,242.37,9,6,6
6,6,7,145.002222,141.435,147.3,9,4,4
7,7,1,358.31125,354.5,360.3,8,7,7
8,8,2,260.0575,258.265,262.255,8,6,6
9,9,8,134.945,132.96,136.15,8,4,4
10,10,11,347.620714,347.215,348.13,7,7,7
