In [24]:
%matplotlib widget
from collections import defaultdict
import glob
import sys
sys.path.append('../')

from joblib import dump, load
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
import pacman_classes
from utils import tokenizer

from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

In [9]:
proposal_cycle = '25'
proposal_training_data_dir = f'../training_data/Cycle{proposal_cycle}'

First we read in the hand classifications for each proposal and then match them with their training data.

In [10]:
proposal_classifications = pd.read_csv(f'{proposal_training_data_dir}/cycle_{proposal_cycle}_hand_classifications.txt')
proposal_classifications.head()

Unnamed: 0,proposal_num,hand_classification
0,1,stellar physics and stellar types
1,2,galaxies
2,3,stellar physics and stellar types
3,4,stellar physics and stellar types
4,5,stellar populations and the interstellar medium


Generate a list of files from the training data for the specified proposal cycle.
In order to match these files to their new classifications using updated science cataegories, we need to parse their proposal numbers from the filename, then sort them by proposal number.

In [14]:
flist = glob.glob(f"{proposal_training_data_dir}/*training.txt") 
proposal_numbers = [int(val.split('/')[-1].split('_')[0]) for val in flist]
flist_and_pnum = list(zip(flist, proposal_numbers))
flist_and_pnum.sort(key=lambda val: val[1])
flist_sorted, proposal_num = list(zip(*flist_and_pnum))

Now we combine the sorted proposal list with their classifications

In [15]:
hand_classified_null = proposal_classifications[proposal_classifications['hand_classification'].isnull()]
print(f"Total number of missing proposals generated from the pdf abstract list: {len(hand_classified_null)}")
a = np.ediff1d(proposal_num)
idx = list(map(int, np.where(a>1)[0]))
missing_proposals = [proposal_num[val]+1 for val in idx]
print(f"Total number of missing proposals generated using the scraped proposal texts: {len(missing_proposals)}")

# Generate a new columnn to store the filenames and initialize it with NaNs
proposal_classifications['fname'] = [np.nan]*len(proposal_classifications)

# Loop through each proposal and update the dataframe with the filename
for num, fname in zip(proposal_num, flist_sorted):
    proposal_classifications['fname'].loc[num-1] = fname

Total number of missing proposals generated from the pdf abstract list: 292
Total number of missing proposals generated using the scraped proposal texts: 14


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [16]:
for i, (f, proposal_num, cls) in enumerate(zip(proposal_classifications['fname'],proposal_classifications['proposal_num'],proposal_classifications['hand_classification'])):
    try:
        print(f.split('/')[-1], proposal_num, cls)
    except AttributeError as e:
        print(f, proposal_num, cls)
    if i+1==19:
        break

0001_training.txt 1 stellar physics and stellar types
0002_training.txt 2 galaxies
0003_training.txt 3 stellar physics and stellar types
0004_training.txt 4 stellar physics and stellar types
0005_training.txt 5 stellar populations and the interstellar medium
0006_training.txt 6 stellar populations and the interstellar medium
0007_training.txt 7 galaxies
0008_training.txt 8 stellar physics and stellar types
0009_training.txt 9 intergalactic medium and the circumgalactic medium
0010_training.txt 10 large scale structure of the universe
0011_training.txt 11 stellar physics and stellar types
0012_training.txt 12 galaxies
0013_training.txt 13 stellar populations and the interstellar medium
0014_training.txt 14 stellar populations and the interstellar medium
0015_training.txt 15 large scale structure of the universe
0016_training.txt 16 intergalactic medium and the circumgalactic medium
0017_training.txt 17 supermassive black holes and active galaxies
nan 18 nan
0019_training.txt 19 galaxies

Now we need to generate a transformation that maps our science categories to a unique integer in the set {0,1,..,$N_{cat}$-1}.
To do this we will use the `LabelEncoder` class in scikit-learn, but before doing so we need to filter out any row containing NaN. Once we've generated the encoding, we add a column to the DataFrame containing the encoded values

In [17]:
# Drop any rows that have nan
final_df = proposal_classifications.dropna()
encoder = LabelEncoder()
encoder.fit(final_df['hand_classification'])
nl = '\n'
print(f"The identified classes are:\n{nl.join(encoder.classes_)}")
encoded_values = encoder.transform(final_df['hand_classification'])
final_df['encoded_classification'] = encoded_values

The identified classes are:
exoplanets and exoplanet formation
galaxies
intergalactic medium and the circumgalactic medium
large scale structure of the universe
solar system astronomy
stellar physics and stellar types
stellar populations and the interstellar medium
supermassive black holes and active galaxies


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['encoded_classification'] = encoded_values


In [18]:
final_df.head()

Unnamed: 0,proposal_num,hand_classification,fname,encoded_classification
0,1,stellar physics and stellar types,../training_data/Cycle25/0001_training.txt,5
1,2,galaxies,../training_data/Cycle25/0002_training.txt,1
2,3,stellar physics and stellar types,../training_data/Cycle25/0003_training.txt,5
3,4,stellar physics and stellar types,../training_data/Cycle25/0004_training.txt,5
4,5,stellar populations and the interstellar medium,../training_data/Cycle25/0005_training.txt,6


Now that we have a proper encoding for our hand classified proposals, we need to read in each file to extract the text. Once all the files have been processed, we merge the returned `text_df` with the `final_df` from above. This returns a dataframe containing everything we need for processing

In [23]:
text_df = pacman_classes.read_in_dataset(flist=final_df['fname'].values, parallel=False)

TypeError: preprocess() missing 1 required positional argument: 'self'

In [10]:
text_df['text'][1][:550]

'Dwarf galaxies were the first systems to form stars within the Universe. An understanding of their starformation histories (SFH) across cosmic time is therefore imperative for galaxy formation and evolution studies. In particular, understanding how the most metal-poor of these systems, blue compact dwarf (BCD) galaxies, remain chemically pristine despite long periods of moderate SF with a recent burst remains a challenge. We are also yet to understand how starbursting BCDs and quiescent dwarf irregulars (dIrrs) show similar SFHs, despite an ord'

In [11]:
text_df['cleaned_text'][0][:100]

'hubble space telescope hst instrumental elucidate nature intriguing superluminous supernovae slsne e'

In [12]:
combined_df = pd.merge(final_df, text_df, on='fname')
combined_df.head()

Unnamed: 0,proposal_num,hand_classification,fname,encoded_classification,text,cleaned_text
0,1,stellar physics,/home/nmiles/PACMan_dist/training_data/trainin...,4,The Hubble Space Telescope (HST) has been inst...,hubble space telescope hst instrumental elucid...
1,2,galaxies and the igm,/home/nmiles/PACMan_dist/training_data/trainin...,0,Dwarf galaxies were the first systems to form ...,dwarf galaxy form star universe understanding ...
2,3,stellar populations and the ism,/home/nmiles/PACMan_dist/training_data/trainin...,5,The Galactic stellar populations are moving th...,galactic stellar population interstellar mediu...
3,4,stellar physics,/home/nmiles/PACMan_dist/training_data/trainin...,4,We propose to compute state-of-the-art model a...,propose compute state art model atmosphere pho...
4,5,stellar populations and the ism,/home/nmiles/PACMan_dist/training_data/trainin...,5,Hypervelocity stars (HVS) are young stellar ob...,hypervelocity star hvs young stellar object ex...


The first step in our pipeline will be to generate a vocabulary using the entire corpus, then we will turn our documents into vectors using the term frequency inverse document frequency method (tf-idf).

In [25]:
tfidf_vect = TfidfVectorizer(
    max_features=10000,
    use_idf=True,
    norm='l2',
    ngram_range=(1, 2)
)

In [26]:
tfidf_vectorizer_vectors = tfidf_vect.fit_transform(combined_df['cleaned_text'])

NameError: name 'combined_df' is not defined

In [27]:
combined_df.loc[0]

NameError: name 'combined_df' is not defined

In [28]:
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[1]

#place tf-idf values in a pandas data frame
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vect.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

NameError: name 'tfidf_vectorizer_vectors' is not defined

In [29]:
nb_tfidf = Pipeline(
    [('vect', tfidf_vect),
     ('clf', MultinomialNB(alpha=0.05))]
)

In [30]:
cnb_tfidf = Pipeline(
    [('vect', tfidf_vect),
    ('clf', ComplementNB(alpha=0.05))]
)

In [31]:
nb_tfidf.fit(combined_df['cleaned_text'], combined_df['encoded_classification'])

NameError: name 'combined_df' is not defined

Ok, so we trained a NB classifier on the cleaned text and the encoded classifications. The next step is to read in all of training data for cycle 24 and make predictions using the trained classifier. Once we make our predictions, we need to compare them to the truth set 

In [32]:
cnb_tfidf.fit(combined_df['cleaned_text'], combined_df['encoded_classification'])

NameError: name 'combined_df' is not defined

Save the fitted classifiers to disk!

In [33]:
dump(nb_tfidf, 'nb_tfidf_cls.joblib') 

['nb_tfidf_cls.joblib']

In [34]:
read_in_cls = load('nb_tfidf_cls.joblib')

In [35]:
read_in_cls

Pipeline(steps=[('vect',
                 TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
                ('clf', MultinomialNB(alpha=0.05))])

In [36]:
proposal_classifications_cy24 = pd.read_csv('/Users/nmiles/PACMan_dist/training_data/training_corpus_cy24/cycle_24_hand_classifications.txt')
proposal_classifications_cy24_nonans = proposal_classifications_cy24.dropna()
print(proposal_classifications_cy24_nonans.head())
encoded_values_cy24 = encoder.transform(proposal_classifications_cy24_nonans['hand_classification'])
proposal_classifications_cy24_nonans['true_classification_encoded'] = encoded_values_cy24
print(proposal_classifications_cy24_nonans.head())

FileNotFoundError: [Errno 2] No such file or directory: '/Users/nmiles/PACMan_dist/training_data/training_corpus_cy24/cycle_24_hand_classifications.txt'

In [37]:
flist_test = glob.glob(f"/Users/nmiles/PACMan_dist/training_data/training_corpus_cy24/*training.txt") 
proposal_numbers_test = [int(val.split('/')[-1].split('_')[0]) for val in flist_test]
flist_and_pnum_test = list(zip(flist_test, proposal_numbers_test))
flist_and_pnum_test.sort(key=lambda val: val[1])
flist_test_sorted, proposal_num_test = list(zip(*flist_and_pnum_test))

ValueError: not enough values to unpack (expected 2, got 0)

In [29]:
test_text_df = pacman2020.read_in_dataset(flist=flist_test_sorted, parallel=True)

INFO [pacman2020.read_in_dataset:177] Reading in 1093 proposals...


[########################################] | 100% Completed | 16min  4.7s


INFO [pacman2020.read_in_dataset:204] Total time for preprocessing: 16.087


In [30]:
test_text_df.head()

Unnamed: 0,text,cleaned_text,fname
0,PG1159 stars are H-deficient (pre-) white dwar...,pg1159 star h deficient pre- white dwarf surfa...,/Users/nmiles/PACMan_dist/training_data/traini...
1,"On December 7, 2015 JAXA`s Akatsuki/Venus Clim...",december jaxa`s akatsuki venus climate orbiter...,/Users/nmiles/PACMan_dist/training_data/traini...
2,The vacuum ultraviolet (UV) contains many stro...,vacuum ultraviolet uv contain strong transitio...,/Users/nmiles/PACMan_dist/training_data/traini...
3,One of the most outstanding issues in exoplane...,outstanding exoplanet characterization underst...,/Users/nmiles/PACMan_dist/training_data/traini...
4,WFC3 extends to the infrared the capacity of H...,wfc3 extend infrared capacity hst resolve red ...,/Users/nmiles/PACMan_dist/training_data/traini...


In [31]:
nb_predictions = read_in_cls.predict(test_text_df['cleaned_text'])
nb_prediction_probabilities = nb_tfidf.predict_proba(test_text_df['cleaned_text'])

In [32]:
cnb_predictions = cnb_tfidf.predict(test_text_df['cleaned_text'])
cnb_prediction_probabilities = nb_tfidf.predict(test_text_df['cleaned_text'])

In [33]:
encoder.classes_

array(['galaxies and the igm', 'large scale structure of the universe',
       'planets and planet formation', 'solar system', 'stellar physics',
       'stellar populations and the ism',
       'supermassive black holes and active galaxies'], dtype=object)

In [34]:
for i, p in enumerate(nb_prediction_probabilities[1]):
    print(f"class: {encoder.classes_[i]}, probability: {p:0.2%}")

class: galaxies and the igm, probability: 0.04%
class: large scale structure of the universe, probability: 0.09%
class: planets and planet formation, probability: 61.60%
class: solar system, probability: 37.29%
class: stellar physics, probability: 0.71%
class: stellar populations and the ism, probability: 0.19%
class: supermassive black holes and active galaxies, probability: 0.08%


In [35]:
final_test_df = test_text_df.loc[:,['fname']]

In [36]:
final_test_df['proposal_num'] = proposal_num_test
final_test_df['nb_encoded_classification'] = nb_predictions
final_test_df['cnb_encoded_classification'] = cnb_predictions
final_test_df['nb_classes'] = [encoder.classes_[val] for val in nb_predictions]
final_test_df['cnb_classes'] = [encoder.classes_[val] for val in cnb_predictions]

In [37]:
final_test_df

Unnamed: 0,fname,proposal_num,nb_encoded_classification,cnb_encoded_classification,nb_classes,cnb_classes
0,/Users/nmiles/PACMan_dist/training_data/traini...,1,4,4,stellar physics,stellar physics
1,/Users/nmiles/PACMan_dist/training_data/traini...,7,2,2,planets and planet formation,planets and planet formation
2,/Users/nmiles/PACMan_dist/training_data/traini...,12,4,4,stellar physics,stellar physics
3,/Users/nmiles/PACMan_dist/training_data/traini...,13,2,2,planets and planet formation,planets and planet formation
4,/Users/nmiles/PACMan_dist/training_data/traini...,15,1,1,large scale structure of the universe,large scale structure of the universe
...,...,...,...,...,...,...
1088,/Users/nmiles/PACMan_dist/training_data/traini...,1109,4,4,stellar physics,stellar physics
1089,/Users/nmiles/PACMan_dist/training_data/traini...,1110,2,2,planets and planet formation,planets and planet formation
1090,/Users/nmiles/PACMan_dist/training_data/traini...,1111,0,0,galaxies and the igm,galaxies and the igm
1091,/Users/nmiles/PACMan_dist/training_data/traini...,1112,0,0,galaxies and the igm,galaxies and the igm


In [38]:
merged_df = pd.merge(proposal_classifications_cy24_nonans, final_test_df, on='proposal_num')

In [39]:
merged_df

Unnamed: 0,proposal_num,hand_classification,true_classification_encoded,fname,nb_encoded_classification,cnb_encoded_classification,nb_classes,cnb_classes
0,1,stellar physics,4,/Users/nmiles/PACMan_dist/training_data/traini...,4,4,stellar physics,stellar physics
1,7,solar system,3,/Users/nmiles/PACMan_dist/training_data/traini...,2,2,planets and planet formation,planets and planet formation
2,12,stellar physics,4,/Users/nmiles/PACMan_dist/training_data/traini...,4,4,stellar physics,stellar physics
3,13,planets and planet formation,2,/Users/nmiles/PACMan_dist/training_data/traini...,2,2,planets and planet formation,planets and planet formation
4,15,galaxies and the igm,0,/Users/nmiles/PACMan_dist/training_data/traini...,1,1,large scale structure of the universe,large scale structure of the universe
...,...,...,...,...,...,...,...,...
1088,1109,stellar physics,4,/Users/nmiles/PACMan_dist/training_data/traini...,4,4,stellar physics,stellar physics
1089,1110,planets and planet formation,2,/Users/nmiles/PACMan_dist/training_data/traini...,2,2,planets and planet formation,planets and planet formation
1090,1111,galaxies and the igm,0,/Users/nmiles/PACMan_dist/training_data/traini...,0,0,galaxies and the igm,galaxies and the igm
1091,1112,galaxies and the igm,0,/Users/nmiles/PACMan_dist/training_data/traini...,0,0,galaxies and the igm,galaxies and the igm


In [40]:
merged_df.head()

Unnamed: 0,proposal_num,hand_classification,true_classification_encoded,fname,nb_encoded_classification,cnb_encoded_classification,nb_classes,cnb_classes
0,1,stellar physics,4,/Users/nmiles/PACMan_dist/training_data/traini...,4,4,stellar physics,stellar physics
1,7,solar system,3,/Users/nmiles/PACMan_dist/training_data/traini...,2,2,planets and planet formation,planets and planet formation
2,12,stellar physics,4,/Users/nmiles/PACMan_dist/training_data/traini...,4,4,stellar physics,stellar physics
3,13,planets and planet formation,2,/Users/nmiles/PACMan_dist/training_data/traini...,2,2,planets and planet formation,planets and planet formation
4,15,galaxies and the igm,0,/Users/nmiles/PACMan_dist/training_data/traini...,1,1,large scale structure of the universe,large scale structure of the universe


In [41]:
for i, val in enumerate(nb_prediction_probabilities[0]):
    print(f"{encoder.classes_[i]}: {val:.2%}")

galaxies and the igm: 0.00%
large scale structure of the universe: 0.00%
planets and planet formation: 0.00%
solar system: 0.00%
stellar physics: 99.65%
stellar populations and the ism: 0.34%
supermassive black holes and active galaxies: 0.00%


In [43]:
data_out = defaultdict(list)
for i, row in merged_df.iterrows():
    data_out['fname'].append(row['fname'])
    data_out['nb_encoded_classification'].append(row['nb_encoded_classification'])
    data_out['nb_classes'].append(row['nb_classes'])
    data_out['hand_classification'].append(row['hand_classification'])
    data_out['hand_encoded_classification'].append(row['true_classification_encoded'])
    for j, class_prob in enumerate(nb_prediction_probabilities[i]):
        data_out[f"{encoder.classes_[j].replace(' ','_')}_prob"].append(class_prob)

In [44]:
df = pd.DataFrame(data_out)

In [45]:
df.head()

Unnamed: 0,fname,nb_encoded_classification,nb_classes,hand_classification,hand_encoded_classification,galaxies_and_the_igm_prob,large_scale_structure_of_the_universe_prob,planets_and_planet_formation_prob,solar_system_prob,stellar_physics_prob,stellar_populations_and_the_ism_prob,supermassive_black_holes_and_active_galaxies_prob
0,/Users/nmiles/PACMan_dist/training_data/traini...,4,stellar physics,stellar physics,4,5.029069e-06,2.659886e-06,7e-06,6.907907e-06,0.9965369,0.003439741,1.297075e-06
1,/Users/nmiles/PACMan_dist/training_data/traini...,2,planets and planet formation,solar system,3,0.00043422,0.0009483614,0.615979,0.3729015,0.007077454,0.001869559,0.0007896306
2,/Users/nmiles/PACMan_dist/training_data/traini...,4,stellar physics,stellar physics,4,6.921021e-05,1.004414e-06,0.000177,7.687057e-06,0.9991847,0.0005347413,2.591507e-05
3,/Users/nmiles/PACMan_dist/training_data/traini...,2,planets and planet formation,planets and planet formation,2,2.686037e-10,8.159139e-09,1.0,3.410443e-07,1.30547e-07,6.286491e-09,1.114522e-09
4,/Users/nmiles/PACMan_dist/training_data/traini...,1,large scale structure of the universe,galaxies and the igm,0,0.02823897,0.9190719,3e-06,4.298953e-05,0.0005872466,0.05188218,0.0001737468


In [46]:
df['hand_classification'].value_counts()

galaxies and the igm                            335
stellar physics                                 261
supermassive black holes and active galaxies    131
planets and planet formation                    130
stellar populations and the ism                 116
solar system                                     60
large scale structure of the universe            60
Name: hand_classification, dtype: int64

In [None]:
df.to_csv('cycle_24_classification_results.txt', header=True, index=False)

In [None]:
#final_test_df.loc[:,['proposal_num', 'classification']].to_csv('Cycle24_classification_predictions.txt', header=True, index=False)

The last step is to evaluate the performance of the classifier.

We will do this in two ways:
- Using the classification metrics in scikit-learn
     - This will simply look to see how many times the most probable classification matched the true classification.
- Computing them by hand using a custom metric
    - Here we will count a proposal as correctly classified if the top two most probable classifications contains the true classification

In [60]:
roc = roc_auc_score(merged_df['true_classification_encoded'], merged_df['nb_encoded_classification'])

ValueError: multiclass format is not supported

In [47]:
# Multinomial Naive Bayes Results
print(classification_report(merged_df['true_classification_encoded'], merged_df['nb_encoded_classification'], target_names=encoder.classes_))

                                              precision    recall  f1-score   support

                        galaxies and the igm       0.84      0.81      0.83       335
       large scale structure of the universe       0.50      0.57      0.53        60
                planets and planet formation       0.86      0.96      0.91       130
                                solar system       0.98      0.80      0.88        60
                             stellar physics       0.91      0.89      0.90       261
             stellar populations and the ism       0.74      0.77      0.75       116
supermassive black holes and active galaxies       0.89      0.92      0.90       131

                                    accuracy                           0.84      1093
                                   macro avg       0.82      0.82      0.82      1093
                                weighted avg       0.85      0.84      0.84      1093



In [48]:
# Complement Naive Bayes Results
print(classification_report(merged_df['true_classification_encoded'], merged_df['cnb_encoded_classification'], target_names=encoder.classes_))

                                              precision    recall  f1-score   support

                        galaxies and the igm       0.82      0.86      0.84       335
       large scale structure of the universe       0.54      0.47      0.50        60
                planets and planet formation       0.79      0.97      0.87       130
                                solar system       0.98      0.70      0.82        60
                             stellar physics       0.91      0.89      0.90       261
             stellar populations and the ism       0.81      0.68      0.74       116
supermassive black holes and active galaxies       0.89      0.92      0.91       131

                                    accuracy                           0.84      1093
                                   macro avg       0.82      0.78      0.79      1093
                                weighted avg       0.84      0.84      0.83      1093



In [49]:
i, row = next(df.iterrows())

In [50]:
df.columns

Index(['fname', 'nb_encoded_classification', 'nb_classes',
       'hand_classification', 'hand_encoded_classification',
       'galaxies_and_the_igm_prob',
       'large_scale_structure_of_the_universe_prob',
       'planets_and_planet_formation_prob', 'solar_system_prob',
       'stellar_physics_prob', 'stellar_populations_and_the_ism_prob',
       'supermassive_black_holes_and_active_galaxies_prob'],
      dtype='object')

In [51]:
list(row[row.index.str.contains('prob')].sort_values(ascending=False)[:2].index)

['stellar_physics_prob', 'stellar_populations_and_the_ism_prob']

In [52]:
custom_accuracy = 0
custom_accuracy_dict = {}
for c in encoder.classes_:
    custom_accuracy_dict[c] = {}
for key in custom_accuracy_dict.keys():
    custom_accuracy_dict[key]['top'] = []
    custom_accuracy_dict[key]['top_two'] = []
    custom_accuracy_dict[key]['misclassified'] = []
for num, row in df.iterrows():
    hand_classification = row['hand_classification']
#     print(hand_classification)
    top_two = row[row.index.str.contains('prob')].sort_values(ascending=False)[:2]
    categories = list(top_two.index)
    categories = [val.replace('_prob','').replace('_',' ') for val in categories]
    probabilites = list(top_two.values)
    if hand_classification == categories[0]:
        custom_accuracy_dict[hand_classification]['top'].append(1)
        custom_accuracy +=1
    elif hand_classification in categories:
        custom_accuracy_dict[hand_classification]['top_two'].append(1)
        custom_accuracy +=1
    else:
        custom_accuracy_dict[hand_classification]['misclassified'].append(1)
#     break

Check to make sure all proposals are categorized as either `top`, `top_two`, or `misclassified`

In [58]:
acurracy = custom_accuracy/len(df)
print(f"{acurracy:.2%}")

95.52%


In [53]:
proposal_numbers = df['hand_classification'].value_counts()

In [54]:
proposal_numbers['galaxies and the igm']

335

In [55]:
computed_results = {'misclassified':[], 'top_two':[], 'top':[]}
index=[]

In [56]:
for cat in custom_accuracy_dict.keys():
    index.append(cat)
    for key in custom_accuracy_dict[cat].keys():
        num_per_key = sum(custom_accuracy_dict[cat][key])
        frac_of_dataset = num_per_key/proposal_numbers[cat]
        computed_results[key].append(frac_of_dataset)
        print(f"Total number of {cat} proposals in {key}: {num_per_key/proposal_numbers[cat]:.2%}")

Total number of galaxies and the igm proposals in top: 81.19%
Total number of galaxies and the igm proposals in top_two: 14.63%
Total number of galaxies and the igm proposals in misclassified: 4.18%
Total number of large scale structure of the universe proposals in top: 56.67%
Total number of large scale structure of the universe proposals in top_two: 30.00%
Total number of large scale structure of the universe proposals in misclassified: 13.33%
Total number of planets and planet formation proposals in top: 96.15%
Total number of planets and planet formation proposals in top_two: 2.31%
Total number of planets and planet formation proposals in misclassified: 1.54%
Total number of solar system proposals in top: 80.00%
Total number of solar system proposals in top_two: 13.33%
Total number of solar system proposals in misclassified: 6.67%
Total number of stellar physics proposals in top: 88.89%
Total number of stellar physics proposals in top_two: 8.43%
Total number of stellar physics prop

In [57]:
computed_results

{'misclassified': [0.041791044776119404,
  0.13333333333333333,
  0.015384615384615385,
  0.06666666666666667,
  0.02681992337164751,
  0.06896551724137931,
  0.04580152671755725],
 'top_two': [0.14626865671641792,
  0.3,
  0.023076923076923078,
  0.13333333333333333,
  0.0842911877394636,
  0.16379310344827586,
  0.03816793893129771],
 'top': [0.8119402985074626,
  0.5666666666666667,
  0.9615384615384616,
  0.8,
  0.8888888888888888,
  0.7672413793103449,
  0.916030534351145]}

In [None]:
computed_results_df = pd.DataFrame(computed_results, index=index)
computed_results_df = computed_results_df[['top','top_two','misclassified']]

In [None]:
computed_results_df.plot.barh(stacked=True)

In [None]:
custom_accuracy/len(df)

In [None]:
confusion_mat = confusion_matrix(merged_df['true_classification_encoded'], merged_df['nb_encoded_classification'])

In [None]:
confusion_mat

In [None]:
plot_confusion_matrix(merged_df['true_classification_encoded'], merged_df['nb_encoded_classification'], encoder.classes_, normalize=True)

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    fig.savefig('confusion_matrix_test.png', format='png', dpi=300)
    return ax