### Text Classification

This notebook walks through the machine-learning stage of the text classification process, which consists of three steps:

1) Encoding

2) Partitioning the dataset into distinct subgroups

3) Vectorization using Term Frequency–Inverse Document Frequency (TF-IDF)


In [1]:
%matplotlib widget
import glob
import sys
sys.path.append('../')

from astropy.visualization import ImageNormalize, LinearStretch, ZScaleInterval
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
from utils.tokenizer import PACManTokenizer
import pacman_classes
from pacman_classes import PACManPipeline

from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import os
from os import path

In [2]:
def read_category_label(fname):
    """Print the lines of the Cycle 25 hand-classification file next to `fname`.

    Parameters
    ----------
    fname : str
        Path to a file. Only its directory component is used; that directory
        is expected to contain ``Cycle25_hand_classifications.txt``.

    Returns
    -------
    None
        The file's lines (newlines included) are printed as a single list.

    Raises
    ------
    FileNotFoundError
        If the classification file does not exist in that directory.
    """
    # Build the sibling path from the directory component. Swapping the
    # basename in with str.replace() would also rewrite any earlier path
    # segment that happens to contain the same text.
    flabel = os.path.join(os.path.dirname(fname), 'Cycle25_hand_classifications.txt')
    with open(flabel, 'r') as fobj:
        lines = fobj.readlines()
    print(lines)

In [3]:
# Resolve the corpus directories to absolute paths right away: a later cell
# changes the working directory, which would silently re-target these
# '../'-relative paths at a different location.
cy24 = path.abspath('../training_data/Cycle24/')
cy25 = path.abspath('../training_data/Cycle25/')

In [4]:
# Move up to the parent directory so the paths below can be built from
# os.getcwd(). The guard keeps the cell idempotent: once the working
# directory already contains training_data/ (the folder `fname` is built
# from), re-running the cell must not climb one more level.
if not path.isdir(path.join(os.getcwd(), 'training_data')):
    os.chdir(path.abspath(path.join(os.getcwd(),"..")))
print(os.getcwd())

fname = os.getcwd()+'/training_data/Cycle25/0001_training.txt'

/Users/tking/Documents/PACMan_private


In [5]:

# Print the raw lines of the hand-classification file stored next to `fname`.
read_category_label(fname)

['proposal_num,hand_classification\n', '0001,stellar physics and stellar types\n', '0002,galaxies\n', '0003,stellar physics and stellar types\n', '0004,stellar physics and stellar types\n', '0005,stellar populations and the interstellar medium\n', '0006,stellar populations and the interstellar medium\n', '0007,galaxies\n', '0008,stellar physics and stellar types\n', '0009,intergalactic medium and the circumgalactic medium\n', '0010,large scale structure of the universe\n', '0011,stellar physics and stellar types\n', '0012,galaxies\n', '0013,stellar populations and the interstellar medium\n', '0014,stellar populations and the interstellar medium\n', '0015,large scale structure of the universe\n', '0016,intergalactic medium and the circumgalactic medium\n', '0017,supermassive black holes and active galaxies\n', '0018,null\n', '0019,galaxies\n', '0020,galaxies\n', '0021,stellar physics and stellar types\n', '0022,stellar physics and stellar types\n', '0023,galaxies\n', '0024,null\n', '002

In [6]:
# Create the project tokenizer and point it at the stop-word list that lives
# under utils/ in the current working directory.
pacman = PACManTokenizer()
pacman.get_stop_words(fname=f"{os.getcwd()}/utils/stopwords.txt")

In [7]:
# Tokenize a single proposal file; the call returns the raw text, the cleaned
# text and the tokens.
# NOTE(review): N=20 presumably limits how many tokens are plotted — confirm
# against PACManTokenizer.run_tokenization.
text, cleaned_text, tokens = pacman.run_tokenization(fname=fname, N=20, plot=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [8]:
# Show the first 650 characters of the raw proposal text.
print(text[:650])

The Hubble Space Telescope (HST) has been instrumental in elucidating the nature of the intriguing
superluminous supernovae (SLSNe) explosions by providing unparalleled observations of the progenitor stars,
supernova imposters such as "Luminous Blue Variables" (LBVs) and their host galaxy properties. Furthermore,
HST has directly imaged one of the earliest SLSN discovered, SN 2006gy, more than two years after the
explosion. Now, more than a decade since the first modern discovery of SLSNe and with more than a hundred
members of the class observed, the question on the explosion and energy input mechanism of these
unprecedented events still div


In [9]:
# Show the first 500 characters of the cleaned text for comparison.
print(cleaned_text[:500])

hubble space telescope hst instrumental elucidate nature intriguing superluminous supernova slsne explosion provide unparalleled observation progenitor star supernova imposter luminous blue variable lbvs host galaxy property furthermore hst directly image early slsn discover sn year explosion decade modern discovery slsne member class observe question explosion energy input mechanism unprecedented event divide supernova massive stellar evolution theorist bring team transient supernova observer t


In [10]:
# glob returns paths in arbitrary, filesystem-dependent order; sort both lists
# so the file order (and anything downstream that depends on it) is the same
# on every machine and every run.
flist_text = sorted(glob.glob(f"{cy25}/training_corpus/*training.txt"))
flist_label = sorted(glob.glob(f"{cy25}/training_corpus/*_Scientific_Category.txt"))

In [12]:
# NOTE(review): the recorded output of this cell is
# "AttributeError: module 'pacman_classes' has no attribute 'read_in_dataset'",
# so every later cell that uses `train_df` cannot run from a fresh kernel.
# The other loader calls in this notebook pass flist_text=/flist_label= rather
# than flist=/parallel= — confirm the loader's current name and signature.
train_df, data = pacman_classes.read_in_dataset(flist=flist_text, parallel=False)

AttributeError: module 'pacman_classes' has no attribute 'read_in_dataset'

In [None]:
# Preview the first five rows of the training table.
train_df.head()

In [None]:
# Count how many rows of the training table fall into each category and draw
# the counts as a horizontal bar chart.
categories = train_df['category'].value_counts()
fig, ax = plt.subplots(nrows=1, ncols=1)
categories.plot.barh(ax=ax)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(xlabel='Number of rows', ylabel='Category',
       title='Training set size per category')
fig.tight_layout();

In [None]:
def create_balanced_subset(df, categories=(), n_per_category=150):
    """Collect the leading rows of `df` for each requested category.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'category'`` column.
    categories : iterable, optional
        Category values to extract. Defaults to an empty tuple (an immutable
        default, so no state is shared between calls), which yields ``{}``.
    n_per_category : int, optional
        Maximum number of rows kept per category (default 150).

    Returns
    -------
    dict
        Maps each category to the first `n_per_category` rows of `df` whose
        ``'category'`` equals it. A category with fewer rows contributes all
        of them, so the subsets are only equal in size when every category
        has at least `n_per_category` rows.
    """
    return {
        category: df[df['category'] == category].iloc[:n_per_category, :]
        for category in categories
    }

In [None]:
# Summarise the column dtypes and non-null counts of the training table.
train_df.info()

In [None]:
# Build one subset per distinct category; np.unique returns the distinct
# category values in sorted order.
subsets = create_balanced_subset(train_df, categories=np.unique(train_df['category']))

In [None]:
# Print the size of each per-category subset. A plain loop is used instead of
# a list comprehension so the cell does not also display a list of None values.
for subset in subsets.values():
    print(len(subset))

In [None]:
# Inspect the (codes, uniques) pair that factorize() produces for the
# category column.
train_df['category'].factorize()

In [None]:
# Store each row's integer category code; factorize() numbers the categories
# in order of first appearance.
train_df['category_id'] = train_df['category'].factorize()[0]

In [None]:
# Check the last rows, now including the new category_id column.
train_df.tail()

In [None]:
# Two-column view of the training table used to build the lookup tables.
category_id_df_train = train_df.loc[:, ['category', 'category_id']]
# Each row of the underlying array is consumed by dict() as a (key, value) pair.
category_to_id_train = dict(category_id_df_train.to_numpy())
id_to_category_train = dict(
    category_id_df_train.loc[:, ['category_id', 'category']].to_numpy()
)


In [None]:
# NOTE(review): this cell used to rename entries of `id_to_category_test`, but
# that mapping is only created further down, after the Cycle 24 test set is
# loaded, so on a fresh kernel (Restart & Run All) it raised NameError.
# The identical renaming is applied in its own cell right after
# `id_to_category_test` is built, so the premature copy was removed here.

In [None]:
# Display the id -> category-name lookup built from the training table.
id_to_category_train

In [None]:
# TF-IDF features over unigrams and bigrams, capped at 10,000 terms, using
# scikit-learn's built-in English stop-word list and L2-normalised rows.
tfidf_vect = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    stop_words='english',
    norm='l2',
    use_idf=True,
)

In [None]:
# Raw term counts, capped at 10,000 terms, tokenized with the project's
# `spacy_tokenizer` instead of scikit-learn's default tokenizer.
count_vect = CountVectorizer(
    tokenizer=pacman_classes.spacy_tokenizer,
    max_features=10000,
)

In [None]:
# Hold out 20% of the training table. The fixed random_state makes the split
# (and every number derived from it) identical on each run.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    train_df['text'],
    train_df['category_id'],
    test_size=0.2,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Learn the count vectorizer's vocabulary from the training split only.
count_vect = count_vect.fit(x_train)

In [None]:
# Fit the TF-IDF vectorizer on the training split and keep the resulting
# document-term matrix for inspection.
tfidf_vectorizer_vectors=tfidf_vect.fit_transform(x_train)

In [None]:
# TF-IDF row of the first training document.
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

# place tf-idf values in a pandas data frame, one row per vocabulary term
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

In [None]:
# Fit the encoder once, on every label present in either split, and only
# *transform* each split with it. Calling fit_transform() separately on the
# test labels re-learns the mapping from the test split alone, so the same
# integer could stand for different classes in y_train and y_test whenever a
# class is absent from one of them.
Encoder = LabelEncoder()
Encoder.fit(np.concatenate([y_train, y_test]))
y_train = Encoder.transform(y_train)
y_test = Encoder.transform(y_test)

In [None]:
# Multinomial naive Bayes (alpha=0.05) on top of the TF-IDF vectorizer.
nb_tfidf = Pipeline(steps=[
    ('vect', tfidf_vect),
    ('clf', MultinomialNB(alpha=0.05)),
])


In [None]:
# Multinomial naive Bayes with default settings on top of the count vectorizer.
nb_count = Pipeline(steps=[
    ('vect', count_vect),
    ('clf', MultinomialNB()),
])

In [None]:
# Train on the full training table (not just x_train); Pipeline.fit re-fits
# the vectorizer step on this data before fitting the classifier.
nb_tfidf.fit(train_df['text'], train_df['category_id'])

In [None]:
# Same for the count-based model: fit vectorizer and classifier on the full
# training table.
nb_count.fit(train_df['text'], train_df['category_id'])

In [None]:
# Collect the Cycle 24 text and label files and load them as the test set.
# NOTE(review): this call uses flist_text=/flist_label=/notebook=, whereas the
# training-set call earlier in the notebook passes flist=/parallel= (and its
# recorded output is an AttributeError) — confirm the loader's current name
# and signature.
flist_text_test = glob.glob(f"{cy24}/training_corpus/*training.txt")
flist_label_test = glob.glob(f"{cy24}/training_corpus/*_Scientific_Category.txt")
test_df, data = pacman_classes.read_in_dataset(flist_text=flist_text_test, flist_label=flist_label_test, notebook=True)

In [None]:
# Inspect the (codes, uniques) pair that factorize() produces for the test
# categories.
test_df['category'].factorize()

In [None]:
# Give every test row an integer category code and build the two lookup
# tables (name -> id and id -> name) for the test set.
# NOTE(review): these ids come from a factorize() call on the test table
# alone, independent of the one that produced train_df['category_id']. The
# models predict ids in the training numbering, so the metrics computed
# against test_df['category_id'] are only meaningful if both tables happen to
# encounter their categories in the same order — confirm, or derive the test
# ids from the training lookup instead.
test_df['category_id'] = test_df['category'].factorize()[0]
category_id_df_test = test_df[['category','category_id']]
category_to_id_test = dict(category_id_df_test.values)
id_to_category_test = dict(category_id_df_test[['category_id', 'category']].values)

In [None]:
# Display the id -> category-name lookup built from the test table.
id_to_category_test

In [None]:
# Overwrite the display names of the test categories, in id order: id 0 gets
# a fixed name, ids 1 and 2 keep their names in lower case, and ids 3-6 get
# fixed names.
id_to_category_test[0] = 'stellar populations and the ism'
for cat_id in (1, 2):
    id_to_category_test[cat_id] = id_to_category_test[cat_id].lower()
id_to_category_test.update({
    3: 'planets and planet formation',
    4: 'galaxies and the igm',
    5: 'large scale structure of the universe',
    6: 'supermassive black holes and active galaxies',
})

In [None]:
# Predict a category id for every test proposal with the TF-IDF model.
predictions = nb_tfidf.predict(test_df['text'])

In [None]:
# Fraction of test rows whose predicted id equals the stored category_id
# (TF-IDF model).
accuracy_score(test_df['category_id'], predictions)

In [None]:
# Predict a category id for every test proposal with the count-based model.
predictions_count = nb_count.predict(test_df['text'])

In [None]:
# Fraction of test rows whose predicted id equals the stored category_id
# (count-based model).
accuracy_score(test_df['category_id'], predictions_count)

In [None]:
# Confusion matrix for the TF-IDF model: rows are the stored ids, columns the
# predicted ids.
confusion_mat = confusion_matrix(test_df['category_id'], predictions)

In [None]:
# Confusion matrix for the count-based model: rows are the stored ids,
# columns the predicted ids.
confusion_mat_count = confusion_matrix(test_df['category_id'], predictions_count)

In [None]:
# Show the count-based model's confusion matrix as plain text.
print(confusion_mat_count)

In [None]:
# Per-category precision, recall and F1 for the count-based model, labelled
# with the test-set category names.
print(classification_report(
    test_df['category_id'],
    predictions_count,
    target_names=list(id_to_category_test.values()),
))

In [None]:
# Per-category precision, recall and F1 for the TF-IDF model, labelled with
# the test-set category names.
print(classification_report(
    test_df['category_id'],
    predictions,
    target_names=list(id_to_category_test.values()),
))

### Cycle 25 testing using the UAT categories

In [None]:
# Load the Cycle 25 classification table; its 'classification' column is used
# in the cells below.
# NOTE(review): the path is relative to the current working directory, which
# an earlier cell changes with os.chdir — confirm it still resolves.
proposal_classifications = pd.read_csv('../cycle_25_classifications.txt')

Parse each filename to extract its proposal number, then sort the files by that number.

In [None]:
# The proposal number is the integer before the first underscore of each
# file's basename.
proposal_numbers = [int(val.split('/')[-1].split('_')[0]) for val in flist_text]
# Pair every path with its number and order the pairs numerically.
flist_num = sorted(zip(flist_text, proposal_numbers), key=lambda pair: pair[1])
# Split the ordered pairs back into two parallel tuples.
flist_sorted, proposal_num = list(zip(*flist_num))

In [None]:
# Rows of the classification table whose 'classification' value is missing.
hand_classified_null = proposal_classifications.loc[
    proposal_classifications['classification'].isna()
]

In [None]:
# Summarise the column dtypes and non-null counts of the classification table.
proposal_classifications.info()

In [None]:
# Differences between consecutive (sorted) proposal numbers; a difference
# larger than 1 marks a gap in the sequence.
a = np.ediff1d(proposal_num)
idx = list(map(int, np.where(a>1)[0]))
# Report every number inside each gap, not only the first one: a jump from
# 5 to 9 is missing 6, 7 and 8. Numbers missing before the first or after
# the last file cannot be detected this way.
missing_proposals = [
    missing
    for val in idx
    for missing in range(proposal_num[val] + 1, proposal_num[val + 1])
]

In [None]:
# Proposal numbers for which no text file was found.
missing_proposals

In [None]:
# Rows of the classification table that have no classification, for
# comparison with the missing proposal numbers.
hand_classified_null

In [None]:
# Add an 'fname' column filled with NaN; the scalar is broadcast to every row.
proposal_classifications['fname'] = np.nan

In [None]:
# Preview the table with the new, still empty, 'fname' column.
proposal_classifications.head()

In [None]:
# Write each text-file path into the row for its proposal number. A single
# .loc[row, column] call assigns into the DataFrame itself; the chained form
# df['fname'].loc[...] = ... may assign into a temporary copy and be lost.
# NOTE(review): using `num-1` as the row label assumes a default 0-based index
# with one row per proposal number, in order — confirm against the CSV.
for num, fname in zip(proposal_num, flist_sorted):
    proposal_classifications.loc[num-1, 'fname'] = fname

In [None]:
# Inspect the (codes, uniques) pair that factorize() produces for the
# classification column.
proposal_classifications['classification'].factorize()

In [None]:
# Reload the Cycle 25 data from the numerically sorted file list.
# NOTE(review): this overwrites the `df` and `data` names assigned by earlier
# cells, and it calls the same `pacman_classes.read_in_dataset` that an
# earlier cell's recorded output reports as missing — confirm the loader name
# and signature.
df, data = pacman_classes.read_in_dataset(flist_label=flist_label, flist_text=flist_sorted)