In [None]:
# native python
import os
import sys
from os import path
# Remember the launch directory only once: the kernel keeps `cwd` alive between
# cell executions, so re-running this cell no longer climbs one directory
# higher on every run.
if "cwd" not in globals():
    cwd = os.getcwd()
# Work from the parent of the launch directory.
# NOTE(review): presumably this is the repository root, so that the project
# packages imported below and relative data paths resolve — confirm.
os.chdir(path.abspath(path.join(cwd, "..")))

# open source packages
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split

# custom packages that are all in the github repo
from pacman_classes import PACManTrain, PACManPipeline
from utils.proposal_scraper import ProposalScraper
from utils.analyzer import PACManAnalyze

### 1. Proposal Scraping
We use the `ProposalScraper` class contained in the `proposal_scraper` module in the `utils` subpackage. We specify that we are scraping the proposals with the intention of using them for training and that we only want to scrape proposals in Cycles 24 and 25.
- By setting `for_training=True`, the software automatically looks for a file containing the hand classifications for the list of proposals and saves the scraped proposal information in a subdirectory of `~/PACMan_dist/training_data/`. For Cycle 24, for example, the subdirectory will be named `training_corpus_cy24` and it will contain all of the training data for that cycle, as well as the file containing the hand classifications.
- For the hand classifications, we adopt the following naming convention: cycle_CYCLENUMBER_hand_classifications.txt
   - e.g. cycle_24_hand_classifications.txt contains the hand classification of each proposal for cycle 24.
- Additionally, the file should only contain two columns, `proposal_num` and `hand_classification`. Below is an example snippet of what the file should look like:
    
```console

proposal_num,hand_classification
0001,stellar physics
0002,stellar physics
        .
        . 
        .
```


In [None]:
# Configure a scraper in training mode for cycles 24 and 25, then scrape them.
pacman_scraper = ProposalScraper(
    cycles_to_analyze=[24, 25],
    for_training=True,
)
pacman_scraper.scrape_cycles()

### 2. Text Preprocessing (it could be a while... )
The `PACManTrain` class contained in the `pacman_classes` module is capable of performing all of the necessary preprocessing steps. Just like before, we specify the cycles we want to analyze, in this case cycles 24 and 25.

In summary, this step processes each input proposal with the `spaCy` NLP package to generate a `Doc` object, which is a sequence of tokens. Each token is an individual word that contains a variety of semantic information derived from the word and its context in a sentence. We leverage this information to filter out stop words, punctuation, etc. This is the slowest step of the entire process and, if needed, it can be sped up using the multithreading behavior of `spaCy`.

The text preprocessing step takes about 11 minutes per cycle.

In [None]:
# Build the training helper for cycles 24 and 25 and load their proposals.
# NOTE(review): parallel=False presumably selects serial preprocessing — confirm.
pacman_training = PACManTrain(
    cycles_to_analyze=[24, 25],
)
pacman_training.read_training_data(
    parallel=False,
)

For each proposal cycle in the `cycles_to_analyze` argument, the tokenizer will perform the necessary preprocessing steps and save the proposal number, text, cleaned text, filename, the hand classified science category, and the encoded value of the hand classified category. The results are stored in a pandas DataFrame in the `PACManTrain.proposal_data` attribute.

In [None]:
# List the cycles for which proposal data was loaded.
print('Found proposal information for:\n'+'\n'.join(pacman_training.proposal_data.keys())+'\n')

# Print a structural summary (DataFrame.info) of the DataFrame for every cycle.
for key, cycle_df in pacman_training.proposal_data.items():
    print(f"Displaying some information for {key}...")
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None" line.
    cycle_df.info()
    print('-'*58)

Let's examine the first proposal in the Cycle 24 DataFrame

In [None]:
# Take the first row of the Cycle 24 table and preview its contents; the raw
# and cleaned text are truncated to their first 100 characters.
first_row = pacman_training.proposal_data['Cycle24'].iloc[0]
summary_parts = [
    f"HST Cycle 24 proposal number: {first_row['proposal_num']}\n",
    f"Hand Classification: {first_row['hand_classification']}\n",
    f"Raw Text:\n{first_row['text'][:100]}...\n",
    f"Cleaned Text:\n{first_row['cleaned_text'][:100]}...\n",
]
msg = "".join(summary_parts)
print(msg)

#### Side note: pandas is cool.

We can use the resulting DataFrame to quickly examine the distribution of proposal categories for each cycle.

In [None]:
# Draw one horizontal bar chart per loaded cycle showing how many proposals
# fall into each hand-classified category.
cycle_frames = pacman_training.proposal_data
n_cycles = len(cycle_frames)
# Size the grid from the number of cycles instead of hard-coding two rows;
# squeeze=False keeps `axes` two-dimensional even when only one cycle is loaded.
fig, axes = plt.subplots(
    nrows=n_cycles,
    ncols=1,
    figsize=(5, 3.5 * n_cycles),  # 3.5 in per row reproduces the original (5, 7) for two cycles
    gridspec_kw={'hspace': 0.3},
    squeeze=False,
)
for ax, (key, cycle_df) in zip(axes[:, 0], cycle_frames.items()):
    # Sort by category name so the bars appear in the same order in every panel.
    proposal_categories = cycle_df['hand_classification'].value_counts().sort_index()
    proposal_categories.plot.barh(label=key, ax=ax)
    ax.set_title(key)
    ax.set_xlabel('Number of proposals')

### 3. Training

Now that we have all the proposal information loaded, we can train a classifier. When no model or vectorizer is specified, the software will use the default classifier (Multinomial Naive Bayes) and the default vectorizer (term frequency-inverse document frequency TFIDF). In theory, you can pass any combination of vectorizer and classifier that you want!

We train on cycle 25 and test on cycle 24 because the original work was evaluated on cycle 24 data and these are the only two proposal cycles we have that have been hand classified.

In [None]:
# Fit the model on the Cycle 25 proposals; no model or vectorizer is passed
# here, so the package defaults are used.
pacman_training.fit_model(pacman_training.proposal_data["Cycle25"])

In [None]:
# Show the fitted model object stored on the training helper.
print(pacman_training.model)

### 4. Testing 
Finally, we evaluate the performance of the model we just trained. To do so, we use it to make predictions on a completely different proposal cycle that has also been hand classified. We compare the predictions to the hand classifications and voila.

In [None]:
# Run the fitted model on the Cycle 24 proposals; the predictions are read
# back from `pacman_training.model_results` below.
pacman_training.apply_model(
    df=pacman_training.proposal_data["Cycle24"],
    training=True,
)

# Compare the encoded hand classifications with the encoded model predictions.
print("scikit-learn classification report")
model_results = pacman_training.model_results
report = classification_report(
    y_true=model_results['encoded_hand_classification'],
    y_pred=model_results['encoded_model_classification'],
    target_names=pacman_training.encoder.classes_,
)
print(report)

Finally, we use the analysis class to compute our customized accuracy to allow for a comparison with the previous package.

In [None]:
# Create the analysis helper and hand it the encoder used during training.
# NOTE(review): presumably so encoded classes map back to the same category
# names — confirm against PACManAnalyze.
pacman_analyzing = PACManAnalyze()
pacman_analyzing.encoder = pacman_training.encoder

In [None]:
# Compute the custom accuracy measurements from the model results; the cells
# below read them from `pacman_analyzing.computed_accuracy`.
pacman_analyzing.compute_accuracy_measurements(df=pacman_training.model_results, normalize=True)

In [None]:
# Ratio of the summed 'top' column to the sum over every column of the
# accuracy table, printed as a whole-number percentage.
top_total = pacman_analyzing.computed_accuracy['top'].sum()
grand_total = pacman_analyzing.computed_accuracy.sum().sum()
print(f"computed accuracy: {top_total/grand_total:.0%}")

In [None]:
# Record which cycle the analyzed results belong to.
# NOTE(review): presumably used by the plotting call that follows — confirm.
pacman_analyzing.cycle=24

In [None]:
# Scale the three accuracy columns by 100 and plot them as horizontal bars,
# passing 'test.png' as the output file name.
accuracy_columns = ['top', 'top_two', 'misclassified']
accuracy_percent = 100 * pacman_analyzing.computed_accuracy.loc[:, accuracy_columns]
pacman_analyzing.plot_barh(accuracy_percent, fout='test.png')

#### Saving the results and model

To provide a means of benchmarking various models, the classes have the functionality for saving the model results, as well as the trained model. By passing the `training=True` in the cell below, we are telling the code to save the results in the training subdirectory of the results directory. When `training=False` is passed, the results are written to the production directory. The intention here is to keep the results from training separate from the results when new proposals are analyzed. The path to each directory is given below:       
    
- ~/PACMan_dist/model_results/training/      
- ~/PACMan_dist/model_results/production/

In [None]:
# Save the model results under the given output file name.
# NOTE(review): `training=True` is commented out on the next line even though
# the accompanying text describes passing it — confirm which results directory
# (training vs. production) is intended.
pacman_training.save_model_results(fout='example_pacmaproposal_datacycle24.txt')#, training=True)
# Save the trained model under the given file name.
pacman_training.save_model(fname='example_pacman_model.joblib')