In [None]:
%matplotlib widget
from collections import defaultdict
import glob
import sys
sys.path.append('/Users/nmiles/PACMan_dist/')


from joblib import dump, load
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
import pacman2020
from utils import tokenizer


from sklearn.datasets import make_classification
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

In [None]:
# Build the PACMan training helper for proposal cycles 24 and 25.
train_pacman = pacman2020.PACManTrain(cycles_to_analyze=[24, 25])

In [None]:
# Read the training data serially (parallel=False). Presumably this fills
# `train_pacman.proposal_data`, which later cells index by 'cycle_NN' -- TODO confirm.
train_pacman.read_training_data(parallel=False)

In [None]:
# Spot-check the hand-assigned label of the first cycle 25 proposal.
print(train_pacman.proposal_data['cycle_25'].iloc[0]['hand_classification'])


In [None]:
# Display the cycle 25 proposal table.
train_pacman.proposal_data['cycle_25']

In [None]:
# Fit the model on the cycle 25 proposals.
train_pacman.fit_model(train_pacman.proposal_data['cycle_25'])

In [None]:
# Apply the model to the cycle 24 proposals.
# NOTE(review): `training=True` presumably means hand labels are available for
# comparison -- confirm against pacman2020.
train_pacman.apply_model(train_pacman.proposal_data['cycle_24'], training=True)

In [None]:
# Display the cycle 24 proposal table.
train_pacman.proposal_data['cycle_24']

In [None]:
# Cleaned text of the first cycle 24 proposal (the input passed to the model below).
train_pacman.proposal_data['cycle_24']['cleaned_text'].iloc[0]

In [None]:
# Predict encoded class labels and per-class probabilities from the cycle 24 cleaned text.
pred = train_pacman.model.predict(train_pacman.proposal_data['cycle_24']['cleaned_text'])
pred_prob = train_pacman.model.predict_proba(train_pacman.proposal_data['cycle_24']['cleaned_text'])

In [None]:
# Class probabilities for the first cycle 24 proposal.
pred_prob[0]

In [None]:
# Store the encoded predictions as a new column of the cycle 24 table.
train_pacman.proposal_data['cycle_24']['encoded_pred_classification'] = pred

In [None]:
# Decode the predictions with `train_pacman.encoder` and store them as well.
train_pacman.proposal_data['cycle_24']['pred_classification'] = train_pacman.encoder.inverse_transform(pred)

In [None]:
# Display the cycle 24 table including the two new prediction columns.
train_pacman.proposal_data['cycle_24']

In [None]:
# NOTE(review): `model_results` is presumably produced by `apply_model` -- confirm in pacman2020.
train_pacman.model_results

In [None]:
# Per-class precision/recall/F1 of the model labels against the hand labels.
print(
    classification_report(
        train_pacman.model_results['encoded_hand_classification'],
        train_pacman.model_results['encoded_model_classification'],
    )
)

In [None]:
# Write the model results to 'pacman_results_cycle24.txt'.
train_pacman.save_model_results(fout='pacman_results_cycle24.txt', training=True)

In [None]:
# Create the analysis helper and hand it the encoder held by the training object.
analyze_pacman = pacman2020.PACManAnalyze()
analyze_pacman.encoder = train_pacman.encoder

In [None]:
# NOTE(review): presumably fills `analyze_pacman.computed_accuracy`, which the
# following cells read -- confirm in pacman2020.
analyze_pacman.compute_accuracy_measurements(df=train_pacman.model_results)

In [None]:
# Preview the first rows of the accuracy table.
analyze_pacman.computed_accuracy.head()

In [None]:
# Plot the accuracy table with the helper's horizontal-bar plotting method.
analyze_pacman.plot_barh(df=analyze_pacman.computed_accuracy)

In [None]:
# Print every row of the accuracy table rescaled so that its entries sum to 1.
# The division creates a new Series instead of using `row /= ...`: a row
# yielded by `iterrows()` may be a view of the frame, so an in-place update
# could silently overwrite `computed_accuracy` before later cells aggregate it.
for _, row in analyze_pacman.computed_accuracy.iterrows():
    print(row / row.sum())

In [None]:
# Column-wise totals of the accuracy table (Python's built-in `sum` applied to each column).
cumulative = analyze_pacman.computed_accuracy.apply(sum, axis=0)

In [None]:
# Each column's share of the grand total.
cumulative/cumulative.sum()

In [None]:
# Save the model under 'pacman_production_model.joblib'.
train_pacman.save_model(fname='pacman_production_model.joblib')

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split

In [None]:
# 5-fold cross-validation on the cycle 24 proposals alone, scored with 'f1_macro'.
scores = cross_val_score(
    train_pacman.model, train_pacman.proposal_data['cycle_24']['cleaned_text'], train_pacman.proposal_data['cycle_24']['encoded_hand_classification'], cv=5, scoring='f1_macro')

In [None]:
# One score per fold.
print(scores)

In [None]:
def combine_proposals(pman):
    """Stack the cycle 25 and cycle 24 proposal tables into one DataFrame.

    Parameters
    ----------
    pman : object
        Anything exposing a ``proposal_data`` mapping that holds DataFrames
        under the keys ``'cycle_24'`` and ``'cycle_25'``.

    Returns
    -------
    pandas.DataFrame
        The cycle 25 rows followed by the cycle 24 rows. Columns are the
        sorted union of both inputs (cells missing from one input are NaN).
        The original row labels are kept, so the index may contain
        duplicates.
    """
    cycle_frames = [pman.proposal_data['cycle_25'], pman.proposal_data['cycle_24']]
    # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
    # supported equivalent and works on older releases too.
    return pd.concat(cycle_frames, sort=True)

In [None]:
# Compare the columns of the two cycles before combining them.
train_pacman.proposal_data['cycle_24'].columns

In [None]:
train_pacman.proposal_data['cycle_25'].columns

In [None]:
# Preview the first cycle 24 rows.
train_pacman.proposal_data['cycle_24'].head()

In [None]:
# Combined table: cycle 25 rows followed by cycle 24 rows.
total_dataset = combine_proposals(train_pacman)

In [None]:
# Dtype and non-null count of the encoded hand labels in the combined table.
total_dataset.loc[:,['encoded_hand_classification']].info()

In [None]:
# Distinct hand-classification labels present in the combined table.
total_dataset['hand_classification'].unique()

In [None]:
# Column-by-column summary of the combined table.
total_dataset.info()

In [None]:
# Keep only the proposals hand-labelled 'solar system'.
solar_system_df = total_dataset[total_dataset['hand_classification'] == 'solar system']


In [None]:
# Row positions 30 through 59 (inclusive).
indices = list(range(30, 60))

In [None]:
# Show the selected row positions.
indices

In [None]:
# Number of 'solar system' rows picked out by those positions.
len(solar_system_df.iloc[indices])

In [None]:
# Preview the 'solar system' subset. The cell previously ended in a dangling
# attribute access, which is a SyntaxError and stops "Run All".
solar_system_df.head()

In [None]:
def get_balanced_subset(df, proposal_counts, n_per_class=None, random_state=None):
    """Draw the same number of distinct rows from every hand classification.

    Rows are sampled without replacement, so no proposal appears twice in the
    result. Duplicated rows would otherwise end up in both the training and
    the validation folds of a later cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'hand_classification'`` column.
    proposal_counts : pandas.Series
        Number of proposals per class, e.g.
        ``df['hand_classification'].value_counts()``. Its minimum is the
        default per-class sample size.
    n_per_class : int, optional
        Rows to draw from each class. Defaults to ``proposal_counts.min()``.
    random_state : int, optional
        Seed that makes the draw reproducible.

    Returns
    -------
    pandas.DataFrame
        ``n_per_class`` rows for each class, with classes in order of first
        appearance in ``df`` and the original row labels preserved.

    Raises
    ------
    ValueError
        If a class holds fewer than ``n_per_class`` rows.
    """
    if df.empty:
        return df.iloc[0:0]
    if n_per_class is None:
        n_per_class = int(proposal_counts.min())

    # A single generator shared by all classes keeps the per-class draws
    # independent while the whole result stays reproducible from one seed.
    rng = np.random.RandomState(random_state)
    per_class = [
        class_df.sample(n=n_per_class, replace=False, random_state=rng)
        for _, class_df in df.groupby('hand_classification', sort=False)
    ]
    if not per_class:
        return df.iloc[0:0]
    # One concat instead of repeated `DataFrame.append` calls, which were
    # removed in pandas 2.0.
    return pd.concat(per_class)

In [None]:
# The per-class counts are computed here from the combined dataset rather than
# taken from `a`, which is only assigned in a later cell and is therefore
# undefined when the notebook runs top to bottom.
balanced_df = get_balanced_subset(
    total_dataset,
    total_dataset['hand_classification'].value_counts(),
)

In [None]:
# Display the balanced subset.
balanced_df

In [None]:
# Rows per hand classification in the balanced subset.
a = balanced_df['hand_classification'].value_counts()

In [None]:
# Total number of rows in the balanced subset.
a.sum()

In [None]:
# Fraction of the balanced subset that belongs to each class.
weights = a/a.sum()

In [None]:
weights.values

In [None]:
# 4-fold cross-validation over the combined cycle 24 + 25 proposals,
# scored with 'f1_macro'.
scorestotal = cross_val_score(
    estimator=train_pacman.model,
    X=total_dataset['cleaned_text'],
    y=total_dataset['encoded_hand_classification'],
    cv=4,
    scoring='f1_macro',
)
 

In [None]:
# Per-fold scores for the combined dataset.
scorestotal

In [None]:
# Same 4-fold 'f1_macro' cross-validation, restricted to the class-balanced subset.
scoresbalanced = cross_val_score(
    estimator=train_pacman.model,
    X=balanced_df['cleaned_text'],
    y=balanced_df['encoded_hand_classification'],
    cv=4,
    scoring='f1_macro',
)

In [None]:
# Per-fold scores for the balanced subset.
scoresbalanced

In [None]:
# Mean +/- standard deviation across folds, combined dataset.
print(f"{scorestotal.mean():.0%} +/- {scorestotal.std():.1%}")

In [None]:
# Mean +/- standard deviation across folds, balanced subset.
print(f"{scoresbalanced.mean():.0%} +/- {scoresbalanced.std():.1%}")

In [None]:
# Hold out 15% of the combined dataset for testing; the split is seeded (random_state=42).
X_train, X_test, y_train, y_test = train_test_split(total_dataset['cleaned_text'], total_dataset['encoded_hand_classification'], test_size=0.15, random_state=42)

In [None]:
# Fraction of the combined dataset that ended up in the training split.
len(y_train)/ len(total_dataset['cleaned_text'])

In [None]:
# Re-assemble the training split into a frame with the two columns used here.
train_df = pd.DataFrame()
train_df['cleaned_text'] = X_train
train_df['encoded_hand_classification'] = y_train

In [None]:
# Same layout for the held-out split.
test_df = pd.DataFrame()
test_df['cleaned_text'] = X_test
test_df['encoded_hand_classification'] = y_test

In [None]:
# Refit on the training split. `train_pacman.model` is read again right below,
# so this presumably replaces the model fitted on cycle 25 -- TODO confirm.
train_pacman.fit_model(train_df)

In [None]:
test_df

In [None]:
# Predict encoded labels for the held-out proposals (rebinds `pred` from the cycle 24 cell).
pred = train_pacman.model.predict(test_df['cleaned_text'])

In [None]:
# Per-class precision/recall/F1 on the held-out split.
print(classification_report(test_df['encoded_hand_classification'], pred))

In [None]:
# Number of held-out predictions.
len(pred)

In [None]:
# Collect, for every cycle 24 proposal, its file name and labels plus one
# '<class>_prob' entry per class taken from `pred_prob`.
data_out = defaultdict(list)
label_columns = (
    'fname',
    'encoded_pred_classification',
    'pred_classification',
    'hand_classification',
    'encoded_hand_classification',
)
# `pred_prob` is indexed by row position, so the loop counts positions with
# `enumerate` instead of reusing the DataFrame index label, which only matches
# the position for a default RangeIndex.
for position, (_, row) in enumerate(train_pacman.proposal_data['cycle_24'].iterrows()):
    for column in label_columns:
        data_out[column].append(row[column])
    for j, class_prob in enumerate(pred_prob[position]):
        data_out[f"{train_pacman.encoder.classes_[j].replace(' ','_')}_prob"].append(class_prob)

Testing ~sample_weight~ OneVsRest

In [None]:
# TF-IDF features over unigrams and bigrams, capped at 10,000 terms, L2-normalised.
vect = TfidfVectorizer(
                max_features=10000,
                use_idf=True,
                norm='l2',
                ngram_range=(1, 2)
            )

In [None]:
# Multinomial naive Bayes classifier with smoothing parameter alpha=0.05.
clf = MultinomialNB(alpha=0.05)

In [None]:
# Fit the vectoriser on the cycle 25 text and transform it. `train_pacman` is
# the object that holds the proposals; no `pman` is defined in this notebook.
input_tfidf25 = vect.fit_transform(train_pacman.proposal_data['cycle_25']['cleaned_text'])

In [None]:
# Transform the cycle 24 text with the already-fitted vectoriser. `train_pacman`
# is the object that holds the proposals; no `pman` is defined in this notebook.
input_tfidf24 = vect.transform(train_pacman.proposal_data['cycle_24']['cleaned_text'])

In [None]:
# Train on the cycle 25 proposals (rebinds `train_df` from the hold-out split
# above). `train_pacman` holds the proposals; no `pman` is defined in this notebook.
train_df = train_pacman.proposal_data['cycle_25']

In [None]:
# Fit the classifier on the cycle 25 TF-IDF features and encoded hand labels.
clf.fit(input_tfidf25, train_df['encoded_hand_classification'])

In [None]:
# Predict encoded labels for the cycle 24 features (rebinds `pred`).
pred = clf.predict(input_tfidf24)

In [None]:
# Fraction of cycle 24 proposals whose prediction matches the hand label.
# `train_pacman` holds the proposals; no `pman` is defined in this notebook.
sum(pred == train_pacman.proposal_data['cycle_24']['encoded_hand_classification'])/len(pred)

In [None]:
# Per-class probabilities for the cycle 24 features (rebinds `pred_prob`).
pred_prob = clf.predict_proba(input_tfidf24)

In [None]:
# Class probabilities for the first proposal.
pred_prob[0]

In [None]:
# Sum of the class probabilities for the second proposal.
np.sum(pred_prob[1])

In [None]:
# Sum of the exponentiated feature log-probabilities for class index 2;
# presumably a check that they add up to ~1 -- TODO confirm intent.
sum(np.exp(clf.feature_log_prob_[2]))