In [1]:
import pandas as pd
import numpy as np
import os
import nltk
# Raise the per-cell display width to 1000 characters so long text values
# are shown in full rather than truncated when a frame is rendered.
pd.set_option('display.max_colwidth', 1000)

In [2]:
def analyze(filename):
    """Load a predictions CSV and print per-template statistics.

    The file is read as a header-less CSV with exactly two columns, which
    are named ``pred`` (first column) and ``label`` (second column).

    For every distinct value in ``label``, most frequent first, one summary
    line is printed with:

    * ``count``               -- rows whose label is this template
    * ``correct``             -- of those rows, how many have pred == label
    * ``false_positives``     -- rows predicted as this template whose label
                                 is a different template
    * ``accuracy``            -- correct / count (per-label recall)
    * ``false_positive_rate`` -- false_positives / rows predicted as this
                                 template (i.e. 1 - precision, not
                                 FP / (FP + TN))

    Parameters
    ----------
    filename : str
        Path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with columns ``['pred', 'label']``.
    """
    # header=None means "the file has no header row". The previous value,
    # header=-1, is rejected with a ValueError by current pandas releases.
    df = pd.read_csv(filename, header=None)
    df.columns = ['pred', 'label']

    # Distribution of templates: value_counts() orders labels by frequency.
    for template_id in df['label'].value_counts().index:
        subset = df[df['label'] == template_id]
        correct = (subset['pred'] == subset['label']).sum()
        # subset is never empty: template_id comes from the label column itself.
        accuracy = correct / len(subset)

        false_positive_subset = df[df['pred'] == template_id]
        false_positives = (false_positive_subset['pred'] != false_positive_subset['label']).sum()
        # A template may never be predicted at all; max(..., 1) avoids 0 / 0.
        false_positive_rate = false_positives / max(len(false_positive_subset), 1)

        print("=" * 50)
        print("Template #{}: count={}, correct={}, false_positives={}, accuracy={}, false_positive_rate={}"
              .format(template_id, len(subset), correct, false_positives, accuracy, false_positive_rate))
        print("=" * 50)
    return df

In [3]:
# Per-template breakdown of the predictions saved for this run
# (the run's settings and scores are encoded in the file name).
df = analyze(filename="analysis/tmp7/expname=added embedding learning rate for toks embedding,input_dim=386,mem_dim=150,epochs=15,current_epoch=11,test_acc=0.858,loss=0.6856640524906495.csv")

Template #1: count=261, correct=224, false_positives=23, accuracy=0.8582375478927203, false_positive_rate=0.0931174089068826
Template #3: count=223, correct=199, false_positives=39, accuracy=0.8923766816143498, false_positive_rate=0.1638655462184874
Template #16: count=104, correct=89, false_positives=9, accuracy=0.8557692307692307, false_positive_rate=0.09183673469387756
Template #8: count=79, correct=72, false_positives=9, accuracy=0.9113924050632911, false_positive_rate=0.1111111111111111
Template #151: count=73, correct=71, false_positives=1, accuracy=0.9726027397260274, false_positive_rate=0.013888888888888888
Template #105: count=70, correct=58, false_positives=10, accuracy=0.8285714285714286, false_positive_rate=0.14705882352941177
Template #6: count=51, correct=35, false_positives=15, accuracy=0.6862745098039216, false_positive_rate=0.3
Template #15: count=40, correct=29, false_positives=9, accuracy=0.725, false_positive_rate=0.23684210526315788
Template #7: count=26, correct=2

In [10]:
template = 6
# Rows predicted as `template` whose true label is some other template;
# counting their labels shows which templates get mistaken for it.
predicted_as_template = df['pred'] == template
labelled_otherwise = df['label'] != template
df.loc[predicted_as_template & labelled_otherwise, 'label'].value_counts()

1      8
3      5
105    2
Name: label, dtype: int64

In [39]:
def _read_lines(path):
    """Return the lines of the text file at `path` without newline characters.

    Only the single empty element produced by a final newline is dropped,
    so a file that does not end with a newline keeps its last line
    (slicing with ``[:-1]`` unconditionally would silently discard it).
    """
    with open(path, 'r') as handle:
        lines = handle.read().split("\n")
    if lines[-1] == "":
        lines.pop()
    return lines


# Inputs: train split followed by test split, one entry per line.
X = (_read_lines('data/lc-quad/train/input.pos')
     + _read_lines('data/lc-quad/test/input.pos'))

# Outputs in the same train-then-test order.
# NOTE(review): presumably line i of output.txt is the label for line i of
# input.pos in the same split -- confirm against the data preparation step.
y = (_read_lines('data/lc-quad/train/output.txt')
     + _read_lines('data/lc-quad/test/output.txt'))

# Fail fast on misaligned files; otherwise a later column-wise concat would
# pad the shorter list with NaN and pair inputs with the wrong outputs.
assert len(X) == len(y), "input/output line counts differ: {} vs {}".format(len(X), len(y))

In [40]:
# Put the two parallel lists side by side as columns 'X' and 'y'.
# Going through pd.Series keeps index-based alignment if the lengths differ.
df = pd.concat([pd.Series(X), pd.Series(y)], axis=1, keys=['X', 'y'])

In [41]:
# Token frequencies over every X entry whose y value is the string '1'.
is_label_1 = df['y'] == '1'
label_1_lines = df.loc[is_label_1, 'X'].tolist()
words = " ".join(label_1_lines).split()
nltk.FreqDist(words)

FreqDist({"''": 3,
          ',': 56,
          '-LRB-': 50,
          '-RRB-': 50,
          '.': 1197,
          ':': 7,
          'CC': 27,
          'CD': 72,
          'DT': 949,
          'FW': 56,
          'IN': 1058,
          'JJ': 642,
          'JJR': 6,
          'JJS': 13,
          'LS': 6,
          'MD': 2,
          'NN': 3406,
          'NNP': 72,
          'NNPS': 2,
          'NNS': 649,
          'PDT': 11,
          'POS': 57,
          'PRP': 23,
          'PRP$': 24,
          'RB': 31,
          'RP': 1,
          'TO': 93,
          'VB': 124,
          'VBD': 240,
          'VBG': 34,
          'VBN': 290,
          'VBP': 248,
          'VBZ': 684,
          'WDT': 372,
          'WP': 566,
          'WP$': 94,
          'WRB': 100,
          '``': 2})

In [42]:
# Token frequencies over every X entry whose y value is the string '3'.
is_label_3 = df['y'] == '3'
label_3_lines = df.loc[is_label_3, 'X'].tolist()
words = " ".join(label_3_lines).split()
nltk.FreqDist(words)

FreqDist({"''": 3,
          ',': 114,
          '-LRB-': 7,
          '-RRB-': 7,
          '.': 1158,
          ':': 7,
          'CC': 32,
          'CD': 64,
          'DT': 1757,
          'EX': 16,
          'FW': 95,
          'IN': 1744,
          'JJ': 645,
          'JJR': 5,
          'JJS': 12,
          'LS': 6,
          'MD': 34,
          'NN': 3416,
          'NNP': 34,
          'NNS': 1239,
          'PDT': 21,
          'POS': 57,
          'PRP': 46,
          'PRP$': 32,
          'RB': 51,
          'RP': 3,
          'SYM': 2,
          'TO': 104,
          'VB': 173,
          'VBD': 426,
          'VBG': 53,
          'VBN': 581,
          'VBP': 464,
          'VBZ': 709,
          'WDT': 571,
          'WP': 645,
          'WP$': 138,
          'WRB': 205,
          '``': 2})