In [1]:
import numpy as np
import pandas as pd
import nltk.data
from collections import Counter
from nltk.tokenize import WordPunctTokenizer
from scipy import sparse
import json

In [2]:
wp_tokenizer = WordPunctTokenizer()
counter = Counter()
counter.update(['<pad>', '<unk>'])

import os

for dirname, _, filenames in os.walk('/kaggle/input/feedback-prize-2021/train'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        with open(full_path) as f:
            data = f.read()
        tokens = wp_tokenizer.tokenize(data.lower())
        counter.update(tokens)

In [3]:
# VOCAB CONSTRUCTION

print(f"Vocab size before frequency filtering: {len(counter)}")

vocab = {}
i = 0
for word, count in counter.items():
    if (count >= 5 or word in ['<pad>', '<unk>']):
        vocab[word] = i
        i += 1

print(f"Vocab size after frequency filtering: {len(vocab)}")
output_filepath = ('/kaggle/working/unigram_vocab.json')
json.dump(vocab, open(output_filepath, mode='w'))

Vocab size before frequency filtering: 52579
Vocab size after frequency filtering: 13679


In [4]:
# FEATURES + LABELS CONSTRUCTION - NEED ONE FOR EACH DISCOURSE TYPE

train_data = pd.read_csv('/kaggle/input/feedback-prize-2021/train.csv')
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


tot_lines = 0
for index, row in train_data.iterrows():
    lines = sent_tokenizer.tokenize(row['discourse_text']) 
    tot_lines += len(lines)

# CLAIM, CONCLUDING STATEMENT, COUNTERCLAIM, EVIDENCE, LEAD, POSITION, REBUTTAL  
claim_labels_array = [0]*tot_lines
con_stat_labels_array = [0]*tot_lines
coun_claim_labels_array = [0]*tot_lines
evidence_labels_array = [0]*tot_lines
lead_labels_array = [0]*tot_lines
position_labels_array = [0]*tot_lines
rebuttal_labels_array = [0]*tot_lines

feat_array = np.zeros((tot_lines, len(vocab)))

line_idx = 0
for index, row in train_data.iterrows():
    lines = sent_tokenizer.tokenize(row['discourse_text'])
    for line in lines:
        if row['discourse_type'] == 'Claim':
            claim_labels_array[line_idx] = 1
        elif row['discourse_type'] == 'Concluding Statement':
            con_stat_labels_array[line_idx] = 1
        elif row['discourse_type'] == 'Counterclaim':
            coun_claim_labels_array[line_idx] = 1
        elif row['discourse_type'] == 'Evidence':
            evidence_labels_array[line_idx] = 1
        elif row['discourse_type'] == 'Lead':
            lead_labels_array[line_idx] = 1
        elif row['discourse_type'] == 'Position':
            position_labels_array[line_idx] = 1
        elif row['discourse_type'] == 'Rebuttal':
            rebuttal_labels_array[line_idx] = 1
        
        tokens = wp_tokenizer.tokenize(line.lower())

        for tok in tokens:
            try:
                index = vocab[tok]
            except KeyError:
                index = vocab['<unk>']
            feat_array[line_idx][index] = 1 
        
        line_idx += 1

In [5]:
# TEST LOADING & TOKENIZATION

test_dir = '/kaggle/input/feedback-prize-2021/test'

def test_sent(test_file):
    '''
    generates tokenized sentences for test data with corresponding list of essay IDs
    '''
    
    full_path = os.path.join(test_dir, test_file)
    with open(full_path) as f:
        data = f.read()
    lines = sent_tokenizer.tokenize(data.lower())
    
    tokens = []
    for line in lines:
        tok_line = wp_tokenizer.tokenize(line)
        tokens.append(tok_line)
        
    lines_len = len(lines)
    id_list = [test_file.replace('.txt', '')] * lines_len
    
    return tokens, id_list, lines

test1 = test_sent('0FB0700DAF44.txt')
test2 = test_sent('18409261F5C2.txt')
test3 = test_sent('D46BCB48440A.txt')
test4 = test_sent('D72CB1C11673.txt')
test5 = test_sent('DF920E0A7337.txt')

In [6]:
def test_binary(test_tokens):
    '''
    docstring
    '''
    
    feat_array = np.zeros((len(test_tokens), len(vocab)))
    
    for line in test_tokens:
        for tok in line:
            try:
                index = vocab[tok]
            except KeyError:
                index = vocab['<unk>']
            feat_array[test_tokens.index(line)][index] = 1    

    return feat_array


In [7]:
features_sparse_matrix = sparse.csr_matrix(feat_array)
sparse.save_npz('/kaggle/working/unigram_binary_features.npz', features_sparse_matrix)
np.savez('/kaggle/working/claim_labels.npz', claim_labels_array)
np.savez('/kaggle/working/con_stat_labels.npz', con_stat_labels_array)
np.savez('/kaggle/working/coun_claim_labels.npz', coun_claim_labels_array)
np.savez('/kaggle/working/evidence_labels.npz', evidence_labels_array)
np.savez('/kaggle/working/lead_labels.npz', lead_labels_array)
np.savez('/kaggle/working/position_labels.npz', position_labels_array)
np.savez('/kaggle/working/rebuttal_labels.npz', rebuttal_labels_array)

In [8]:
# USE THIS BLOCK WHEN COMMITTING

train_features = sparse.load_npz('/kaggle/working/unigram_binary_features.npz')
claim_labels = np.load('/kaggle/working/claim_labels.npz')['arr_0']
con_stat_labels = np.load('/kaggle/working/con_stat_labels.npz')['arr_0']
coun_claim_labels = np.load('/kaggle/working/coun_claim_labels.npz')['arr_0']
evidence_labels = np.load('/kaggle/working/evidence_labels.npz')['arr_0']
lead_labels = np.load('/kaggle/working/lead_labels.npz')['arr_0']
position_labels = np.load('/kaggle/working/position_labels.npz')['arr_0']
rebuttal_labels = np.load('/kaggle/working/rebuttal_labels.npz')['arr_0']
vocab = json.load(open('/kaggle/working/unigram_vocab.json'))


# USE THIS BLOCK WHEN INITIALLY LOADING FROM SAVE
                       
#train_features = sparse.load_npz('../input/nlp35100-unigrambinary/unigram_binary_features.npz')
#claim_labels = np.load('../input/nlp35100-unigrambinary/claim_labels.npz')['arr_0']
#con_stat_labels = np.load('../input/nlp35100-unigrambinary/con_stat_labels.npz')['arr_0']
#coun_claim_labels = np.load('../input/nlp35100-unigrambinary/coun_claim_labels.npz')['arr_0']
#evidence_labels = np.load('../input/nlp35100-unigrambinary/evidence_labels.npz')['arr_0']
#lead_labels = np.load('../input/nlp35100-unigrambinary/lead_labels.npz')['arr_0']
#position_labels = np.load('../input/nlp35100-unigrambinary/position_labels.npz')['arr_0']
#rebuttal_labels = np.load('../input/nlp35100-unigrambinary/rebuttal_labels.npz')['arr_0']
#vocab = json.load(open('../input/nlp35100-unigrambinary/unigram_vocab.json'))

In [9]:
def print_important_weights(weights, words):
    """
    Print important pairs of weights and words.
    # Parameters
    weights : `Iterable`, required.
        Weights from a learned model.
    words : `Iterable`, required.
        Word types of the vocabulary.  
        It must be true that `len(weights) == len(words)`.
    # Returns
        `None`
    """

    def print_pairs(pairs):
        for weight, word in pairs:
            print("{: .4f} | {}".format(weight, word))

    assert len(weights) == len(words)
    pairs = list(zip(weights, words))
    pairs = sorted(pairs, key=lambda x: x[0], reverse=True)
    print("Most positive words:")
    print_pairs(pairs[:10])
    print("\nMost negative words:")
    print_pairs(reversed(pairs[-10:]))

    #pairs = list(zip(abs(weights), words))
    #pairs = sorted(pairs, key=lambda x: x[0], reverse=False)
    #print("\nMost neutral words:")
    #print_pairs(pairs[:10])

In [10]:
# CLAIM MODEL
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

lr_claim = LogisticRegression(solver='sag', class_weight = {0:0.25, 1:0.75}).fit(train_features, claim_labels)
probs_claim = lr_claim.predict_proba(train_features)
train_pred_claim = lr_claim.predict(train_features)

print("Claim Train Accuracy:", accuracy_score(claim_labels, train_pred_claim))
print("Claim Train F1 Score:", f1_score(claim_labels, train_pred_claim))



Claim Train Accuracy: 0.8247047315411926
Claim Train F1 Score: 0.5019728264598916


In [11]:
probs_claim = probs_claim[:,lr_claim.classes_ == 1]
weights_claim = lr_claim.coef_[0]
print_important_weights(weights=weights_claim, words=vocab.keys())

Most positive words:
 2.4905 | secondly
 2.0561 | firstly
 1.7618 | thirdly
 1.5203 | cancelled
 1.4232 | commence
 1.3994 | foremost
 1.3886 | lastly
 1.2517 | darkness
 1.2383 | poplular
 1.1898 | frist

Most negative words:
-2.3594 | ."
-2.3575 | ?
-2.3287 | ?"
-2.3255 | !"
-2.3250 | ".
-2.2342 | .".
-2.1773 | generic_name
-2.0520 | argue
-2.0478 | conclusion
-1.9764 | %.


In [12]:
# CONCLUDING STATEMENT MODEL

lr_con_stat = LogisticRegression(solver='sag', class_weight = {0:0.25, 1:0.75}).fit(train_features, con_stat_labels)
probs_con_stat = lr_con_stat.predict_proba(train_features)
train_pred_con_stat = lr_con_stat.predict(train_features)

print("Concluding Statement Train Accuracy:", accuracy_score(con_stat_labels, train_pred_con_stat))
print("Concluding Statement Train F1 Score:", f1_score(con_stat_labels, train_pred_con_stat))



Concluding Statement Train Accuracy: 0.8564294857860542
Concluding Statement Train F1 Score: 0.44305724725943973


In [13]:
probs_con_stat = probs_con_stat[:,lr_con_stat.classes_ == 1]
weights_con_stat = lr_con_stat.coef_[0]
print_important_weights(weights=weights_con_stat, words=vocab.keys())

Most positive words:
 3.4369 | conclusion
 2.8127 | conclution
 2.5220 | conclude
 2.2550 | !
 2.1570 | inconclusion
 2.1218 | thank
 2.1136 | !!
 2.0628 | hope
 1.9719 | .
 1.9623 | wrap

Most negative words:
-1.6745 | loud
-1.6116 | dear
-1.5694 | example
-1.5428 | wondered
-1.4682 | 70
-1.3848 | residents
-1.3336 | displays
-1.3319 | instance
-1.2905 | bmw
-1.2852 | english


In [14]:
# COUNTERCLAIM STATEMENT MODEL

lr_coun_claim = LogisticRegression(solver='sag', class_weight = {0:0.1, 1:0.9}).fit(train_features, coun_claim_labels)
probs_coun_claim = lr_coun_claim.predict_proba(train_features)
train_pred_coun_claim = lr_coun_claim.predict(train_features)

print("Counterclaim Statement Train Accuracy:", accuracy_score(coun_claim_labels, train_pred_coun_claim))
print("Counterclaim Statement Train F1 Score:", f1_score(coun_claim_labels, train_pred_coun_claim))

Counterclaim Statement Train Accuracy: 0.963550346593242
Counterclaim Statement Train F1 Score: 0.38917278016696183




In [15]:
probs_coun_claim = probs_coun_claim[:,lr_coun_claim.classes_ == 1]
weights_coun_claim = lr_coun_claim.coef_[0]
print_important_weights(weights=weights_coun_claim, words=vocab.keys())

Most positive words:
 2.9199 | argue
 2.6585 | downside
 2.1839 | critics
 2.1756 | hand
 2.0894 | argued
 2.0776 | opponents
 1.8324 | contrary
 1.7972 | manufactors
 1.7904 | argument
 1.7889 | yes

Most negative words:
-1.7578 | conclusion
-1.6992 | imagine
-1.6211 | !
-1.4814 | smog
-1.4601 | helped
-1.4326 | cowboys
-1.4074 | math
-1.3594 | .
-1.3520 | happier
-1.2901 | america


In [16]:
# EVIDENCE MODEL

lr_ev = LogisticRegression(solver='sag').fit(train_features, evidence_labels)
probs_ev = lr_ev.predict_proba(train_features)
train_pred_ev = lr_ev.predict(train_features)

print("Evidence Train Accuracy:", accuracy_score(evidence_labels, train_pred_ev))
print("Evidence Train F1 Score:", f1_score(evidence_labels, train_pred_ev))



Evidence Train Accuracy: 0.740550201676207
Evidence Train F1 Score: 0.7575415589838504


In [17]:
probs_ev = probs_ev[:,lr_ev.classes_ == 1]
weights_ev = lr_ev.coef_[0]
print_important_weights(weights=weights_ev, words=vocab.keys())

Most positive words:
 2.5322 | .''
 2.5188 | .".
 2.5091 | '.
 2.4813 | kennedy
 2.3385 | %.
 2.3367 | ".
 2.2987 | ."
 2.2709 | !!!
 2.2243 | .'"
 2.2065 | ?".

Most negative words:
-2.4579 | conclution
-1.9852 | conclusion
-1.8098 | delt
-1.7833 | opinon
-1.7389 | ugh
-1.7194 | wondered
-1.7079 | airport
-1.6752 | conculsion
-1.6723 | march
-1.6481 | ticked


In [18]:
# LEAD MODEL

lr_lead = LogisticRegression(solver='sag', class_weight={0: 0.2, 1: 0.8}).fit(train_features, lead_labels)
probs_lead = lr_lead.predict_proba(train_features)
train_pred_lead = lr_lead.predict(train_features)

print("Lead Train Accuracy:", accuracy_score(lead_labels, train_pred_lead))
print("Lead Train F1 Score:", f1_score(lead_labels, train_pred_lead))

Lead Train Accuracy: 0.8986184575996908
Lead Train F1 Score: 0.44523376837931605




In [19]:
probs_lead = probs_lead[:,lr_lead.classes_ == 1]
weights_lead = lr_lead.coef_[0]
print_important_weights(weights=weights_lead, words=vocab.keys())

Most positive words:
 2.0452 | ?
 1.9597 | wondered
 1.9094 | !"
 1.7410 | debate
 1.6770 | ?"
 1.6338 | ?.
 1.6003 | debated
 1.5961 | wondering
 1.5748 | ?!
 1.5351 | imagine

Most negative words:
-2.5531 | conclusion
-2.0159 | mechanical
-1.9859 | secondly
-1.7549 | furthermore
-1.6827 | certainty
-1.6818 | lastly
-1.5913 | markings
-1.5499 | conclude
-1.5490 | tie
-1.5205 | concentrate


In [20]:
# POSITION MODEL

lr_position = LogisticRegression(solver='sag', class_weight={0: 0.2, 1: 0.8}).fit(train_features, position_labels)
probs_position = lr_position.predict_proba(train_features)
train_pred_position = lr_position.predict(train_features)

print("Position Train Accuracy:", accuracy_score(position_labels, train_pred_position))
print("Position Train F1 Score:", f1_score(position_labels, train_pred_position))

Position Train Accuracy: 0.9436695408545275
Position Train F1 Score: 0.5445046628582588




In [21]:
probs_position = probs_position[:,lr_position.classes_ == 1]
weights_position = lr_position.coef_[0]
print_important_weights(weights=weights_position, words=vocab.keys())

Most positive words:
 1.6182 | advantages
 1.5284 | valuble
 1.5159 | valuable
 1.4432 | aleins
 1.4323 | benefit
 1.4037 | landformation
 1.3963 | auther
 1.3594 | drives
 1.3389 | sould
 1.3332 | dear

Most negative words:
-2.3157 | ?"
-2.1477 | lastly
-2.0113 | conclusion
-1.9456 | secondly
-1.8034 | ."
-1.7652 | example
-1.7162 | finally
-1.7086 | material
-1.6499 | drunk
-1.6150 | sleep


In [22]:
# REBUTTAL MODEL

lr_rebuttal = LogisticRegression(solver='sag', class_weight={0: 0.1, 1: 0.9}).fit(train_features, rebuttal_labels)
probs_rebuttal = lr_rebuttal.predict_proba(train_features)
train_pred_rebuttal = lr_rebuttal.predict(train_features)

print("Rebuttal Train Accuracy:", accuracy_score(rebuttal_labels, train_pred_rebuttal))
print("Rebuttal Train F1 Score:", f1_score(rebuttal_labels, train_pred_rebuttal))

Rebuttal Train Accuracy: 0.96456778494312
Rebuttal Train F1 Score: 0.2653980971457186




In [23]:
probs_rebuttal = probs_rebuttal[:,lr_rebuttal.classes_ == 1]
weights_rebuttal = lr_rebuttal.coef_[0]
print_important_weights(weights=weights_rebuttal, words=vocab.keys())

Most positive words:
 2.4530 | however
 1.9514 | incorrect
 1.7372 | but
 1.6201 | true
 1.5624 | chatting
 1.5203 | briefly
 1.4795 | pattern
 1.4296 | presidant
 1.4042 | valid
 1.3824 | carpool

Most negative words:
-1.8272 | conclusion
-1.6521 | lastly
-1.6147 | advice
-1.2989 | sad
-1.2799 | advise
-1.2784 | paris
-1.2487 | limiting
-1.2460 | dream
-1.2239 | wonderful
-1.2121 | secondly


In [24]:
# GO THROUGH MODELS

model_dict = {0: 'Claim', 1: 'Concluding Statement', 2: 'Counterclaim', 3: 'Evidence', 4: 'Lead', 5: 'Position', 6:'Rebuttal'}
tag_counts = {}
output_labels = []

all_probs = np.hstack((probs_claim, probs_con_stat, probs_coun_claim, probs_ev, probs_lead, probs_position, probs_rebuttal))

for row in all_probs:
    max_prob = np.max(row)
    max_idx = np.argmax(row)
    tag = model_dict[max_idx]
    if max_prob < 0.5 and tag == 'Evidence':
        row = np.delete(row, max_idx)
        max_idx = np.argmax(row)
        tag = model_dict[max_idx]

    tag_counts[tag] = tag_counts.get(tag, 0) + 1
    output_labels.append(tag)
    
tag_counts

{'Lead': 27937,
 'Evidence': 170884,
 'Concluding Statement': 43515,
 'Counterclaim': 7619,
 'Rebuttal': 5416,
 'Position': 17726,
 'Claim': 58127}

In [25]:
print('Lead:', sum(lead_labels))
print('Rebuttal:', sum(rebuttal_labels))
print('Claim:', sum(claim_labels))
print('Counterclaim:', sum(coun_claim_labels))
print('Evidence:', sum(evidence_labels))
print('Concluding Statement:', sum(con_stat_labels))
print('Position:', sum(position_labels))

Lead: 28462
Rebuttal: 6619
Claim: 55441
Counterclaim: 7530
Evidence: 172748
Concluding Statement: 42204
Position: 18220


In [26]:
train_data = pd.read_csv('/kaggle/input/feedback-prize-2021/train.csv')
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

correct_labels = 0
for index, row in train_data.iterrows():
    lines = sent_tokenizer.tokenize(row['discourse_text'])
    for line in lines:
        #print(line, row['discourse_type'], output_labels[index])
        if output_labels[index] == row['discourse_type']:
            correct_labels += 1

In [27]:
correct_labels/len(output_labels)

0.3340609376132164

In [28]:
len(output_labels)
print("Length of labels is " + str(len(output_labels)))
# train_data.discourse_text
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

all_lines = []
essay_ids = []
for index, row in train_data.iterrows():
    lines = sent_tokenizer.tokenize(row['discourse_text'])
    essay_id = row['id']
    for line in lines:
        all_lines.append(line)
        essay_ids.append(essay_id)
        
print("Length of inputs is " + str(len(all_lines)))
print("Length of IDs is " + str(len(essay_ids)))

Length of labels is 331224
Length of inputs is 331224
Length of IDs is 331224


In [29]:
def test_predict(test_features):
    '''
    docstring
    '''
    
    test_claim = lr_claim.predict_proba(test_features)
    probs_claim_t = test_claim[:,lr_claim.classes_ == 1]

    test_con = lr_con_stat.predict_proba(test_features)
    probs_con_stat_t = test_con[:,lr_con_stat.classes_ == 1]

    test_coun_claim = lr_coun_claim.predict_proba(test_features)
    probs_coun_claim_t = test_coun_claim[:,lr_coun_claim.classes_ == 1]

    test_ev = lr_ev.predict_proba(test_features)
    probs_ev_t = test_ev[:,lr_ev.classes_ == 1]

    test_lead = lr_lead.predict_proba(test_features)
    probs_lead_t = test_lead[:,lr_lead.classes_ == 1]

    test_position = lr_position.predict_proba(test_features)
    probs_position_t = test_position[:,lr_position.classes_ == 1]

    test_rebuttal = lr_rebuttal.predict_proba(test_features)
    probs_rebuttal_t = test_rebuttal[:,lr_rebuttal.classes_ == 1]

    tag_dict = {0: 'Claim', 1: 'Concluding Statement', 2: 'Counterclaim', 3: 'Evidence', 4: 'Lead', 5: 'Position', 6:'Rebuttal'}
    test_labels = []

    test_probs = np.hstack((probs_claim_t, probs_con_stat_t, probs_coun_claim_t, probs_ev_t, probs_lead_t, probs_position_t, probs_rebuttal_t))

    for row in test_probs:
        max_prob = np.max(row)
        max_idx = np.argmax(row)
        tag = tag_dict[max_idx]
        if max_prob < 0.5 and tag == 'Evidence':
            row = np.delete(row, max_idx)
            max_idx = np.argmax(row)
            tag = tag_dict[max_idx]

        test_labels.append(tag)
    
    return test_labels


In [30]:
def create_predictionstring(text, incr):
    '''
    A function to strip the punction and create punctuation string
    '''
    split_text = text.split()
    prediction_string = []
    for word in split_text:
        prediction_string.append(str(incr))
        incr += 1
    prediction_string = " ".join(prediction_string)
    return prediction_string, incr

In [31]:
def build_submission(all_lines, essay_ids, output_labels):
    list_dicts = []
    building_text = ''
    incr = 0
    for idx, line in enumerate(all_lines):
        label = output_labels[idx]
        essay_id = essay_ids[idx]
        if incr == 0:
            prev_label = label
            prev_id = essay_id
            building_text += line
            incr = 1
            continue
        if label == prev_label and essay_id == prev_id:
            building_text += line
            prev_label = label
            prev_id = essay_id
        else:
            prediction_string, new_incr = create_predictionstring(building_text, incr)
            list_dicts.append({'id': prev_id, 'class': prev_label, 'original text': building_text, 'predictionstring': prediction_string})
            if essay_id != prev_id:
                incr = 0
            else:
                incr = new_incr
                prev_label = label
                building_text = line
    return list_dicts

In [32]:
list_dicts = build_submission(all_lines, essay_ids, output_labels)
submission = pd.DataFrame(data=list_dicts)
submission.to_csv('/kaggle/working/submission.csv')

In [33]:
test1_dicts = build_submission(test1[2], test1[1], test_predict(test_binary(test1[0])))
test1_sub = pd.DataFrame(data=test1_dicts)

test2_dicts = build_submission(test2[2], test2[1], test_predict(test_binary(test2[0])))
test2_sub = pd.DataFrame(data=test2_dicts)

test3_dicts = build_submission(test3[2], test3[1], test_predict(test_binary(test3[0])))
test3_sub = pd.DataFrame(data=test3_dicts)

test4_dicts = build_submission(test4[2], test4[1], test_predict(test_binary(test4[0])))
test4_sub = pd.DataFrame(data=test4_dicts)

test5_dicts = build_submission(test5[2], test5[1], test_predict(test_binary(test5[0])))
test5_sub = pd.DataFrame(data=test5_dicts)

test_cat = pd.concat([test1_sub, test2_sub, test3_sub, test4_sub, test5_sub], ignore_index = True)
test_cat.drop(columns = ['original text'], inplace = True)

test_cat.to_csv('/kaggle/working/test_submission.csv')