# Dependencies

In [1]:
!cp -r /kaggle/input/training-output-v2-commonlit/* .

In [2]:
import os
is_kaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if not is_kaggle:
    !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
    !pip install -q scikit-learn pandas numpy matplotlib seaborn fastai
    !pip install -q kaggle

In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
import random

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns



# Prepare Data

Download the data if not in kaggle

In [4]:
competition_name = "commonlit-evaluate-student-summaries"

if is_kaggle:
    input_path = Path(f"/kaggle/input/{competition_name}")
else:
    input_path = Path.home() / ".kaggle" / "input" / competition_name
    !kaggle competitions download -p {input_path.parent} {competition_name}
    !unzip -o {input_path}.zip -d {input_path}

Load the csv files.

In [5]:
summ_train_df = pd.read_csv(input_path / "summaries_train.csv")
summ_test_df = pd.read_csv(input_path / "summaries_test.csv")
prompts_train_df = pd.read_csv(input_path / "prompts_train.csv")
prompts_test_df = pd.read_csv(input_path / "prompts_test.csv")

Merge the summaries and prompts dataframes for both training and testing

In [6]:
train_df = pd.merge(summ_train_df, prompts_train_df, on="prompt_id")
test_df = pd.merge(summ_test_df, prompts_test_df, on="prompt_id")
train_df

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
...,...,...,...,...,...,...,...,...
7160,fef3e85236e5,39c16e,"It has to be made on a complex storyline, with...",-0.981265,-1.548900,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
7161,ff0f65eecf02,39c16e,Aristotle descirbes an ideal tradgedy as being...,-0.511077,-1.589115,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
7162,ff186473ea0a,39c16e,A tragedy should have a complex plan not a sim...,-0.834946,-0.593749,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
7163,ff5e9e6068da,39c16e,Aristotle believed that the ideal tradegy shou...,-0.157460,-0.165811,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...


In [7]:
from fastai.text.all import *

In [8]:
custom_tokens = ["<prompt_title>", "<prompt_text>", "<prompt_question>", "<summary>"]
special_tokens = defaults.text_spec_tok + custom_tokens
custom_tokenizer = WordTokenizer(special_toks=special_tokens)

In [9]:
def add_combined_column(df):
    separator = " " * 100
    df['inputs'] = ('<prompt_title>' +
                    df.prompt_title +
                    '<prompt_text>' +
                    df.prompt_text +
                    '<prompt_question>' +
                    df.prompt_question +
                    '<summary>' +
                    df.text)
    return df

train_df = add_combined_column(train_df)
test_df = add_combined_column(test_df)

In [10]:
train_df.inputs[0]

'<prompt_title>The Third Wave<prompt_text>Background \r\nThe Third Wave experiment took place at Cubberley High School in Palo Alto, California during the first week of April 1967. History teacher Ron Jones, finding himself unable to explain to his students how people throughout history followed the crowd even when terrible things were happening, decided to demonstrate it to his students through an experiment. Jones announced that he was starting a movement aimed to eliminate democracy. Jones named the movement “The Third Wave” as a symbol of strength, referring to the mythical belief that the third in a series of waves is the strongest. One of the central points of this movement was that democracy’s main weakness is that it favors the individual over the whole community. Jones emphasized this main point of the movement when he created this catchy motto: “Strength through discipline, strength through community, strength through action, strength through pride.” \r\nThe Experiment \r\nJo

In [11]:
targets = ['content', 'wording']

In [12]:
def fmse_loss(preds, targets): return F.mse_loss(torch.squeeze(preds), torch.squeeze(targets))

# Inference

In [13]:
inf_learners = []
for target in targets:
    learner = load_learner(f'{target}_learner')
    learner = learner.load(f'{target}_model')
    inf_learners.append(learner)

Make predictions on the test set.

In [14]:
test_df_subm = test_df[['inputs']]
test_df_subm = test_df_subm.rename(columns={'inputs': 'text'})

test_dls = [learner.dls.test_dl(test_df_subm) for learner in inf_learners]
test_dls[0].show_batch(max_n=1)

Unnamed: 0,text
0,xxbos <prompt_title> example xxmaj title 1 <prompt_text> xxunk \n xxmaj text … <prompt_question> summarize … <summary> example text 1


In [15]:
preds = [inf_learners[i].get_preds(dl=test_dls[i], with_targs=False) for i in range(len(targets))]
preds = [pred for pred, _ in preds]
preds

[tensor([[2.6030],
         [2.6109],
         [2.5540],
         [2.5630]]),
 tensor([[ 0.0008],
         [ 0.0234],
         [-0.0071],
         [ 0.0018]])]

# Submission

In [16]:
summ_test_df

Unnamed: 0,student_id,prompt_id,text
0,000000ffffff,abc123,Example text 1
1,111111eeeeee,def789,Example text 2
2,222222cccccc,abc123,Example text 3
3,333333dddddd,def789,Example text 4


In [17]:
submission_df = summ_test_df.copy()

for i, target in enumerate(targets):
    submission_df[target] = preds[i]
    
submission_df = submission_df[['student_id', 'content', 'wording']]
submission_df

Unnamed: 0,student_id,content,wording
0,000000ffffff,2.603012,0.000796
1,111111eeeeee,2.610867,0.023417
2,222222cccccc,2.553971,-0.00708
3,333333dddddd,2.562981,0.001834


In [18]:
submission_df.to_csv("submission.csv", index=False)