### Convert the essay `.txt` files to a single CSV

In [2]:
import os
from tqdm import tqdm
import pandas as pd

mainpath = "./analyzer/data/train"

# os.listdir returns entries in arbitrary order, so sort to make the row order
# of the resulting frame reproducible. Only .txt entries are essays; anything
# else in the folder (e.g. checkpoint directories) is skipped.
file_ls = sorted(name for name in os.listdir(mainpath) if name.endswith(".txt"))


def _read_essay(path):
    """Return the full contents of the text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed right after
    reading, and it is decoded explicitly as UTF-8 so the resulting text does
    not depend on the platform's default encoding.
    """
    # NOTE(review): assumes the essay files are UTF-8 encoded — confirm.
    with open(path, "r", encoding="utf-8") as fh:
        return fh.read()


txt_ls = [_read_essay(os.path.join(mainpath, fp)) for fp in tqdm(file_ls)]

# One row per essay file: the file name as "id" and its contents as "text".
df = pd.DataFrame.from_dict({
    "id": file_ls,
    "text": txt_ls,
})
df.head()

  0%|          | 0/15594 [00:00<?, ?it/s]

100%|██████████| 15594/15594 [00:44<00:00, 354.01it/s]


Unnamed: 0,id,text
0,0000D23A521A.txt,"Some people belive that the so called ""face"" o..."
1,00066EA9880D.txt,Driverless cars are exaclty what you would exp...
2,000E6DE9E817.txt,Dear: Principal\n\nI am arguing against the po...
3,001552828BD0.txt,Would you be able to give your car up? Having ...
4,0016926B079C.txt,I think that students would benefit from learn...


In [4]:
# Write to train_essay.csv: that is the file the later cells in this notebook
# read back, so a fresh top-to-bottom run stays consistent.
# NOTE(review): the previous target was train.csv, which no cell in this
# notebook reads — confirm nothing outside the notebook expects it.
df.to_csv("./analyzer/data/train_essay.csv", index=False)

In [7]:
import pandas as pd

# Path is read and then overwritten in place, so keep it in one variable.
essay_csv = "./analyzer/data/train_essay.csv"

df = pd.read_csv(essay_csv)

# Remove only a trailing ".txt" extension. The anchored pattern leaves ids
# untouched if they happen to contain ".txt" elsewhere, and the vectorised
# .str accessor replaces the per-row Python lambda.
df["id"] = df["id"].str.replace(r"\.txt$", "", regex=True)

df.to_csv(essay_csv, index=False)
df.head()

Unnamed: 0,id,text
0,0000D23A521A,"Some people belive that the so called ""face"" o..."
1,00066EA9880D,Driverless cars are exaclty what you would exp...
2,000E6DE9E817,Dear: Principal\n\nI am arguing against the po...
3,001552828BD0,Would you be able to give your car up? Having ...
4,0016926B079C,I think that students would benefit from learn...


### Scratch: download the pretrained BERT config and model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
# tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', force_download=True)
# Hub identifier used for both the config and the token-classification model.
bert_checkpoint = "google-bert/bert-base-cased"

# force_download=True is passed to both calls.
# NOTE(review): presumably this re-fetches the files even when a cached copy
# exists — confirm that is still wanted on every run.
config = AutoConfig.from_pretrained(bert_checkpoint, force_download=True)
model = AutoModelForTokenClassification.from_pretrained(
    bert_checkpoint,
    force_download=True,
)

### Preprocessing

In [21]:
import pandas as pd 
from tqdm import tqdm

# Row offset into train_essay.csv at which tagging starts.
# NOTE(review): 12126 looks like a leftover from resuming an interrupted run;
# set it to 0 to tag every essay — confirm.
ESSAY_START_ROW = 12126

essay_df = (
    pd.read_csv(r"D:\Learning\NLP\Projects\EssayInsightAI\analyzer\data\train_essay.csv")
    .iloc[ESSAY_START_ROW:]
    .copy()  # independent frame, so the new column is not added to a slice
)
label_df = pd.read_csv(r"D:\Learning\NLP\Projects\EssayInsightAI\analyzer\data\train_labels.csv")

# Group the annotations by essay id once, instead of filtering the whole label
# frame for every essay. Appending in file order keeps the original
# overwrite order when two annotations touch the same token.
labels_by_essay = {}
for essay_id, discourse_type, prediction_string in zip(
    label_df["id"], label_df["discourse_type"], label_df["predictionstring"]
):
    labels_by_essay.setdefault(essay_id, []).append((discourse_type, prediction_string))


def _bio_tags_for_essay(text, annotations, essay_id=None):
    """Build one BIO tag per whitespace-separated token of ``text``.

    Parameters
    ----------
    text : str
        Essay text; tokens are the result of ``text.split()``.
    annotations : iterable of (discourse_type, prediction_string)
        ``prediction_string`` is a space-separated list of token indices. The
        first index is tagged ``B-<discourse_type>``, the rest
        ``I-<discourse_type>``. Entries that are not strings (a missing value
        is read from CSV as NaN) or that contain no indices are skipped.
    essay_id : optional
        Only used to make the error message identify the essay.

    Returns
    -------
    list of str
        Tags, ``"O"`` for every token not covered by an annotation.

    Raises
    ------
    IndexError
        If an index is negative or not smaller than the token count.
    """
    seq_len = len(text.split())
    tags = ["O"] * seq_len
    for discourse_type, prediction_string in annotations:
        if not isinstance(prediction_string, str):
            continue
        token_ids = [int(k) for k in prediction_string.split()]
        if not token_ids:
            continue
        # Reject negative indices too: they would silently wrap around and tag
        # tokens at the end of the essay.
        out_of_range = [i for i in token_ids if not 0 <= i < seq_len]
        if out_of_range:
            raise IndexError(
                f"essay {essay_id!r}: token indices {out_of_range} are outside "
                f"the {seq_len} tokens produced by text.split()"
            )
        tags[token_ids[0]] = f"B-{discourse_type}"
        for token_id in token_ids[1:]:
            tags[token_id] = f"I-{discourse_type}"
    return tags


predictions = []
# total= lets tqdm show a real progress bar; iterrows() alone has no length.
for _, row in tqdm(essay_df.iterrows(), total=len(essay_df)):
    annotations = labels_by_essay.get(row["id"], ())
    predictions.append(_bio_tags_for_essay(row["text"], annotations, row["id"]))

essay_df["prediction"] = predictions

3468it [00:58, 58.99it/s]


In [19]:
# Debugging probe: displays the id of the row currently bound to `row`.
# NOTE(review): `row` is presumably the variable left over in the kernel from
# the `for _, row in ...` preprocessing loop, so this cell fails on a fresh
# kernel unless that loop has run — confirm whether the cell can be deleted.
row["id"]

'C647D6659C92'

In [12]:
"A34FF5F00E3A", "C647D6659C92"

'I disagree with the principal because some kids may like to go home ,or do not want to do anything after school, they could be busy ,or do not have the right actions or just are not as good at any of the sports. Some kids at the school may not think of the sports as something to do or dont like there school team and could think its not for them to play on a school team or they just cant do it at all . Although,\n\nkids might not wanna do sports at all and think itÃƒÂ…Ã¢Â€Âº a bad team or it just not the type of things they prefer to do . Most children like staying home and playing games or doing other fun things or just eat a lot of food and be lazy but possibly , kids are having a busy life and have to do things and do not have time for the sports like help grandparents or siblings or just watch over the house while no ones there. It may be a bit hard for them to get into any sports if students cant control their actions as in helping or showing there doing something or communication

In [27]:
import os
import glob

datapath = r"D:\Learning\NLP\Projects\EssayInsightAI\analyzer\data"
test_dir = os.path.join(datapath, "test")


def _read_test_file(path):
    """Return the full contents of the text file at ``path``.

    Uses a ``with`` block so the handle is closed immediately after reading,
    and decodes explicitly as UTF-8 so the text does not depend on the
    platform's default encoding.
    """
    # NOTE(review): assumes the test files are UTF-8 encoded — confirm.
    with open(path, "r", encoding="utf-8") as fh:
        return fh.read()


# Sorted for a reproducible order; test_texts[i] is the content of test_paths[i].
test_paths = sorted(os.listdir(test_dir))
test_texts = [_read_test_file(os.path.join(test_dir, f)) for f in test_paths]

### Inference

In [10]:
import os
import sys
import json 
import torch

# Make the working directory and its analyzer/ folder importable. The
# membership check keeps sys.path from growing each time the cell is re-run.
for extra_path in ("./", "./analyzer"):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

# load_saved_model, load_config and prepare_test_data are called by later
# cells, so they have to be imported here for the notebook to run on a fresh
# kernel.
# NOTE(review): these two imports were previously commented out — confirm that
# analyzer.train and analyzer.utils import cleanly in this environment.
from analyzer.train import load_saved_model
from analyzer.utils import load_config, prepare_test_data
from analyzer.metrics import competition_metric

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
submission_csv = r"D:\Learning\NLP\Projects\EssayInsightAI\analyzer\output\distilbert\submission.csv"

# Both frames are loaded from the same submission file, so the metric below
# scores the predictions against themselves.
pred_df = pd.read_csv(submission_csv)

# The "ground truth" copy carries its label under "discourse_type" instead of
# "class": copy the column across, then drop the old one.
true_df = (
    pd.read_csv(submission_csv)
    .assign(discourse_type=lambda frame: frame["class"])
    .drop(columns="class")
)

competition_metric(pred_df, true_df)

1.0

In [2]:
# Locations of the data folder, the trained checkpoint and the run config.
datapath = r"D:\Learning\NLP\Projects\EssayInsightAI\analyzer\data"
ckpt_path = r"D:\Learning\NLP\Projects\EssayInsightAI\analyzer\output\distilbert\lightning_logs\version_0\checkpoints\epoch=0-step=5.ckpt"
config_path = r"D:\Learning\NLP\Projects\EssayInsightAI\analyzer\configs\config.yaml"

config = load_config(config_path)

# Load the label mapping stored next to the data and derive its inverse and
# its size from it.
label_map_file = os.path.join(datapath, "label2id.json")
with open(label_map_file, "r") as f:
    label2id = json.load(f)
num_classes = len(label2id)
id2label = {index: name for name, index in label2id.items()}

# load_saved_model receives the three label objects bundled in one list and
# its result is unpacked into three names.
label_info = [label2id, id2label, num_classes]
data, network, trainer = load_saved_model(config, ckpt_path, label_info)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at D:\Learning\NLP\Projects\EssayInsightAI\analyzer\models\distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [3]:
# Build the test frame from the data folder and wrap it in a dataloader.
# `datapath`, `data`, `network` and `trainer` are the names assigned in the
# cell that calls load_saved_model, so that cell must have run first.
test_df = prepare_test_data(datapath)
test_loader = data.test_dataloader(test_df)

# Run prediction over the test dataloader.
# NOTE(review): the name `logits` suggests raw per-token scores, presumably
# one entry per batch — confirm against the network's predict step.
logits = trainer.predict(network, dataloaders=test_loader)

d:\Learning\NLP\envs\essay_ai\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'predict_dataloader' to speed up the dataloader worker initialization.


Predicting DataLoader 0: 100%|██████████| 3/3 [00:03<00:00,  0.95it/s]


In [None]:
# Flatten the "word_ids" entry of every batch into a single list: each batch
# contributes one item per element along the first axis of its array.
# NOTE(review): assumes batch["word_ids"] is a tensor, since .numpy() is called on it.
word_ids = [
    sample_word_ids
    for batch in test_loader
    for sample_word_ids in batch["word_ids"].numpy()
]
len(word_ids)

In [9]:
import pandas as pd


def get_out(x, y):
    """Return the pair ``(x + 1, y + 2)``."""
    shifted_x = x + 1
    shifted_y = y + 2
    return shifted_x, shifted_y

# Small demo frame with three integer columns.
df = pd.DataFrame({
    "X": [1, 2, 3],
    "Y": [7, 8, 9],
    "Z": [-3, -6, -9],
})

# Call get_out once per row; result_type="expand" spreads the returned pair
# over two columns, which are then stored as "A" and "B".
expanded = df.apply(
    lambda rec: get_out(rec["X"], rec["Y"]),
    axis=1,
    result_type="expand",
)
df[["A", "B"]] = expanded
df.head()

Unnamed: 0,X,Y,Z,A,B
0,1,7,-3,2,9
1,2,8,-6,3,10
2,3,9,-9,4,11
