In [None]:
# Clone the project repository; the following cell changes into the
# resulting cs376_qiqc directory.
!git clone https://github.com/JaeminBest/cs376_qiqc

In [None]:
%cd cs376_qiqc
!mkdir input
!cp /kaggle/input/quora-insincere-questions-classification/*.csv ./input/
!unzip /kaggle/input/quora-insincere-questions-classification/embeddings.zip 
%cd ..

In [None]:
# 현재 디렉토리 세팅, pip 패키지 설치
!git checkout -b rollback-old origin/rollback-old
!pip install -r requirements.txt

!python -m nltk.downloader punkt
!python setup.py build_ext
!python setup.py develop

%env DATADIR=input

In [None]:
import numpy as np
import torch

In [None]:
from qiqc.config import ExperimentConfigBuilderBase
from qiqc.modules import BinaryClassifier
from qiqc.presets.hparam import TextNormalizerPresets
from qiqc.presets.hparam import TextTokenizerPresets
from qiqc.presets.hparam import WordEmbeddingFeaturizerPresets
from qiqc.presets.hparam import WordExtraFeaturizerPresets
from qiqc.presets.hparam import SentenceExtraFeaturizerPresets
from qiqc.presets.hparam import PreprocessorPresets
from qiqc.presets.hparam import EmbeddingPresets
from qiqc.presets.hparam import EncoderPresets
from qiqc.presets.hparam import AggregatorPresets
from qiqc.presets.hparam import MLPPresets
from qiqc.presets.hparam import EnsemblerPresets  # NOQA

In [None]:
import time
import os
from pathlib import Path

# Timestamp of this run in the locale's date/time representation
# (not referenced by the other cells visible in this notebook).
dir_time = time.strftime('%c', time.localtime(time.time()))
dir_str = 'output'

dir_path = Path(dir_str)
# exist_ok=True keeps the cell re-runnable: a bare makedirs() raises
# FileExistsError as soon as the directory already exists.
os.makedirs(dir_path, exist_ok=True)

In [None]:
# Submit setting (from original repo)

class ExperimentConfigBuilder(ExperimentConfigBuilderBase):
    """Experiment configuration used by this notebook.

    Declares the experiment-level defaults and exposes, through ``modules``,
    the preprocessing and training module classes defined further down.

    NOTE(review): how the base builder consumes ``default_config`` and
    ``modules`` is defined in qiqc/config.py, not here -- confirm there.
    """

    default_config = {
        'test': False,
        'device': 0,
        'maxlen': 72,
        'vocab_mincount': 5,
        'scale_batchsize': [],
        'validate_from': 4,
        # Path object created in an earlier cell of this notebook.
        'outdir_share': dir_path,

        'scheduler': 'exponential',
        'gamma': 0.5,
    }

    @property
    def modules(self):
        """Return the list of module classes that make up this experiment."""
        preprocessing_modules = [
            TextNormalizer,
            TextTokenizer,
            WordEmbeddingFeaturizer,
            WordExtraFeaturizer,
            SentenceExtraFeaturizer,
        ]
        training_modules = [Embedding, Encoder, Aggregator, MLP]
        return preprocessing_modules + training_modules


def build_model(config, embedding_matrix, n_sentence_extra_features):
    """Assemble the binary classifier from its sub-modules.

    Args:
        config: Experiment configuration; ``config.mlp_n_hiddens[-1]`` is
            used as the input width of the final linear layer.
        embedding_matrix: Passed straight to ``Embedding``.
        n_sentence_extra_features: Added to ``encoder.out_size`` to obtain
            the input size handed to ``MLP``.

    Returns:
        A ``BinaryClassifier`` wired with the embedding, encoder, aggregator,
        MLP, a one-unit linear output layer and a ``BCEWithLogitsLoss``.
    """
    embedding_layer = Embedding(config, embedding_matrix)
    encoder_layer = Encoder(config, embedding_layer.out_size)
    mlp_in_size = encoder_layer.out_size + n_sentence_extra_features

    # The dict values are evaluated top to bottom, so the sub-modules are
    # constructed in the same order as before (relevant in case their
    # initialisation consumes random state).
    components = {
        'embedding': embedding_layer,
        'encoder': encoder_layer,
        'aggregator': Aggregator(config),
        'mlp': MLP(config, mlp_in_size),
        'out': torch.nn.Linear(config.mlp_n_hiddens[-1], 1),
        'lossfunc': torch.nn.BCEWithLogitsLoss(),
    }
    return BinaryClassifier(**components)


# =======  Preprocessing modules  =======

class TextNormalizer(TextNormalizerPresets):
    """Text normalizer; uses TextNormalizerPresets without any overrides."""


class TextTokenizer(TextTokenizerPresets):
    """Text tokenizer; uses TextTokenizerPresets without any overrides."""


class WordEmbeddingFeaturizer(WordEmbeddingFeaturizerPresets):
    """Word-embedding featurizer; uses the preset without any overrides."""


class WordExtraFeaturizer(WordExtraFeaturizerPresets):
    """Word-level extra featurizer with 'idf' and 'unk' as default features."""

    default_config = {
        'word_extra_features': ['idf', 'unk'],
    }


class SentenceExtraFeaturizer(SentenceExtraFeaturizerPresets):
    """Sentence-level extra featurizer with 'char' and 'word' as defaults."""

    default_config = {
        'sentence_extra_features': ['char', 'word'],
    }


class Preprocessor(PreprocessorPresets):
    """Preprocessor that merges all embeddings and keeps a random column subset."""

    # Maximum number of embedding columns kept by the random sampling step.
    embedding_sampling = 400

    def build_word_features(self, word_embedding_featurizer,
                            embedding_matrices, word_extra_features):
        """Build the per-word feature matrix.

        Args:
            word_embedding_featurizer: Object exposing ``vocab`` with the
                ``lfq`` and ``unk`` masks used to zero out rows.
                NOTE(review): presumably boolean per-word arrays marking
                low-frequency / unknown words -- confirm in qiqc.
            embedding_matrices: Mapping whose values are 2-D arrays with one
                row per vocabulary entry; they are joined column-wise.
            word_extra_features: 2-D array appended column-wise after the
                sampled embedding columns.

        Returns:
            Array of the sampled embedding columns followed by the extra
            word features.
        """
        # Join the embeddings column-wise in one step. The previous
        # np.stack + np.concatenate pair produced the same matrix but made an
        # additional full copy of every embedding and required all of them to
        # have the same width.
        embedding = np.concatenate(list(embedding_matrices.values()), axis=1)

        # Zero the rows selected by both vocabulary masks.
        vocab = word_embedding_featurizer.vocab
        embedding[vocab.lfq & vocab.unk] = 0

        # Keep a random subset of at most `embedding_sampling` columns.
        # Uses the global NumPy RNG, so the selection depends on its state.
        n_embed = embedding.shape[1]
        n_select = self.embedding_sampling
        idx = np.random.permutation(n_embed)[:n_select]
        embedding = embedding[:, idx]

        word_features = np.concatenate(
            [embedding, word_extra_features], axis=1)
        return word_features


# =======  Training modules  =======

class Embedding(EmbeddingPresets):
    """Embedding module; uses EmbeddingPresets without any overrides."""


class Encoder(EncoderPresets):
    """Encoder module; uses EncoderPresets without any overrides."""


class Aggregator(AggregatorPresets):
    """Aggregator module; uses AggregatorPresets without any overrides."""


class MLP(MLPPresets):
    """MLP module; uses MLPPresets without any overrides."""


class Ensembler(EnsemblerPresets):
    """Ensembler module; uses EnsemblerPresets without any overrides."""

In [None]:
from qiqc.utils import rmtree_after_confirmation
from train import train
import argparse
import pickle

# FIXME: replace the model name and hyper-parameters below with your own.
# See qiqc/config.py for details; to change a hyper-parameter, edit it in
# the ExperimentConfigBuilder class.
config = ExperimentConfigBuilder().build(args=['--modelfile', 'test'])

config.encoder = 'lstmgru'
config.outdir = Path('output')

# Recursively removes config.outdir before training starts; config.test is
# forwarded as the second argument.
# NOTE(review): presumably that flag controls the confirmation step --
# verify in qiqc.utils.
rmtree_after_confirmation(config.outdir, config.test)
train(
    config,
    build_model,
    Preprocessor,
    TextNormalizer,
    TextTokenizer,
    WordEmbeddingFeaturizer,
    WordExtraFeaturizer,
    SentenceExtraFeaturizer,
    Ensembler,
)