# Installing a Package from a Dataset when Internet is not allowed in a competition

Sometimes you would like to use a specific Python package that is not included in Kaggle's standard notebook environment. With internet disabled, the standard `pip install` is impossible because the PyPI repository cannot be accessed.

This notebook shows how to convert a package into a dataset that can be installed in an offline kernel. In this notebook I am using the [Keras Self-Attention](https://github.com/CyberZHG/keras-self-attention) package.

If the package requires downloading external data (e.g., Hugging Face models), that data needs to be handled separately. Overall, though, this approach should work for pip packages.

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from tqdm.auto import tqdm
tqdm.pandas()


# The files in the pre-created dataset

When I created the package dataset, it ended up, for some reason, with a directory structure starting with "d". It would be simple to fix by trimming the dataset structure, but leaving it as is actually makes for a better example of the kind of issues you can run into. So why not.

In [None]:
!ls ../input
#the d below is where it is at for this dataset

In [None]:
!ls ../input/d/donkeys/kerasselfattention/mypip
#and going a bit deeper, below is listing of the actual dataset contents. 
#the keras-self-attention-0.49.0 directory should actually be a tar.gz file but Kaggle seems to unzip it from the dataset
#so have to convert it later back to tar.gz to install it

First a general look at the data in this competition:

In [None]:
# Directory holding the competition CSV files
DATA_PATH = "../input/commonlitreadabilityprize"

# Training data; the "excerpt" and "target" columns are read from it further down
df_t = pd.read_csv(f"{DATA_PATH}/train.csv")

In [None]:
df_t.head()

This will be a traditional Keras LSTM model with word embeddings from GloVe etc., so let's get those out of the way first:

In [None]:
def load_word_vectors(glove_dir):
    """Load the GloVe 6B 300-dimensional word vectors into a dict.

    Args:
        glove_dir: Directory containing the file ``glove.6B.300d.txt``.

    Returns:
        Dict mapping each word (the first whitespace-separated field of a
        line) to a float32 numpy array built from the remaining fields.
    """
    print('Indexing word vectors.')

    embeddings_index = {}
    glove_path = os.path.join(glove_dir, 'glove.6B.300d.txt')
    # The file is streamed exactly once inside the context manager, so the
    # handle is always closed and the file is never held in memory as a whole.
    with open(glove_path, encoding='utf8') as f:
        # total is only a hint for the progress bar, not a limit on lines read
        for line in tqdm(f, total=400_000):
            values = line.split()
            if not values:
                # tolerate blank lines instead of failing on values[0]
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print('Found %s word vectors.' % len(embeddings_index))
    return embeddings_index



In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def tokenize_text(vocab_size, texts, seq_length):
    """Fit a Keras Tokenizer on *texts* and turn them into padded id sequences.

    Args:
        vocab_size: Passed to the Tokenizer as ``num_words``.
        texts: The texts to fit the tokenizer on and to convert.
        seq_length: Passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        Tuple ``(X, tokenizer)``: the result of ``pad_sequences`` and the
        fitted Tokenizer.
    """
    fitted = Tokenizer(num_words=vocab_size)
    fitted.fit_on_texts(texts)
    id_sequences = fitted.texts_to_sequences(texts)

    print('Found %s unique tokens.' % len(fitted.word_index))

    padded = pad_sequences(id_sequences, maxlen=seq_length)
    print('Shape of data tensor:', padded.shape)

    return padded, fitted


In [None]:
def embedding_index_to_matrix(embeddings_index, vocab_size, embedding_dim, word_index):
    """Build the embedding weight matrix for the tokenizer vocabulary.

    Row ``i`` holds the vector of the word whose tokenizer index is ``i``.
    Rows for words missing from ``embeddings_index`` stay all-zero, and words
    whose index is ``>= vocab_size`` are skipped.

    Side effect: sets the module-level global ``num_words`` to the number of
    rows in the returned matrix.

    Args:
        embeddings_index: Mapping of word -> vector.
        vocab_size: Upper bound on the number of rows.
        embedding_dim: Number of columns of the matrix.
        word_index: Mapping of word -> integer index (the tokenizer's
            ``word_index``).

    Returns:
        A numpy array of shape ``(num_words, embedding_dim)``.
    """
    global num_words
    print('Preparing embedding matrix.')

    # +1: the tokenizer word-index starts at 1, while matrix rows start at 0
    num_words = min(vocab_size, len(word_index) + 1)
    matrix = np.zeros((num_words, embedding_dim))
    for token, row in word_index.items():
        # only look up words that fit inside the vocabulary limit
        vector = embeddings_index.get(token) if row < vocab_size else None
        if vector is not None:
            matrix[row] = vector
    return matrix

In [None]:
def create_callbacks(name):
    """Return the list of Keras callbacks used for a training run.

    The only callback is a ModelCheckpoint that monitors ``val_loss`` with
    ``save_best_only=True`` and writes to ``./model-weights-<name>.hdf5``.

    Args:
        name: String inserted into the checkpoint file name.
    """
    weights_path = "./model-weights-" + name + ".hdf5"
    best_checkpoint = ModelCheckpoint(
        filepath=weights_path,
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
    )
    return [best_checkpoint]

In [None]:
# Directory that must contain glove.6B.300d.txt, the file load_word_vectors opens
glove_dir = "../input/glove6b"
embeddings_index = load_word_vectors(glove_dir)

In [None]:
features = df_t["excerpt"]

In [None]:
# vocab_size is handed to the Tokenizer as num_words and also caps the embedding matrix rows
vocab_size = 30000
seq_length = 256 #seems there is a max of about 230 tokens in an excerpt in the training data, so I just use 256
# X holds the padded id sequences; the fitted tokenizer is reused later for the test excerpts
X, tokenizer = tokenize_text(vocab_size, features.astype(str), seq_length)

In [None]:
# 300 matches the glove.6B.300d vectors loaded by load_word_vectors
embedding_dim = 300
# NOTE: this call also sets the global num_words that build_model_lstm_attention reads
embedding_matrix = embedding_index_to_matrix(embeddings_index=embeddings_index,
                                                 vocab_size=vocab_size,
                                                 embedding_dim=embedding_dim,
                                                 word_index=tokenizer.word_index)

In [None]:
targets = df_t["target"]

In [None]:
# Hold out 20% of the training data; X_test/y_test are passed as the validation set when training below
X_train, X_test, y_train, y_test = train_test_split(X, targets, test_size=.2, random_state=6969)


In [None]:
def show_training_history(history):
    """Plot the training and validation curves recorded during ``model.fit``.

    Prints the keys available in ``history.history`` and then draws two
    figures, one for the MSE metric and one for the loss. Each figure shows
    the training series and its ``val_`` counterpart.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the keys ``MSE``, ``val_MSE``, ``loss`` and ``val_loss``.
    """
    recorded = history.history
    # list all data in history
    print(recorded.keys())

    # one figure per series: (key in the history, plot title)
    figures = (
        ('MSE', 'model MSE'),
        ('loss', 'model loss'),
    )
    for series, title in figures:
        plt.plot(recorded[series])
        plt.plot(recorded['val_' + series])
        plt.title(title)
        plt.ylabel(series)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

# Installing the external package from dataset

As I noted before, the keras-self-attention-0.49.0 directory from the dataset is unzipped by Kaggle, so I will restore it to tar.gz to be able to do pip install:

In [None]:
!ls /kaggle/input/d/donkeys/kerasselfattention/mypip
!pwd

Kaggle seems to have created this strange nested directory, with the file path containing the directory name multiple times. It does not matter, as I am copying its contents elsewhere, but it was a bit confusing.
The contents of this directory are what I need for the tar.gz file:

In [None]:
!ls -l /kaggle/input/d/donkeys/kerasselfattention/mypip/keras-self-attention-0.49.0/keras-self-attention-0.49.0

In [None]:
#on Kaggle, the /kaggle/working directory (where we usually are located) is generally writable
#however, the dataset directories cannot be written to, so have to copy them to workspace to create the tar.gz for the installable version
!rm -r mypip
!mkdir mypip
#this create the tar.gz file under the /kaggle/working/mypip directory
!tar -czvf mypip/keras-self-attention-0.49.0.tar.gz -C /kaggle/input/d/donkeys/kerasselfattention/mypip/keras-self-attention-0.49.0/keras-self-attention-0.49.0 .

In [None]:
!ls -l mypip
#there is the tar.gz just created

In [None]:
!tar tvf mypip/keras-self-attention-0.49.0.tar.gz
#and it has the contents we need

Remember, the downloaded package dataset also contains a number of .whl files for installing all the dependencies of our external package.
Those need to be copied as well, so that the pip install command does not need to reach the internet for them either.

In [None]:
!ls /kaggle/input/d/donkeys/kerasselfattention/mypip


In [None]:
!cp /kaggle/input/d/donkeys/kerasselfattention/mypip/*.whl mypip

In [None]:
!ls mypip

Finally, this is the actual command to install the package from the mypip directory that was just created above:

In [None]:
!pip install --no-index --find-links file:///kaggle/working/mypip keras-self-attention

# Using the installed package

Once the install has succeeded, it is used just like any other Python package:

In [None]:
from keras_self_attention import SeqSelfAttention, SeqWeightedAttention

In [None]:
from keras.layers import Dense, Input, GlobalMaxPooling1D, Bidirectional
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras.layers import LSTM#, CuDNNLSTM 

def build_model_lstm_attention(vocab_size, embedding_dim, embedding_matrix, sequence_length, embeddings_trainable):
    """Build and compile the bidirectional LSTM + attention regression model.

    Layer order: Embedding -> BiLSTM(128) -> SeqSelfAttention -> BiLSTM(64)
    -> SeqWeightedAttention -> Dense(64, relu) -> Dense(32, relu) ->
    Dense(1, linear). The model is compiled with the adam optimizer and MSE
    as both loss and metric.

    Args:
        vocab_size: Accepted for interface compatibility; not used here.
        embedding_dim: Output dimension of the embedding layer.
        embedding_matrix: Initial embedding weights, or None to start from
            the layer's default initialisation.
        sequence_length: Length of the input id sequences.
        embeddings_trainable: Whether the embedding layer is trainable.

    Returns:
        The compiled Keras ``Model``.

    Note:
        ``attention_width`` is read from the module-level global of that
        name. When ``embedding_matrix`` is None the number of embedding rows
        comes from the module-level global ``num_words``.
    """
    # named token_ids rather than `input` so the builtin is not shadowed
    token_ids = Input(shape=(sequence_length,), name="Input")

    # Settings shared by both embedding variants
    embedding_kwargs = {
        "output_dim": embedding_dim,
        "input_length": sequence_length,
        "trainable": embeddings_trainable,
        "name": "embedding",
    }
    if embedding_matrix is None:
        embedding_kwargs["input_dim"] = num_words
    else:
        # The row count of the supplied weights fixes input_dim, so take it
        # from the matrix itself instead of relying on the global num_words.
        embedding_kwargs["input_dim"] = len(embedding_matrix)
        embedding_kwargs["weights"] = [embedding_matrix]
    embedding = Embedding(**embedding_kwargs)(token_ids)

    lstm1_bi1 = Bidirectional(LSTM(128, return_sequences=True, name='lstm1'), name="lstm-bi1")(embedding)
    attention1 = SeqSelfAttention(attention_width=attention_width)(lstm1_bi1)
    lstm2_bi2 = Bidirectional(LSTM(64, return_sequences=True, name='lstm2'), name="lstm-bi2")(attention1)
    attention2 = SeqWeightedAttention()(lstm2_bi2)

    dense64 = Dense(64, activation='relu')(attention2)
    dense32 = Dense(32, activation='relu')(dense64)
    output = Dense(1, activation='linear')(dense32)

    model = Model(inputs=token_ids, outputs=output)
    model.compile(optimizer='adam', loss='MSE', metrics=['MSE'])
    return model

In [None]:
# Module-level settings read as globals by the functions in this notebook:
# attention_width is passed to SeqSelfAttention in build_model_lstm_attention,
# batch_size and epochs are passed to model.fit in train_model_attention.
attention_width = None
batch_size = 32
epochs = 4

def train_model_attention(seed, embedding_dim, embedding_matrix, X_train, y_train, X_val, y_val, embeddings_trainable):
    """Train the LSTM-attention model and reload its best checkpoint.

    Uses the module-level globals ``vocab_size``, ``seq_length``,
    ``batch_size`` and ``epochs``.

    Args:
        seed: Only used to name the checkpoint file
            (``./model-weights-<seed>.hdf5``).
        embedding_dim: Forwarded to ``build_model_lstm_attention``.
        embedding_matrix: Forwarded to ``build_model_lstm_attention``.
        X_train, y_train: Training inputs and targets.
        X_val, y_val: Validation inputs and targets passed to ``model.fit``.
        embeddings_trainable: Forwarded to ``build_model_lstm_attention``.

    Returns:
        Tuple ``(history, model)``: the object returned by ``model.fit`` and
        the model with the checkpointed weights loaded.
    """
    checkpoint_callbacks = create_callbacks(f"{seed}")
    weights_file = f"./model-weights-{seed}.hdf5"

    model = build_model_lstm_attention(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        sequence_length=seq_length,
        embedding_matrix=embedding_matrix,
        embeddings_trainable=embeddings_trainable,
    )

    history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_val, y_val),
        callbacks=checkpoint_callbacks,
    )

    # the checkpoint callback keeps only the best epoch, so reload that state
    print(f"loading model weights: {weights_file}")
    model.load_weights(weights_file)

    return history, model



In [None]:
# 6969 only names the checkpoint file; the held-out split serves as validation data; True makes the embeddings trainable
history, model = train_model_attention(6969, embedding_dim, embedding_matrix, X_train, y_train, X_test, y_test, True)

In [None]:
show_training_history(history)

In [None]:
# Predict on the competition test set and write submission.csv
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test_features = test_df["excerpt"]
ids = test_df['id']

# reuse the tokenizer that was fitted on the training excerpts
sequences = tokenizer.texts_to_sequences(test_features.astype(str))
x_test = pad_sequences(sequences, maxlen=seq_length)

# drop axis 1 of the predictions so they form a single column
results = pd.Series(np.squeeze(model.predict(x_test), 1))

# the unnamed Series becomes column 0 after concat; rename it to 'target'
submission = pd.concat([ids, results], axis=1).rename({0: 'target'}, axis=1)
submission.to_csv('submission.csv', index=False)

In [None]:
submission.head()

# Creating the zip file for the dataset

The following commands will download the package and its dependencies, and zip them. The resulting zip file should be downloadable from the kernel output, and can then be added to a dataset. 



In [None]:
!pip download -d mydataset keras-self-attention
!zip -r mydataset.zip mydataset
