In [3]:
#!pip install pandas

In [4]:
#!pip install keras

In [5]:
#!pip install --upgrade tensorflow-gpu

In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing import text, sequence

Using TensorFlow backend.


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [3]:
!echo $CONDA_DEFAULT_ENV

fastpy3


In [6]:
# Pretrained word-vector text files; each one is passed to build_matrix() and
# the resulting matrices are concatenated along the last axis.
# NOTE(review): these are absolute paths on one machine — adjust them (or make
# the directory configurable) before running the notebook elsewhere.
EMBEDDING_FILES = [
    '/home/ravi/embeds/crawl-300d-2M.vec',
    '/home/ravi/embeds/glove.840B.300d.txt'
]

In [7]:
# Training configuration.
# NOTE(review): NUM_MODELS, BATCH_SIZE and EPOCHS are not referenced by any
# cell visible in this notebook yet — presumably reserved for the training loop.
NUM_MODELS = 2
BATCH_SIZE = 512
# LSTM_UNITS / DENSE_HIDDEN_UNITS are only referenced by the commented-out
# Keras build_model() in the visible cells.
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 4
# Sequence length passed as `maxlen` to sequence.pad_sequences().
MAX_LEN = 220

In [8]:
# Identity columns of train.csv: binarised at 0.5 and used to build the
# per-sample weights further down.
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]
# Columns extracted together into y_aux_train (note that 'target' is included).
AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
TEXT_COLUMN = 'comment_text'
TARGET_COLUMN = 'target'
# Characters handed to the Keras Tokenizer as its `filters` argument.
# The backslash in front of '×' is written as '\\' because '\×' is an invalid
# escape sequence (a warning on current Python, slated to become an error);
# the resulting string value is exactly the same as before.
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—'

In [7]:
from typing import List

In [8]:
def get_pairs(word: str, *embeds: str):
    """Pair a token with its embedding vector.

    Intended to be called with the space-separated fields of one line of an
    embedding text file, unpacked: ``get_pairs(*line.strip().split(' '))``.

    Args:
        word: The token (first field of the line).
        *embeds: The remaining fields, each one a numeric string.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array
        holding the parsed values of ``embeds``.

    Raises:
        ValueError: If any of ``embeds`` cannot be converted to a float.
    """
    # A var-positional annotation describes each individual argument, so it
    # is `str` here rather than `List[str]`.
    return word, np.array(embeds, dtype='float32')

In [9]:
get_pairs('dsa','1','2','3','5','6')

('dsa', array([1., 2., 3., 5., 6.], dtype=float32))

In [10]:
f_name = EMBEDDING_FILES[0]

In [11]:
!ls

 cat.txt
 cuda-repo-ubuntu1804_10.0.130-1_amd64.deb
 keras_code.py
 __pycache__
'Simple LSTM - PyTorch version 8d4be5.ipynb'
'Simple LSTM - PyTorch version 8d4be5.py'
 Untitled.ipynb
 Untitled.py


In [12]:
f = open('cat.txt')

In [13]:
a = f.readlines(100)

In [14]:
a

['ravi 1\n',
 'geya 2\n',
 'bat 5\n',
 'pokemon 7\n',
 'lol 8\n',
 'bad 9\n',
 'great 10\n']

In [15]:
# Build a word -> vector lookup from the beginning of the embedding file.
# To keep the pipeline fast, only the lines returned by readlines(7000) are
# parsed (reading stops once the size hint has been exceeded) instead of
# iterating over every line of the file.
embed_dict = {}
with open(f_name) as f:
    embed_dict = dict(
        get_pairs(*line.strip().split(' '))
        for line in f.readlines(7000)
    )

In [16]:
def make_line(word:str, *embed):
    """Return ``(word, values)``, with the trailing arguments packed into a NumPy array.

    No dtype is forced: NumPy infers it from the values that were passed.
    """
    values = np.array(embed)
    return word, values

In [17]:
import gc

In [18]:
del embed_dict
gc.collect()

200

In [19]:
make_line(1,2,3)

(1, array([2, 3]))

In [20]:
dict(((1,2),(3,4)))

{1: 2, 3: 4}

In [21]:
embeds = {}

In [22]:
def get_coefs(word, *arr):
    """Pair a token with its embedding values converted to a float32 array."""
    return word, np.asarray(arr, dtype='float32')


def load_embeddings(path, size_hint=5000):
    """Read an embedding text file into a ``{word: vector}`` dict.

    Every line is split on single spaces; the first field is the word and the
    remaining fields are parsed as float32 values.

    Args:
        path: Path of the embedding text file.
        size_hint: Forwarded to ``readlines``. The default of 5000 keeps the
            deliberately truncated read that makes the pipeline fast while
            experimenting; pass ``None`` (or a value <= 0) to read the whole
            file.

    Returns:
        dict mapping each word to a 1-D float32 array.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(path, encoding='utf-8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f.readlines(size_hint))


def build_matrix(word_index, path, embedding_dim=300, size_hint=5000):
    """Build the embedding matrix for a tokenizer vocabulary.

    Args:
        word_index: Mapping ``word -> row index``. The matrix gets
            ``len(word_index) + 1`` rows, so indices are expected to lie in
            ``1..len(word_index)``.
        path: Embedding text file, read with ``load_embeddings``.
        embedding_dim: Width of the vectors (and of the returned matrix).
        size_hint: Forwarded to ``load_embeddings``.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows of words
        without a usable vector stay all-zero.
    """
    embedding_index = load_embeddings(path, size_hint)
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        vector = embedding_index.get(word)
        # Skip unknown words, and also entries of the wrong width such as the
        # "<count> <dim>" header line that fastText .vec files start with: it
        # parses to a 1-element vector that would otherwise be silently
        # broadcast across the whole row.
        if vector is None or vector.shape != (embedding_dim,):
            continue
        embedding_matrix[i] = vector
    return embedding_matrix
    

In [23]:
# def build_model(embedding_matrix, num_aux_targets):
#     words = Input(shape=(None,))
#     x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
#     x = SpatialDropout1D(0.2)(x)
#     x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
#     x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

#     hidden = concatenate([
#         GlobalMaxPooling1D()(x),
#         GlobalAveragePooling1D()(x),
#     ])
#     hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
#     hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
#     result = Dense(1, activation='sigmoid')(hidden)
#     aux_result = Dense(num_aux_targets, activation='sigmoid')(hidden)
    
#     model = Model(inputs=words, outputs=[result, aux_result])
#     model.compile(loss='binary_crossentropy', optimizer='adam')

#     return model
    

In [25]:
class bias_killer(nn.Module):
    """A stack of linear layers followed by a single-output linear head.

    Args:
        n_hidden: Number of linear layers placed before the head.
        hid_dim: Output width of every one of those layers.
        inp_dim: Width of the input features.

    NOTE(review): no activation is applied between the layers, so the whole
    stack is still an affine map of the input — confirm this is intended.
    """

    def __init__(self,n_hidden,hid_dim,inp_dim):
        super().__init__()
        self._n_hidden = n_hidden
        self._hid_dim = hid_dim
        self.inp_dim = inp_dim
        self.modlist = nn.ModuleList()
        in_features = inp_dim
        for _ in range(n_hidden):
            self.modlist.append(nn.Linear(in_features, hid_dim))
            in_features = hid_dim
        # The head's input width follows the last layer that was actually
        # built, so n_hidden == 0 yields Linear(inp_dim, 1) instead of a head
        # whose width does not match the input.
        self.head = nn.Linear(in_features, 1)
        # Kept only for backward compatibility with code that may read them.
        self.temp1 = in_features
        self.temp2 = hid_dim

    def forward(self,x,y,apply_sig=False,dbug=False):
        """Run ``x`` through every layer and the head.

        Args:
            x: Input tensor whose last dimension is ``inp_dim``.
            y: Accepted but not used by the forward pass.
            apply_sig: When true, return the sigmoid of the head output
                instead of the raw value.
            dbug: Accepted but not used.

        Returns:
            Tensor whose last dimension is 1.
        """
        for lyr in self.modlist:
            x = lyr(x)
        x = self.head(x)
        # torch.sigmoid replaces the deprecated F.sigmoid (same values).
        return torch.sigmoid(x) if apply_sig else x

In [26]:
m  = bias_killer(1,2,4)

In [27]:
m

bias_killer(
  (modlist): ModuleList(
    (0): Linear(in_features=4, out_features=2, bias=True)
  )
  (head): Linear(in_features=2, out_features=1, bias=True)
)

In [28]:
a = torch.Tensor(1,4);
a

tensor([[-4.9845e-17,  3.0770e-41,  0.0000e+00,  1.4013e-45]])

In [29]:
b = torch.LongTensor([0]);
b

tensor([0])

In [30]:
m(a,b)

tensor([[0.3553]], grad_fn=<AddmmBackward>)

In [4]:
train_df = pd.read_csv('../../data/train.csv')
# test_df is used by later cells (shape check, tokenizer fitting, padding and
# the submission frame), so it has to be loaded here as well; with this line
# commented out a fresh-kernel run fails with a NameError.
test_df = pd.read_csv('../../data/test.csv')

In [7]:
train_df.shape,test_df.shape

((1804874, 45), (1000, 2))

In [44]:
x_train = train_df[TEXT_COLUMN].astype(str)
y_train = train_df[TARGET_COLUMN].values
y_aux_train = train_df[AUX_COLUMNS].values
x_test = test_df[TEXT_COLUMN].astype(str)

In [47]:
train_df.head(1)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4


In [48]:
train_df.columns

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count'],
      dtype='object')

In [49]:
len(train_df.columns)

45

In [34]:
# Turn the identity columns and the target into booleans using a 0.5
# threshold (missing values compare as False).
for column in [*IDENTITY_COLUMNS, TARGET_COLUMN]:
    train_df[column] = train_df[column].ge(0.5).values

In [35]:
tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE)
tokenizer.fit_on_texts(list(x_train) + list(x_test))

In [36]:
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

In [37]:
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

In [44]:
sample_weights = np.ones(len(x_train), dtype=np.float32)

In [45]:
sample_weights += train_df[IDENTITY_COLUMNS].sum(axis=1)

In [46]:
sample_weights += train_df[TARGET_COLUMN] * (~train_df[IDENTITY_COLUMNS]).sum(axis=1)

In [47]:
sample_weights += (~train_df[TARGET_COLUMN]) * train_df[IDENTITY_COLUMNS].sum(axis=1) * 5

In [48]:
sample_weights /= sample_weights.mean()

In [49]:
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)

In [50]:
checkpoint_predictions = []
weights = []

In [51]:
from tqdm import tqdm_notebook as tqdm

In [None]:
# TODO(review): unfinished model stub. The original cell read
# `def class(nn.Module):`, which is a syntax error (`class` is a keyword and
# cannot be a function name). `SimpleLSTM` is a placeholder name — rename it
# once the model is actually written.
class SimpleLSTM(nn.Module):
    """Placeholder ``nn.Module`` subclass; it defines no layers and no forward pass yet."""

    def __init__(self):
        super().__init__()
        

In [52]:
predictions = np.average(checkpoint_predictions, weights=weights, axis=0)

ZeroDivisionError: Weights sum to zero, can't be normalized

In [53]:
submission = pd.DataFrame.from_dict({
    'id': test_df.id,
    'prediction': predictions
})
submission.to_csv('submission.csv', index=False)

NameError: name 'predictions' is not defined

In [5]:
a = torch.randn(3,2,2);
a

tensor([[[-0.5043, -0.2505],
         [-1.1373, -1.4548]],

        [[-0.4494, -0.3689],
         [ 1.5110, -0.8743]],

        [[ 1.7195, -1.0202],
         [ 0.0778,  1.4121]]])

In [6]:
l = nn.RNN(2,2,2,batch_first=True,bias=False)

In [51]:
# Toxicity sub-type columns of train.csv.
# NOTE(review): toxicity_typ is not referenced by any visible cell.
toxicity_typ = ['severe_toxicity','obscene','threat','insult','identity_attack','sexual_explicit']
# Identity columns iterated over by the per-identity statistics cell. The
# entries (and their order) are the same as IDENTITY_COLUMNS defined near the
# top of the notebook.
eval_identities = ['male','female','homosexual_gay_or_lesbian','christian','jewish','muslim'
                  ,'black','white','psychiatric_or_mental_illness']

In [17]:
np.unique(tgt,return_counts=True)

(array([0, 1]), array([1698436,  106438]))

In [25]:
# For every identity, print which fraction of the rows flagged with that
# identity have target == 1 ("pos") and which have target == 0 ("neg").
for cat in eval_identities:
    df = train_df.loc[train_df[cat] == 1]
    tot = len(df)
    pos = int((df['target'] == 1).sum())
    neg = int((df['target'] == 0).sum())
    print("{} : pos: {} neg: {}".format(cat, pos / tot, neg / tot))

male : pos: 0.1032359905288082 neg: 0.8967640094711918
female : pos: 0.09616085329093378 neg: 0.9038391467090662
homosexual_gay_or_lesbian : pos: 0.21312787672603561 neg: 0.7868721232739644
christian : pos: 0.06918496029892574 neg: 0.9308150397010743
jewish : pos: 0.11248112732762959 neg: 0.8875188726723704
muslim : pos: 0.15973207249802995 neg: 0.8402679275019701
black : pos: 0.2547021943573668 neg: 0.7452978056426333
white : pos: 0.2109888430383616 neg: 0.7890111569616384
psychiatric_or_mental_illness : pos: 0.12977707006369427 neg: 0.8702229299363057


In [2]:
! conda install -c anaconda gensim -y

Collecting package metadata: done
Solving environment: - ^C
failed

CondaError: KeyboardInterrupt

