### **Check whether a GPU instance is available.**

In [0]:
# List the compute devices TensorFlow can see on this runtime; inspect the
# returned list for a GPU entry.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

# **Import Google and other dependencies**

In [0]:
from google.colab import auth
from googleapiclient.discovery import build

In [0]:
import io , requests
import sys

import pandas as pd
import os

In [0]:
# Authenticate the current Colab user; the Drive client built in the next cell
# presumably relies on these credentials.
auth.authenticate_user()

In [0]:
# `build` was already imported in an earlier cell; the repeated import is harmless.
from googleapiclient.discovery import build
# Client for the Drive v3 API, used by the Drive helper functions below.
drive_service = build('drive', 'v3')

In [0]:
# Local working directory: downloaded inputs, the saved model and the
# submission file are all written under this path.
SOURCE_FOLDER='/content/datalab/'

# ***`Get parent folder function`***

In [0]:
def get_parent_folder(folder_name):
  """Find Google Drive folders whose name equals ``folder_name``.

  Args:
    folder_name: Name of the Drive folder to look up.

  Returns:
    A list of ``{"name": ..., "id": ...}`` dicts, one per matching folder.
    The list is empty when nothing matches and may hold several entries,
    because the query matches on name only.
  """
  # Backslashes and single quotes have to be escaped inside a Drive query
  # string literal; without this a name such as "Quinn's data" produces an
  # invalid query.
  escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
  query = "name='%s' and mimeType='application/vnd.google-apps.folder'" % escaped_name
  page_token = None
  folder_array = []
  while True:
      response = drive_service.files().list(q=query,
                                          spaces='drive',
                                          fields='nextPageToken, files(id, name)',
                                          pageToken=page_token).execute()
      folder_array.extend(
          {"name": entry.get('name'), "id": entry.get('id')}
          for entry in response.get('files', [])
      )
      # Results are paginated: keep requesting until no further page token is returned.
      page_token = response.get('nextPageToken', None)
      if page_token is None:
          break
  return folder_array

# **`Get all the files from the parent folder`**

In [0]:
def get_files_from_parent(parent_id):
  """Collect the Drive entries that have ``parent_id`` as a parent.

  Args:
    parent_id: Drive id of the containing folder.

  Returns:
    A dict mapping each entry's name to its Drive id. When two entries share
    a name, the one listed later wins.
  """
  name_to_id = {}
  query = "'%s' in parents" % parent_id
  next_token = None
  while True:
    listing = drive_service.files().list(
        q=query,
        spaces='drive',
        fields='nextPageToken, files(id, name)',
        pageToken=next_token,
    ).execute()
    for entry in listing.get('files', []):
      name_to_id[entry.get('name')] = entry.get('id')
    # Follow the pagination token until the listing is exhausted.
    next_token = listing.get('nextPageToken', None)
    if next_token is None:
      return name_to_id

# ***`Download a file from Google Drive and return it as an in-memory buffer`***

In [0]:
def get_file_buffer(file_id, verbose=0):
  """Download a Drive file into memory.

  Args:
    file_id: Drive id of the file to fetch.
    verbose: When truthy, draw a text progress bar on stdout while downloading.

  Returns:
    An ``io.BytesIO`` rewound to offset 0 that holds the downloaded bytes.
  """
  from googleapiclient.http import MediaIoBaseDownload
  request = drive_service.files().get_media(fileId=file_id)
  downloaded = io.BytesIO()
  downloader = MediaIoBaseDownload(downloaded, request)
  done = False
  while not done:
    progress, done = downloader.next_chunk()
    if verbose:
      total_size = progress.total_size
      if total_size:
        percentage_done = int(progress.resumable_progress * 100 / total_size)
      else:
        # Guard against a missing or zero total size (e.g. an empty file),
        # which would otherwise raise on the division above.
        percentage_done = 100 if done else 0
      sys.stdout.write('\r')
      sys.stdout.write("[%-100s] %d%%" % ('=' * percentage_done, percentage_done))
      # Flush after writing so the bar that was just drawn is actually shown.
      sys.stdout.flush()
  if verbose:
    # Terminate the progress-bar line so later output starts on a fresh line.
    sys.stdout.write('\n')
  downloaded.seek(0)
  return downloaded

## **Downloading the files and keeping them in the local folder.**

In [51]:
# Look up the Drive folder that holds the competition files.
parent_folder = get_parent_folder('Kaggle_comp')
print(parent_folder)
# Fail with a clear message instead of a bare IndexError when the folder is missing.
if not parent_folder:
  raise FileNotFoundError("No Google Drive folder named 'Kaggle_comp' was found")
parent_folder[0]["id"]

[{'name': 'Kaggle_comp', 'id': '1olKu9Hm0dDDlHls6vFEFlXe_wLcUcgfq'}]


'1olKu9Hm0dDDlHls6vFEFlXe_wLcUcgfq'

In [52]:
# Map every entry of the first matching folder to its Drive id (name -> id).
input_file_meta = get_files_from_parent(parent_folder[0]["id"])
print(input_file_meta)

{'train.csv': '1SX6kKPsNg2g5uG2ZeDpxGpydG3dSWggi', 'test.csv': '1ZZVWvnyn_4dLkKy-05ZhwU1haQZbezQM', 'sample_submission.csv': '1eZ7rUlzLmgXbipXiRAnhrNGFm5kDAlge', 'glove.6B.50d.txt': '1UJi72JDh9TVfegP2ZIdWjedTmBTaZpbl'}


In [33]:
# open() does not create directories, so make sure the destination exists first.
os.makedirs(SOURCE_FOLDER, exist_ok=True)
# Loop names avoid `file` and `id`, which would shadow builtins for every later cell.
for file_name, file_id in input_file_meta.items():
  print("processing %s data" % file_name)
  downloaded = get_file_buffer(file_id, verbose=1)
  dest_file = os.path.join(SOURCE_FOLDER, file_name)
  with open(dest_file, "wb") as out:
    out.write(downloaded.read())
  # Report completion only after the file has been closed and flushed to disk.
  print("Done %s" % dest_file)
  

Done /content/datalab/train.csv
Done /content/datalab/test.csv
Done /content/datalab/sample_submission.csv

Done /content/datalab/glove.6B.50d.txt


## ***Once downloaded, the files do not need to be downloaded again***

In [44]:
!ls -ltrh datalab/adc.json

total 293M
-rw-r--r-- 1 root root 2.2K Jan 30 14:18 adc.json
-rw-r--r-- 1 root root  66M Jan 30 14:33 train.csv
-rw-r--r-- 1 root root  58M Jan 30 14:40 test.csv
-rw-r--r-- 1 root root 6.0M Jan 30 14:40 sample_submission.csv
-rw-r--r-- 1 root root 164M Jan 30 14:42 glove.6B.50d.txt


## **Loading the downloaded files as DataFrames for machine learning.**

In [0]:
# Absolute paths of the downloaded inputs inside SOURCE_FOLDER.
EMBEDDING_FILE = os.path.join(SOURCE_FOLDER, 'glove.6B.50d.txt')  # pre-trained word vectors
TRAIN_DATA_FILE = os.path.join(SOURCE_FOLDER, 'train.csv')
TEST_DATA_FILE = os.path.join(SOURCE_FOLDER, 'test.csv')
SUBMISSION_SAMPLE_FILE = os.path.join(SOURCE_FOLDER, 'sample_submission.csv')

In [0]:
# Load the train and test CSVs (in that order) into DataFrames.
train, test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_FILE, TEST_DATA_FILE)
)

***Installing Keras, if needed.***

In [0]:
# Uncomment the next line if Keras is missing from the runtime.
# !pip install keras

**Importing the dependencies for ML**

In [56]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [0]:
embed_size = 50 # dimensionality of each word vector; matches the 50d GloVe file used as EMBEDDING_FILE
max_features = 20000 # vocabulary cap: how many unique words to keep (rows of the embedding matrix)
maxlen = 100 # number of tokens per comment passed to pad_sequences

In [0]:
# Read the raw competition data.
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

# Label columns, in the order the model outputs them, and the training targets.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Comment texts as arrays; missing comments become the placeholder "_na_".
list_sentences_train, list_sentences_test = (
    frame["comment_text"].fillna("_na_").values for frame in (train, test)
)

In [0]:
# Fit the vocabulary on the training comments only, capped at max_features words.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Convert each comment into a sequence of integer word indices.
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

# Bring every sequence to exactly maxlen entries.
X_t, X_te = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_test)
)

In [0]:
def get_coefs(word, *arr):
    """Pair a word with its remaining fields converted to a float32 vector."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Read the embedding file inside a context manager so the handle is closed, and
# decode it explicitly as UTF-8 rather than with the platform default.
# Blank lines are skipped because get_coefs needs at least the word field.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split())
        for line in embedding_file
        if line.strip()
    )

In [61]:
# np.stack expects a real sequence of arrays; newer NumPy rejects a dict view,
# so materialise the values as a list first.
all_embs = np.stack(list(embeddings_index.values()))
# Mean and standard deviation over all embedding values, used below to draw
# random vectors for words that have no pre-trained embedding.
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.020940498, 0.6441043)

In [0]:
word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is reserved), so holding the full vocabulary
# takes len(word_index) + 1 rows; the table is still capped at max_features.
nb_words = min(max_features, len(word_index) + 1)
# Start from random vectors with the same mean/std as the pre-trained embeddings,
# then overwrite the rows of words that have a pre-trained vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Skip words whose index falls outside the table.
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [0]:
# Bidirectional LSTM over the embedded tokens, max-pooled over time, followed
# by a small dense head with one sigmoid output per label.
inp = Input(shape=(maxlen,))
# Take the vocabulary size from the weight matrix itself so the layer's
# input_dim always agrees with the shape of the supplied weights.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [64]:
# Print the layer stack with output shapes and parameter counts as a sanity check.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 50)           1000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 100)          40400     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 306       
Total para

In [65]:
# Train for one epoch on all rows; no validation_split is passed, so nothing is held out.
model.fit(X_t, y, batch_size=1024, epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x7fb7cc6b6828>

In [66]:
# Call fit again on the same model object for 7 epochs, again without a validation split.
model.fit(X_t, y, batch_size=1024, epochs=7)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
 15360/159571 [=>............................] - ETA: 1:07 - loss: 0.0476 - acc: 0.9823

Epoch 5/7
Epoch 6/7
Epoch 7/7
 30720/159571 [====>.........................] - ETA: 1:01 - loss: 0.0425 - acc: 0.9838



<keras.callbacks.History at 0x7fb7ca53eeb8>

In [0]:
# Save the trained model as lstm.h5 inside SOURCE_FOLDER.
model.save(os.path.join(SOURCE_FOLDER,'lstm.h5'))

In [78]:
!ls -ltrh datalab/

total 326M
-rw-r--r-- 1 root root 2.2K Jan 30 14:18 adc.json
-rw-r--r-- 1 root root  66M Jan 30 14:33 train.csv
-rw-r--r-- 1 root root  58M Jan 30 14:40 test.csv
-rw-r--r-- 1 root root 6.0M Jan 30 14:40 sample_submission.csv
-rw-r--r-- 1 root root 164M Jan 30 14:42 glove.6B.50d.txt
-rw-r--r-- 1 root root  13M Jan 30 15:07 lstm.h5
-rw-r--r-- 1 root root  22M Jan 30 15:12 my_submission.csv


In [76]:
# Score every padded test comment; the model emits one sigmoid output per label.
y_test = model.predict([X_te], batch_size=1024, verbose=1)



In [0]:
# Destination of the generated submission file.
my_submission = os.path.join(SOURCE_FOLDER, "my_submission.csv")

# Reuse the sample submission as a template and replace its label columns with
# the model's predictions.
sample_submission = pd.read_csv(SUBMISSION_SAMPLE_FILE)
sample_submission[list_classes] = y_test
sample_submission.to_csv(path_or_buf=my_submission, index=False)

In [79]:
!head datalab/my_submission.csv

id,toxic,severe_toxic,obscene,threat,insult,identity_hate
00001cee341fdb12,0.9813286066055298,0.32150861620903015,0.9645172357559204,0.06272788345813751,0.8748438358306885,0.24955222010612488
0000247867823ef7,0.00016412425611633807,3.433727329138492e-07,0.00011456113134045154,3.0945102480472997e-06,2.8234480851097032e-05,5.867563231731765e-06
00013b17ad220c46,0.0009202632354572415,2.9844879918528022e-06,0.000265269773080945,1.999957748921588e-05,0.0001547763531561941,2.0764724467881024e-05
00017563c3f7919a,0.00026241716113872826,3.0800524086771475e-07,7.621410622959957e-05,2.838007048922009e-06,4.035261372337118e-05,2.7426312954048626e-06
00017695ad8997eb,0.0018909960053861141,3.7926963614154374e-06,0.00037204561522230506,2.3830991267459467e-05,0.00025817350251600146,2.8350983484415337e-05
0001ea8717f6de06,0.00031233043409883976,3.449679013556306e-07,7.782545435475186e-05,2.9242271466500824e-06,3.848387495963834e-05,3.7007250739407027e-06
00024115d4cbde0f,0.000926559092476964,8.

### **Downloading my_submission.csv for uploading to Kaggle.**

---

This will download the given file to your local system.

In [0]:
# Hand the generated submission file to the Colab download helper so it can be
# saved on the local machine.
from google.colab import files
files.download(my_submission)