# 1. Import dependencies

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

Using TensorFlow backend.


# 2. Hyperparameters and user-inputs

In [2]:
TEST_SPLIT = 0.05
MAX_FEATURES = 50000
MAX_SEQ_LEN = 50
EMBEDDING_DIM = 300

DATA_PATH = '/home/rakesh47/Downloads/NanoNets/Assignment/Data/train.csv'
GLOVE_PATH = f'/home/rakesh47/Downloads/NanoNets/Assignment/glove.6B/glove.6B.{EMBEDDING_DIM}d.txt'

# 3. Explore and prepare data

## 3.1. Load and explore data

In [3]:
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
data.shape

(363861, 6)

In [5]:
data = data.iloc[:, 3:]
data.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
data.shape

(363861, 3)

In [7]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [8]:
X.head()

Unnamed: 0,question1,question2
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?


In [9]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: is_duplicate, dtype: int64

In [10]:
print(f'Class-imbalance ratio: {y.sum() / len(y)}')

Class-imbalance ratio: 0.37150175479097786


## 3.2. Create train-test splits

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=0)

In [12]:
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (345667, 2)
X_test shape: (18194, 2)
y_train shape: (345667,)
y_test shape: (18194,)


In [13]:
X_train_Q1 = X_train.iloc[:, 0]
X_train_Q2 = X_train.iloc[:, 1]

X_test_Q1 = X_test.iloc[:, 0]
X_test_Q2 = X_test.iloc[:, 1]

## 3.3. Print average sequence lengths

In [14]:
print(f'Average train-Q1 sequence length: {np.mean(list(map(len, X_train_Q1)))}')
print(f'Average train-Q2 sequence length: {np.mean(list(map(len, X_train_Q2)))}')
print(f'Average test-Q1 sequence length: {np.mean(list(map(len, X_test_Q1)))}')
print(f'Average test-Q2 sequence length: {np.mean(list(map(len, X_test_Q2)))}')

Average train-Q1 sequence length: 59.473157692229805


TypeError: object of type 'float' has no len()

## 3.4. Remove nan's from X_train_Q2

In [15]:
nans_count = 0
nans_idx_list = []

for i in range(len(X_train_Q2)):
    try: 
        np.isnan(X_train_Q2.iloc[i])
        nans_count += 1
        nans_idx_list.append(i)
    except: continue
        
print(f"Nan's count: {nans_count}")
print(f"Nan's indices: {nans_idx_list}")

Nan's count: 2
Nan's indices: [11766, 147547]


In [16]:
for i, nan_idx in enumerate(nans_idx_list):
    X_train_Q1 = X_train_Q1.drop(X_train_Q1.index[nan_idx-i])
    X_train_Q2 = X_train_Q2.drop(X_train_Q2.index[nan_idx-i])
    y_train = y_train.drop(y.index[nan_idx-i])

## 3.5. Print average sequence lengths once again

In [17]:
print(f'Average train-Q1 sequence length: {np.mean(list(map(len, X_train_Q1)))}')
print(f'Average train-Q2 sequence length: {np.mean(list(map(len, X_train_Q2)))}')
print(f'Average test-Q1 sequence length: {np.mean(list(map(len, X_test_Q1)))}')
print(f'Average test-Q2 sequence length: {np.mean(list(map(len, X_test_Q2)))}')

Average train-Q1 sequence length: 59.47332243646305
Average train-Q2 sequence length: 60.076666714882904
Average test-Q1 sequence length: 59.70413323073541
Average test-Q2 sequence length: 59.91618115862372


## 3.6. Tokenize data

In [18]:
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(list(X_train_Q1) + list(X_train_Q2) + list(X_test_Q1) + list(X_test_Q2))

X_train_seq1 = tokenizer.texts_to_sequences(X_train_Q1)
X_train_seq2 = tokenizer.texts_to_sequences(X_train_Q2)

X_test_seq1 = tokenizer.texts_to_sequences(X_test_Q1)
X_test_seq2 = tokenizer.texts_to_sequences(X_test_Q2)

word_index = tokenizer.word_index
print('No. of tokens: ', len(word_index))

No. of tokens:  91013


## 3.7. Pad sequences

In [19]:
X_train_seq1 = pad_sequences(X_train_seq1, maxlen=MAX_SEQ_LEN)
X_train_seq2 = pad_sequences(X_train_seq2, maxlen=MAX_SEQ_LEN)

X_test_seq1 = pad_sequences(X_test_seq1, maxlen=MAX_SEQ_LEN)
X_test_seq2 = pad_sequences(X_test_seq2, maxlen=MAX_SEQ_LEN)

print(f'X_train_seq1 shape: {X_train_seq1.shape}')
print(f'X_train_seq2 shape: {X_train_seq2.shape}')
print(f'X_test_seq1 shape: {X_test_seq1.shape}')
print(f'X_test_seq2 shape: {X_test_seq2.shape}')

X_train_seq1 shape: (345665, 50)
X_train_seq2 shape: (345665, 50)
X_test_seq1 shape: (18194, 50)
X_test_seq2 shape: (18194, 50)


## 3.8. Load word-vectors in a dictionary

In [20]:
embeddings_index = {}
f = open(GLOVE_PATH, 'r')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('No. of word vectors: %d' % len(embeddings_index))

No. of word vectors: 400000


## 3.9. Prepare embedding matrix

In [21]:
num_words = min(MAX_FEATURES, len(word_index))
embedding_matrix = np.zeros((num_words+1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [22]:
pd.DataFrame(embedding_matrix).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.04656,0.21318,-0.007436,-0.45854,-0.035639,0.23643,-0.28836,0.21521,-0.13486,-1.6413,...,-0.013064,-0.29686,-0.079913,0.195,0.031549,0.28506,-0.087461,0.009061,-0.20989,0.053913
2,-0.20017,0.14302,0.052055,-0.000809,0.017009,0.014899,-0.25524,-0.17907,-0.046713,-2.0547,...,0.045239,-0.35298,0.3335,0.28104,0.20338,-0.4788,-0.039697,0.034939,-0.12599,0.21863
3,-0.1749,0.22956,0.24924,-0.20512,-0.12294,0.021297,-0.23815,0.13737,-0.08913,-2.0607,...,0.31357,-0.13407,0.18465,0.23426,0.076272,0.10502,0.21521,-0.24131,-0.40402,0.054744
4,-0.2852,-0.013883,0.31607,-0.19182,0.059983,0.60524,-0.18121,-0.20191,0.056732,-2.1441,...,0.305,-0.30684,0.15291,-0.027711,0.27281,-0.4436,-0.15616,-0.10859,-0.14354,0.1485


In [23]:
embedding_matrix.shape

(50001, 300)

# 4. Save prepared data

In [24]:
with open('PreparedData/train.pkl', 'wb') as f:
    pickle.dump([X_train_seq1, X_train_seq2, y_train], f)
    
with open('PreparedData/test.pkl', 'wb') as f:
    pickle.dump([X_test_seq1, X_test_seq2, y_test], f) 
    
with open('SavedObjects/embedding.pkl', 'wb') as f:
    pickle.dump(embedding_matrix, f)

with open('SavedObjects/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)