In [1]:
import keras
import re
import numpy as np
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def sanitize_file(filename, username):
    file = open(filename+'.txt', 'r+', encoding='utf8')
    lines = file.readlines()
    # to filter the lines that have the username
    pattern = r'(?=.*\b' + username +'\\b)'
#     print(pattern)
    # to remove the username and the timestamp part
    pattern2 = r'([^:]*:)'
    sanitized_text = []
    for line in lines:
#         print(line)
        if re.match(pattern, line):
            to_write = re.sub(pattern2,'', line)
            to_write = to_write.strip()
            sanitized_text.append(to_write)
    file.close()
    
    # Write the sanitized data
    to_remove = ['This message was deleted', '<Media omitted>']
    filtered = list(filter(lambda text: text not in to_remove, sanitized_text))
    file_write = open(filename + '-Sanitized.txt', 'w+', encoding='utf8')
    for line in filtered:
        file_write.write(line + '\n')
    file_write.close()
    print('Sanitization Done !')

In [3]:
sanitize_file('Rishika', 'Rishika')

Sanitization Done !


In [4]:
sanitize_file('Shubham', '(MSRUAS)')

Sanitization Done !


In [5]:
sanitize_file('Shyamant','Shyamant Iron Fe')

Sanitization Done !


In [6]:
def file_to_lines_list(filename):
    file = open(filename + '.txt', encoding='utf8')
    lines = file.readlines()
    to_add = []
    for line in lines:
        to_add.append(line.rstrip())
    
    print('Lines List Generated ! for {0}'.format(filename))
    
    return to_add

In [7]:
def tokenize_file(filename):
    tokenizer = keras.preprocessing.text.Tokenizer(num_words=None, lower=False, split=' ', char_level=False, oov_token=None, document_count=0)

    to_add = file_to_lines_list(filename)
    tokenizer.fit_on_texts(to_add)
    
    print('Tokenization Done !')
    
    return tokenizer, to_add

Labels:
0. Rishika
1. Shubham
2. Shyamant

In [8]:
class_name = ['Rishika', 'Shubham', 'Shyamant']

In [9]:
all_lines_list = []
number_of_messages = []

lines_list = file_to_lines_list('Rishika-Sanitized')
number_of_messages.append(len(lines_list))
all_lines_list.extend(lines_list)

lines_list = file_to_lines_list('Shubham-Sanitized')
number_of_messages.append(len(lines_list))
all_lines_list.extend(lines_list)

lines_list = file_to_lines_list('Shyamant-Sanitized')
number_of_messages.append(len(lines_list))
all_lines_list.extend(lines_list)

tokenized = keras.preprocessing.text.Tokenizer()
tokenized.fit_on_texts(all_lines_list)

Lines List Generated ! for Rishika-Sanitized
Lines List Generated ! for Shubham-Sanitized
Lines List Generated ! for Shyamant-Sanitized


In [10]:
word_index = tokenized.word_index
reverse_word_index = {v: k for k, v in word_index.items()}

In [11]:
# summarize what was learned
print(tokenized.word_counts)
print(tokenized.document_count)
print(tokenized.word_index)
print(tokenized.word_docs)

8038


In [12]:
print(all_lines_list)



In [13]:
print(tokenized.word_index)



# Creating the Training Data

In [14]:
x_train = tokenized.texts_to_sequences(all_lines_list)

In [15]:
all_lines_list

['Hiii',
 'satyajit',
 'Kl de dena na',
 'Haha',
 'Yup',
 'Phy ka portion',
 'Arey',
 'Plz',
 'Ty',
 'Arey laser kitte marks ka aa rha',
 '😭',
 'Are you having a bad day',
 'Nhi pta',
 'Kissika call toh aaya tha',
 'Maine receive nhi kiya tha',
 'Y',
 'Sabita K??',
 'Haaawwww',
 'Kyuuuu',
 'Fuck shit',
 'Thank god',
 '🤦🏻\u200d♀',
 '🙄',
 'Aise nhi hta',
 'Bolo??',
 'Arey',
 'Hahaha',
 'This 🙄',
 'Arey haan',
 'Portion kya h',
 '🙄',
 'Yaad tha',
 'M waiting',
 'I will',
 'Jaana h tumhe',
 'Humaare saath',
 '🙄🙄',
 'Kya del kiya',
 '??',
 '🙄',
 'Achaaa',
 'Humlog mostly Goa jaa rhe',
 'So',
 '15k',
 'Socha h',
 'Aur baaki pta nhi',
 'Ooo',
 'Ok',
 'Parents',
 '😑',
 'Hey',
 'Suggest some power bank',
 '🙄',
 'I mean what brand',
 'Mi 😕',
 'Mi use kiye the',
 'Online ya in store ?',
 'Hmm',
 'Yeh toh 10k h',
 'Dis is 20k',
 'Achaa',
 'Croma??',
 'Or mi store?',
 'Haha',
 'Oooo',
 'Haan',
 'Ek lena h',
 'Ambrane kaisa h??',
 'Tera kon sa h',
 '😂',
 '🙄',
 'Haha',
 'Arey its okay',
 '🙄',
 '😇',
 

In [16]:
x_train

[[218],
 [1711],
 [135, 237, 403, 67],
 [600],
 [136],
 [280, 43, 705],
 [504],
 [601],
 [2436],
 [504, 2437, 342, 265, 43, 98, 72],
 [174],
 [175, 404, 934, 12, 1712, 361],
 [9, 80],
 [1713, 238, 16, 382, 30],
 [125, 2438, 9, 118, 30],
 [137],
 [2439, 546],
 [2440],
 [1714],
 [1715, 239],
 [547, 469],
 [324],
 [76],
 [470, 9, 281],
 [405],
 [504],
 [441],
 [25, 76],
 [504, 14],
 [705, 21, 3],
 [76],
 [406, 30],
 [362, 1119],
 [2, 84],
 [935, 3, 602],
 [1716, 343],
 [252],
 [21, 936, 118],
 [],
 [76],
 [706],
 [505, 937, 1368, 407, 219],
 [19],
 [2441],
 [1717, 3],
 [40, 650, 80, 9],
 [200],
 [32],
 [1718],
 [48],
 [153],
 [938, 75, 344, 939],
 [76],
 [2, 176, 52, 1369],
 [786, 548],
 [786, 73, 787, 1],
 [1719, 17, 10, 1120],
 [240],
 [70, 16, 2442, 3],
 [651, 6, 2443],
 [788],
 [1720],
 [35, 786, 1120],
 [600],
 [294],
 [14],
 [154, 789, 3],
 [2444, 652, 3],
 [383, 120, 603, 3],
 [38],
 [76],
 [600],
 [504, 604, 1721],
 [76],
 [1370],
 [471, 1722],
 [14],
 [153],
 [135, 472, 93, 155, 

In [17]:
number_of_messages

[4656, 229, 3153]

In [18]:
y_train = []
for _ in range(number_of_messages[0]):
    y_train.append(0)
for _ in range(number_of_messages[1]):
    y_train.append(1)
for _ in range(number_of_messages[2]):
    y_train.append(2)

## Just making sure the data is alright

In [19]:
y_train[6000], class_name[y_train[6000]]

(2, 'Shyamant')

In [20]:
text = x_train[6000]
text

[22, 63, 61, 271, 3878, 13, 919, 78]

In [21]:
all_lines_list[6000]

'Like there r two kinds of wattmeter noe?'

In [22]:
len(all_lines_list), len(x_train)

(8038, 8038)

In [23]:
' '.join([reverse_word_index.get(i, '?') for i in text])

'like there r two kinds of wattmeter noe'

In [24]:
x_train

[[218],
 [1711],
 [135, 237, 403, 67],
 [600],
 [136],
 [280, 43, 705],
 [504],
 [601],
 [2436],
 [504, 2437, 342, 265, 43, 98, 72],
 [174],
 [175, 404, 934, 12, 1712, 361],
 [9, 80],
 [1713, 238, 16, 382, 30],
 [125, 2438, 9, 118, 30],
 [137],
 [2439, 546],
 [2440],
 [1714],
 [1715, 239],
 [547, 469],
 [324],
 [76],
 [470, 9, 281],
 [405],
 [504],
 [441],
 [25, 76],
 [504, 14],
 [705, 21, 3],
 [76],
 [406, 30],
 [362, 1119],
 [2, 84],
 [935, 3, 602],
 [1716, 343],
 [252],
 [21, 936, 118],
 [],
 [76],
 [706],
 [505, 937, 1368, 407, 219],
 [19],
 [2441],
 [1717, 3],
 [40, 650, 80, 9],
 [200],
 [32],
 [1718],
 [48],
 [153],
 [938, 75, 344, 939],
 [76],
 [2, 176, 52, 1369],
 [786, 548],
 [786, 73, 787, 1],
 [1719, 17, 10, 1120],
 [240],
 [70, 16, 2442, 3],
 [651, 6, 2443],
 [788],
 [1720],
 [35, 786, 1120],
 [600],
 [294],
 [14],
 [154, 789, 3],
 [2444, 652, 3],
 [383, 120, 603, 3],
 [38],
 [76],
 [600],
 [504, 604, 1721],
 [76],
 [1370],
 [471, 1722],
 [14],
 [153],
 [135, 472, 93, 155, 

In [25]:
x_train = np.array(x_train)
y_train = np.array(y_train)
x_train_new = np.array(x_train)

In [26]:
x_train_new[0]

[218]

In [27]:
x_train.shape, y_train.shape

((8038,), (8038,))

In [28]:
x_train = keras.preprocessing.sequence.pad_sequences(x_train,
                                                    value=0,
                                                    padding='post',
                                                    maxlen=256)

# Build the Model

In [104]:
vocab_size = 10000

model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(16, activation='relu'))
# model.add(keras.layers.Dropout(0.1))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [105]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Fit the model

In [158]:
history = model.fit(x_train, y_train, epochs=40, batch_size=20, validation_split=0.2, verbose=1)

Train on 6430 samples, validate on 1608 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [107]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d_16  (None, 16)                0         
_________________________________________________________________
dense_37 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_38 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 17        
Total params: 160,561
Trainable params: 160,561
Non-trainable params: 0
_________________________________________________________________


# Save the Model

In [108]:
model.save('whatsappv0.1.model')

# Testing

In [159]:
texts = ['Like there r two kinds of wattmeter noe?']
seq = tokenized.texts_to_sequences(texts)

In [160]:
seq

[[22, 63, 61, 271, 3878, 13, 919, 78]]

In [161]:
pred = model.predict(seq[0])
pred

array([[0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]], dtype=float32)

In [157]:
idx = np.argmax(pred)
class_name[idx], pred.tolist()

('Shubham', [[0.0], [1.0]])