Data Extraction

In [1]:
import pandas as pd

In [2]:
# Load the tab-separated file name_data.csv into name_df.
name_df = pd.read_csv("name_data.csv",sep="\t")

In [3]:
# Show (rows, columns) of name_df as a quick sanity check on the load.
name_df.shape

(170000, 2)

In [4]:
# Load the tab-separated file profane_words.csv into profane_df.
profane_df = pd.read_csv("profane_words.csv",sep="\t")

In [5]:
# Show (rows, columns) of profane_df as a quick sanity check on the load.
profane_df.shape

(169988, 2)

In [6]:
# Stack the two frames row-wise into one dataset with a fresh 0..n-1 index.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
final_df = pd.concat([name_df, profane_df], ignore_index=True)

In [7]:
# Shuffle all rows and renumber the index. The seed is fixed so the row order
# (and therefore which rows end up in the tail that model2.fit uses as its
# validation split) is the same on every Restart & Run All.
final_df = final_df.sample(frac=1, random_state=17).reset_index(drop=True)

In [8]:
# Count missing values per column of the combined frame.
final_df.isna().sum()

TEXT     2
LABEL    0
dtype: int64

In [9]:
# Preview the first 10 rows of the shuffled, combined frame.
final_df.head(n=10)

Unnamed: 0,TEXT,LABEL
0,verscend,NOTNAME
1,Unish Saru,NAME
2,hello u hi khul,NOTNAME
3,Sarovar Chandra Rajopadhyaye,NAME
4,anarchist,NOTNAME
5,Dronachandra Kadayat,NAME
6,Rajeshwori Sherstha,NAME
7,Rahul Hamo,NAME
8,Sworupa Pahi,NAME
9,Abindra Ballav Sakha,NAME


In [10]:
# Report every TEXT entry that is not a plain Python string: print a "bad"
# marker followed by the offending value.
for entry in final_df["TEXT"]:
    if type(entry) is str:
        continue
    print("bad")
    print(entry)

bad
nan
bad
nan


In [11]:
# Normalise TEXT to lower-case strings.
# Missing values are replaced with an empty string *before* the string cast:
# calling str() on NaN would produce the literal word "nan", which the
# tokenizer would then learn as if it were real text. Row count is unchanged,
# so labels stay aligned.
final_df["TEXT"] = final_df["TEXT"].fillna("").astype(str).str.lower()

Data splitting

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows with a fixed seed.
# NOTE(review): model2.fit further down is called on `data`/`labels`, not on
# these splits -- confirm whether X_train/X_test/y_train/y_test are still needed.
X_train, X_test, y_train, y_test = train_test_split(
    final_df["TEXT"],
    final_df["LABEL"],
    test_size=.33,
    random_state=17,
)

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Upper bound handed to the tokenizer as num_words.
vocabulary_size = 20000

# Word-level tokenizer; words outside the kept vocabulary map to 'UNK'.
tokenizer = Tokenizer(
    num_words=vocabulary_size,
    oov_token='UNK',
    char_level=False,
)
tokenizer.fit_on_texts(final_df["TEXT"])

# Turn every text into a list of word indices, then pad/truncate each list
# to exactly 10 entries so the result is a rectangular array.
sequences = tokenizer.texts_to_sequences(final_df['TEXT'])
data = pad_sequences(sequences, maxlen=10)

Using TensorFlow backend.


In [14]:
# Inspect the index -> word mapping learned by fit_on_texts.
# NOTE(review): this displays the whole mapping; consider showing only a slice.
tokenizer.index_word

{1: 'UNK',
 2: 'rana',
 3: 'chandra',
 4: 'kumar',
 5: 'nath',
 6: 'ballav',
 7: 'jung',
 8: 'bahadur',
 9: 'prasad',
 10: 'aditya',
 11: 'lol',
 12: 'september',
 13: 'com',
 14: 'haha',
 15: 'tyo',
 16: 'october',
 17: 'nai',
 18: 'bholi',
 19: 'hai',
 20: 'haru',
 21: 'hola',
 22: 'pani',
 23: 'kati',
 24: 'malai',
 25: 'haina',
 26: 'lala',
 27: 'timi',
 28: 'tero',
 29: 'xaina',
 30: 'okay',
 31: 'aja',
 32: 'bhayo',
 33: 'gardai',
 34: 'maile',
 35: 'chha',
 36: 'shubham',
 37: 'bhane',
 38: 'aah',
 39: 'na',
 40: 'jana',
 41: 'pratik',
 42: 'hunxa',
 43: 'bhan',
 44: 'bata',
 45: 'kasto',
 46: 'colz',
 47: 'chaina',
 48: 'andresh',
 49: 'garna',
 50: 'garne',
 51: 'timro',
 52: 'kaile',
 53: 'ghar',
 54: 'co',
 55: 'omar',
 56: 'huncha',
 57: 'tei',
 58: 'bhai',
 59: 'k',
 60: 'samma',
 61: 'lo',
 62: 'bhaneko',
 63: 'kei',
 64: 'sha',
 65: 'kun',
 66: 'ahh',
 67: 'sahil',
 68: 'www',
 69: 'wala',
 70: 'jasto',
 71: 'dherai',
 72: 'baje',
 73: 'xa',
 74: 'hahaha',
 75: 'chhaina'

In [15]:
# Encode the string labels as integers: "NAME" -> 1, "NOTNAME" -> 0.
label_codes = {"NAME": 1, "NOTNAME": 0}
final_df["LABEL"] = final_df["LABEL"].replace(label_codes)

In [16]:
# Keep the LABEL column under a short name; it is the target passed to model2.fit.
labels = final_df["LABEL"]

In [17]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense,  LSTM, Conv1D, MaxPooling1D, Dropout, Activation
import numpy as np
from keras.layers import BatchNormalization

In [18]:
# LSTM binary classifier over the padded word-index sequences in `data`.
# The single sigmoid unit outputs the score for label 1.
model2 = Sequential([
    # input_dim reuses vocabulary_size (the tokenizer's num_words) and
    # input_length follows the padded width of `data`, so the model cannot
    # silently drift out of sync with the preprocessing cell.
    Embedding(vocabulary_size, 50, input_length=data.shape[1]),
    LSTM(50, dropout=0.2, recurrent_dropout=0.2),
    BatchNormalization(),
    Dropout(0.4),
    Dense(50, activation="tanh"),
    BatchNormalization(),
    Dropout(0.4),
    Dense(50, activation="tanh"),
    BatchNormalization(),
    Dropout(0.4),
    Dense(1, activation="sigmoid"),
])
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# validation_split=0.4 holds out the last 40% of the rows for validation,
# which is why the rows are shuffled earlier in the notebook.
model2.fit(data, np.array(labels), validation_split=0.4, epochs=8)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 203992 samples, validate on 135996 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f81588b59e8>

In [26]:
# Load the tab-separated file test_data.csv into test_df.
test_df = pd.read_csv("test_data.csv",sep="\t")

In [27]:
# Separate the input text from the labels.
# NOTE(review): test_label is not referenced in the cells visible here -- confirm
# whether an evaluation step that uses it exists further down.
test = test_df["text"]
test_label = test_df["label"]

In [28]:
# Encode the test texts with the tokenizer fitted on the training data.
sequences_test = tokenizer.texts_to_sequences(test)
# Pad/truncate to the same width as the training array `data`, rather than
# repeating the literal 10, so test inputs always match what model2 expects.
test_data = pad_sequences(sequences_test, maxlen=data.shape[1])

In [29]:
# Score the padded test sequences with the trained model.
test_pred = model2.predict(test_data)

In [30]:
# Turn each predicted score into a label: above 0.5 is "NAME", otherwise "NOTNAME".
new_pred = ["NAME" if score > 0.5 else "NOTNAME" for score in test_pred]

In [31]:
# Pair every test text with its predicted label as (text, label) tuples.
result = list(zip(test, new_pred))

In [32]:
# Display the (text, predicted label) pairs.
result

[('samrat subedi', 'NAME'),
 ('shreyam adhikari', 'NAME'),
 ('tilak bhatt', 'NAME'),
 ('sujan thapa magar', 'NOTNAME'),
 ('binod chaudharya', 'NAME'),
 ('keshab aryal', 'NAME'),
 ('jeet khamcha', 'NAME'),
 ('jit bahadur  khamcha', 'NAME'),
 ('ishwor regmi', 'NAME'),
 ('iswor adhikari', 'NAME'),
 ('asmita khakurel', 'NAME'),
 ('asmi bhujel', 'NOTNAME'),
 ('khate', 'NOTNAME'),
 ('bhate', 'NOTNAME'),
 ('randi', 'NOTNAME'),
 ('fucker', 'NOTNAME'),
 ('sucker', 'NOTNAME'),
 ('sambridhhi acharya', 'NOTNAME'),
 ('archana regmni', 'NOTNAME'),
 ('mula', 'NOTNAME'),
 ('randiko chora', 'NOTNAME'),
 ('madar chot', 'NOTNAME'),
 ('chor', 'NOTNAME'),
 ('bhaate', 'NOTNAME'),
 ('khaate', 'NOTNAME'),
 ('ganjeee', 'NOTNAME'),
 ('maam paka', 'NOTNAME'),
 ('railaa', 'NOTNAME'),
 ('raila', 'NOTNAME'),
 ('kera jasto', 'NOTNAME'),
 ('tero bau', 'NOTNAME'),
 ('badar jasto tah', 'NOTNAME'),
 ('baadar', 'NOTNAME'),
 ('monkey', 'NOTNAME'),
 ('donkey', 'NOTNAME'),
 ('tah gadha', 'NOTNAME'),
 ('bhatuwa', 'NOTNAME'),