### Importing the necessary packages

In [1]:
import collections

import matplotlib.pyplot as plt
import numpy as np
from keras.datasets import imdb
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.models import model_from_json
from keras.preprocessing import sequence
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; on newer
# versions import train_test_split from sklearn.model_selection instead.
from sklearn.cross_validation import train_test_split

seed = 42
np.random.seed(seed)

Using TensorFlow backend.


In [2]:
def data_check(file_name, train=True):
    """Print sentence-length statistics for a token file and count its tokens.

    Each line holds one comma-separated token sequence. When ``train`` is
    true the line is ``<label>\\t<tok1>,<tok2>,...`` and only the part after
    the first tab is analysed; otherwise the whole line is the sequence.

    Args:
        file_name: path of the text file to inspect.
        train: whether the file carries a leading, tab-separated label
            column. For labelled files a box plot of the sequence lengths
            is also drawn.

    Returns:
        collections.Counter mapping each token to its number of occurrences.

    Raises:
        IndexError: if ``train`` is true and a line contains no tab.
        ValueError: if the file has no lines (max/min of an empty list).
    """
    sentences = []
    # The context manager closes the file even if a malformed line raises.
    with open(file_name, 'r') as file:
        for line in file:
            if train:
                # Drop the label column; only the token column is analysed.
                line = line.split('\t')[1]
            sentences.append(line.rstrip().split(','))

    # Sentence-length statistics.
    length = [len(tokens) for tokens in sentences]
    max_len = max(length)
    min_len = min(length)
    mean_len = np.mean(length)
    std_len = np.std(length)

    if train:
        plt.boxplot(length)
    print('max sentence length: ', max_len)
    print('min sentence length: ', min_len)
    print('mean:', mean_len)
    print('std:', std_len)

    # Token frequencies; the number of keys is the vocabulary size.
    di = collections.Counter(token for tokens in sentences for token in tokens)
    print('unique words:', len(di.keys()))

    return di

In [3]:
# Token-frequency Counter for the labelled training file; the call also prints
# the sentence-length statistics and draws a box plot of the lengths.
di_train = data_check('training-data-small.txt')

max sentence length:  100
min sentence length:  2
mean: 26.2512
std: 17.84020455488109
unique words: 125


In [4]:
# Same inspection for the test file; train=False because its lines have no
# tab-separated label column.
di_test = data_check('test-data-small.txt', train=False)

max sentence length:  102
min sentence length:  2
mean: 26.50456
std: 18.126201455528403
unique words: 127


In [5]:
print(di_train)

Counter({'Z4': 9642, 'Z15': 8314, 'X344420902': 6718, 'Y3143': 4832, 'Y1222': 4451, 'Y1940': 4351, 'Y2817': 4342, 'Y1555': 4193, 'Y2321': 4081, 'Y1522': 3890, 'X123474229': 3798, 'Y2307': 3698, 'Y1478': 3429, 'Y1080': 3379, 'Y2170': 3343, 'Y1132': 3326, 'Y3269': 3301, 'Y1702': 3296, 'X773579': 3234, 'Y1486': 3196, 'Y1705': 3105, 'Y2949': 3095, 'Y2680': 3004, 'Y1725': 2965, 'Y2793': 2954, 'Y3232': 2887, 'Y3021': 2876, 'Y2814': 2758, 'Y2072': 2723, 'Y3145': 2697, 'Y2667': 2663, 'X127436348': 2638, 'Y2640': 2636, 'Y1690': 2633, 'Y2024': 2632, 'Y1468': 2630, 'Y1791': 2609, 'Y3346': 2605, 'Y1353': 2602, 'Y3585': 2566, 'Y1238': 2559, 'Y2267': 2526, 'Y1456': 2506, 'Y977': 2506, 'Y2': 2479, 'Y2257': 2473, 'Y3250': 2466, 'Y2684': 2464, 'X62716661': 2464, 'Y1367': 2431, 'Y2849': 2421, 'Y2641': 2411, 'Y3233': 2409, 'Y1206': 2401, 'Y3141': 2387, 'Y2794': 2382, 'X51769735': 2368, 'X277692318': 2330, 'Y2557': 2305, 'X420128991': 2261, 'X174205309': 2239, 'X62718521': 1949, 'X137608184': 1861, 'X4878

In [6]:
# Number of distinct tokens that occur more than 100 times in the training data.
count = sum(1 for frequency in di_train.values() if frequency > 100)
print('top words to keep: ', count)

top words to keep:  118


In [7]:
def data_process(file_name, train=True, top_words=99):
    """Encode a labelled token file as integer sequences.

    Every line must look like ``<label>\\t<tok1>,<tok2>,...``. Tokens are
    ranked by frequency (ties broken by descending token string) and the
    most frequent token gets index 3. Reserved indices: 0 is left for
    padding, 1 marks the start of every sequence, 2 replaces tokens that
    fall outside the kept vocabulary.

    Args:
        file_name: path of the labelled training file.
        train: must be true for the file to be processed. With a false value
            nothing is encoded and None is returned (unlabelled files are
            handled by ``data_preprocess_test``).
        top_words: vocabulary cut-off. Indices up to and including
            ``top_words + 3`` are kept, i.e. the ``top_words + 1`` most
            frequent tokens; every other token is mapped to 2.

    Returns:
        ``(train_data, train_label, res)`` where ``train_data`` is a 1-D
        object array holding one list of ints per line (each starting
        with 1), ``train_label`` is a 1-D array of the raw label strings, and
        ``res`` maps each token to its integer index.

    Raises:
        IndexError: if a line contains no tab.
        ValueError: if the file has no lines (max/min of an empty list).
    """
    if not train:
        # No encoding is defined for unlabelled input in this function.
        return None

    label = []
    train_x = []
    # The context manager closes the file even if a malformed line raises.
    with open(file_name, 'r') as file:
        for line in file:
            lx = line.split('\t')
            label.append(lx[0])
            train_x.append(lx[1].rstrip().split(','))

    # Sentence-length statistics.
    lengths = [len(tokens) for tokens in train_x]
    print('max sentence length: ', max(lengths))
    print('min sentence length: ', min(lengths))
    print('mean:', np.mean(lengths))
    print('std:', np.std(lengths))

    # Label array of shape (number_of_lines,).
    train_label = np.asarray(label).reshape(-1,)
    print('label array shape:', train_label.shape)

    # Rank tokens from most to least frequent; equal counts are ordered by
    # descending token string so the ranking is deterministic.
    di = collections.Counter(token for tokens in train_x for token in tokens)
    ranked = sorted(di.items(), key=lambda item: (item[1], item[0]), reverse=True)

    res = {}
    for rank, (token, _) in enumerate(ranked):
        index = rank + 3
        # Anything ranked beyond the cut-off shares the out-of-vocabulary index 2.
        res[token] = index if index <= top_words + 3 else 2

    # Fill an object array row by row: np.asarray() on the nested list breaks
    # both for ragged rows (error on recent NumPy) and for equal-length rows
    # (silently builds a 2-D string array instead of an array of lists).
    train_data = np.empty(len(train_x), dtype=object)
    for row, tokens in enumerate(train_x):
        train_data[row] = [1] + [res[token] for token in tokens]

    return (train_data, train_label, res)

def data_preprocess_test(filename, label_dictionary):
    """Encode an unlabelled token file with an existing token->index mapping.

    Every line is a comma-separated token sequence. Each token is looked up
    in ``label_dictionary``; tokens missing from the dictionary are dropped
    from the sequence, and a start marker 1 is put in front of every
    sequence. The printed length statistics describe the raw lines, before
    any token is dropped.

    NOTE(review): unseen tokens are removed rather than mapped to the
    out-of-vocabulary index 2 that the training encoder uses for rare
    tokens; confirm this asymmetry is intended.

    Args:
        filename: path of the unlabelled test file.
        label_dictionary: mapping from token string to integer index, as
            returned by ``data_process``.

    Returns:
        1-D object array with one list of ints per line of the file.

    Raises:
        ValueError: if the file has no lines (max/min of an empty list).
    """
    test_x = []
    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as file:
        for line in file:
            test_x.append(line.rstrip().split(','))

    lengths = [len(tokens) for tokens in test_x]
    print('max sentence length: ', max(lengths))
    print('min sentence length: ', min(lengths))
    print('mean:', np.mean(lengths))
    print('std:', np.std(lengths))

    # Fill an object array row by row: np.asarray() on the nested list breaks
    # both for ragged rows (error on recent NumPy) and for equal-length rows
    # (silently builds a 2-D string array instead of an array of lists).
    test_data = np.empty(len(test_x), dtype=object)
    for row, tokens in enumerate(test_x):
        looked_up = (label_dictionary.get(token, 0) for token in tokens)
        # 0 marks "not in the dictionary"; those positions are removed.
        test_data[row] = [1] + [index for index in looked_up if index != 0]
    return test_data

In [8]:
# top_words=118 is the number of tokens seen more than 100 times in the
# training data (the "top words to keep" count printed earlier).
data, target, dictionary = data_process('training-data-small.txt', top_words=118)
print(data)

max sentence length:  100
min sentence length:  2
mean: 26.2512
std: 17.84020455488109
label array shape: (10000,)
[list([1, 21, 35, 31, 3, 4])
 list([1, 78, 13, 75, 87, 70, 60, 34, 85, 5, 66, 92, 59, 67, 65, 86, 82, 99, 72, 14, 19, 11, 6, 9, 44, 25, 48, 58, 27, 29, 28, 3, 4])
 list([1, 84, 21, 5, 8, 23, 3, 4, 107]) ...
 list([1, 5, 19, 18, 30, 44, 48, 58, 27, 49, 55, 28, 16, 10, 7, 31, 43, 32, 57, 6, 39, 20, 25, 23, 50, 33, 26, 54, 35, 17, 56, 42, 36, 52, 41, 24, 29, 61, 14, 53, 15, 11, 22, 37, 12, 9, 40, 46, 8, 45, 38, 3, 4, 97])
 list([1, 105, 71, 5, 73, 58, 53, 3, 4, 110])
 list([1, 13, 87, 62, 81, 63, 60, 59, 102, 105, 75, 5, 73, 84, 103, 83, 47, 18, 30, 55, 16, 10, 7, 43, 32, 6, 20, 25, 56, 52, 41, 12, 9, 8, 3, 4])]


In [9]:
# Encode the unlabelled test file with the token->index mapping built from the
# training data; tokens that are not in that mapping are dropped.
test_data = data_preprocess_test('test-data-small.txt', dictionary)

max sentence length:  102
min sentence length:  2
mean: 26.50456
std: 18.126201455528403


In [10]:
print(test_data)

[list([1, 90, 98, 59, 44, 48, 31, 17, 11, 53, 3, 4, 97])
 list([1, 7, 56, 20, 3, 4, 97])
 list([1, 87, 21, 91, 5, 73, 81, 18, 7, 43, 3, 4, 97]) ...
 list([1, 64, 63, 86, 51, 5, 103, 21, 96, 106, 83, 74, 73, 81, 69, 13, 95, 60, 90, 98, 48, 49, 55, 7, 6, 61, 11, 38, 10, 20, 23, 37, 9, 45, 3, 4])
 list([1, 62, 63, 66, 73, 81, 103, 28, 32, 30, 33, 53, 3, 113])
 list([1, 62, 5, 73, 60, 63, 96, 83, 14, 30, 2, 109])]


In [11]:
# Bring every sequence to one fixed length so it matches the Embedding layer's
# input_length. 45 is well below the longest raw sequences (100 and 102 tokens
# in the statistics printed earlier), so long sequences lose tokens here.
# NOTE(review): which end is padded/truncated is left to the pad_sequences
# defaults — confirm against the Keras documentation.
max_len = 45 # maximum sequence length
data = sequence.pad_sequences(data, maxlen=max_len)
test_data = sequence.pad_sequences(test_data, maxlen=max_len)

In [12]:
# Hold out 10% of the labelled data. Despite their names, X_test / y_test are a
# validation split of the training file, not the unlabelled test file.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1, random_state=42)

In [13]:
print(X_train.shape, X_test.shape)

(9000, 45) (1000, 45)


In [14]:
# Create the model: embedding -> flatten -> dense(relu) -> single sigmoid unit.
# Token indices run from 0 to 118 + 3 (0 = padding, 1 = start marker,
# 2 = out-of-vocabulary, 3 and up = kept vocabulary), so the embedding table
# needs 118 + 4 rows.
top_words = 118+4
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_len))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 45, 32)            3904      
_________________________________________________________________
flatten_1 (Flatten)          (None, 1440)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               360250    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
Total params: 364,405
Trainable params: 364,405
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
# Fit the model, validating on the 10% hold-out split after every epoch.
# NOTE(review): in the recorded log val_loss is lowest after the first epoch
# and climbs from there while the training loss keeps falling, i.e. the model
# overfits almost immediately — consider far fewer epochs or early stopping.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32, verbose=2)


Train on 9000 samples, validate on 1000 samples
Epoch 1/20
 - 2s - loss: 0.5545 - acc: 0.7153 - val_loss: 0.4890 - val_acc: 0.7500
Epoch 2/20
 - 2s - loss: 0.4771 - acc: 0.7722 - val_loss: 0.4922 - val_acc: 0.7550
Epoch 3/20
 - 2s - loss: 0.4411 - acc: 0.7926 - val_loss: 0.5026 - val_acc: 0.7490
Epoch 4/20
 - 2s - loss: 0.4019 - acc: 0.8166 - val_loss: 0.5243 - val_acc: 0.7330
Epoch 5/20
 - 2s - loss: 0.3477 - acc: 0.8416 - val_loss: 0.5858 - val_acc: 0.7090
Epoch 6/20
 - 2s - loss: 0.2895 - acc: 0.8733 - val_loss: 0.6047 - val_acc: 0.7230
Epoch 7/20
 - 2s - loss: 0.2327 - acc: 0.9056 - val_loss: 0.6843 - val_acc: 0.7140
Epoch 8/20
 - 2s - loss: 0.1874 - acc: 0.9259 - val_loss: 0.7428 - val_acc: 0.7210
Epoch 9/20
 - 2s - loss: 0.1513 - acc: 0.9420 - val_loss: 0.8307 - val_acc: 0.7210
Epoch 10/20
 - 2s - loss: 0.1238 - acc: 0.9560 - val_loss: 0.8993 - val_acc: 0.7170
Epoch 11/20
 - 2s - loss: 0.1011 - acc: 0.9633 - val_loss: 1.0221 - val_acc: 0.6980
Epoch 12/20
 - 2s - loss: 0.0840 - ac

<keras.callbacks.History at 0x16449e80748>

In [16]:
# Calling fit again continues from the current weights: the recorded log starts
# at a training loss of about 0.03, so these are 10 additional epochs on top of
# the earlier run, not a fresh training.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32, verbose=2)

Train on 9000 samples, validate on 1000 samples
Epoch 1/10
 - 2s - loss: 0.0315 - acc: 0.9888 - val_loss: 1.6195 - val_acc: 0.7030
Epoch 2/10
 - 2s - loss: 0.0310 - acc: 0.9894 - val_loss: 1.6975 - val_acc: 0.7050
Epoch 3/10
 - 2s - loss: 0.0273 - acc: 0.9889 - val_loss: 1.7493 - val_acc: 0.7010
Epoch 4/10
 - 2s - loss: 0.0261 - acc: 0.9896 - val_loss: 1.7979 - val_acc: 0.6980
Epoch 5/10
 - 2s - loss: 0.0246 - acc: 0.9908 - val_loss: 1.8333 - val_acc: 0.7000
Epoch 6/10
 - 2s - loss: 0.0771 - acc: 0.9769 - val_loss: 1.7950 - val_acc: 0.7090
Epoch 7/10
 - 2s - loss: 0.0535 - acc: 0.9822 - val_loss: 1.7099 - val_acc: 0.6950
Epoch 8/10
 - 2s - loss: 0.0273 - acc: 0.9903 - val_loss: 1.7303 - val_acc: 0.6890
Epoch 9/10
 - 2s - loss: 0.0225 - acc: 0.9916 - val_loss: 1.8074 - val_acc: 0.6940
Epoch 10/10
 - 2s - loss: 0.0208 - acc: 0.9907 - val_loss: 1.7917 - val_acc: 0.7010


<keras.callbacks.History at 0x16449e80908>

In [17]:
# Another 10 epochs on the same, already-trained model. In the recorded log
# validation accuracy stays around 0.69-0.70 while val_loss remains near 1.8-2.1.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32, verbose=2)

Train on 9000 samples, validate on 1000 samples
Epoch 1/10
 - 2s - loss: 0.0215 - acc: 0.9909 - val_loss: 1.8377 - val_acc: 0.6970
Epoch 2/10
 - 2s - loss: 0.0203 - acc: 0.9914 - val_loss: 1.8379 - val_acc: 0.6920
Epoch 3/10
 - 2s - loss: 0.0189 - acc: 0.9919 - val_loss: 1.8692 - val_acc: 0.7040
Epoch 4/10
 - 2s - loss: 0.0197 - acc: 0.9911 - val_loss: 1.8877 - val_acc: 0.6940
Epoch 5/10
 - 2s - loss: 0.0192 - acc: 0.9910 - val_loss: 1.9518 - val_acc: 0.6990
Epoch 6/10
 - 2s - loss: 0.0203 - acc: 0.9914 - val_loss: 1.9074 - val_acc: 0.6940
Epoch 7/10
 - 2s - loss: 0.0191 - acc: 0.9910 - val_loss: 1.9683 - val_acc: 0.6930
Epoch 8/10
 - 2s - loss: 0.0201 - acc: 0.9908 - val_loss: 2.0653 - val_acc: 0.6930
Epoch 9/10
 - 2s - loss: 0.0267 - acc: 0.9898 - val_loss: 1.9942 - val_acc: 0.6920
Epoch 10/10
 - 2s - loss: 0.0457 - acc: 0.9848 - val_loss: 1.9034 - val_acc: 0.6900


<keras.callbacks.History at 0x16440f1eef0>

In [18]:
# Sigmoid outputs of the model for the padded, unlabelled test sequences.
test_output = model.predict(test_data, batch_size=32)

In [19]:
print(test_output.shape)

(100000, 1)


In [20]:
# Binarise the predictions in place: strictly above 0.5 becomes 1.0,
# everything else becomes 0.0.
test_output[:] = np.where(test_output > 0.5, 1.0, 0.0)

In [21]:
# Write the thresholded predictions to disk; test_output has a single column,
# so the file gets one value per line.
np.savetxt('test_small.txt', test_output, delimiter=',')

In [22]:
# Load the file that was just written back into an array so its contents can be inspected.
text = np.loadtxt('test_small.txt')

In [24]:
print(text)

[0. 0. 0. ... 1. 0. 0.]


In [None]:
# Serialize the model architecture to JSON.
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# Serialize the weights to HDF5.
model.save_weights("model.h5")
print("Saved model to disk")

# later...

# Load the JSON and rebuild the model; the context manager closes the file
# even if reading fails.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Load the weights into the new model.
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

# Evaluate the loaded model on the held-out split. The original cell referred
# to X and Y, which are never defined in this notebook; X_test / y_test are the
# only labelled data that were kept out of training.
loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
score = loaded_model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))


In [None]:
# Writes every item of `list12` to test.txt, one per line.
# NOTE(review): `list12` is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel — TODO confirm which sequence of
# strings was meant to be written (the items must be str for `line+"\n"`).
with open ("test.txt","w")as fp:
   for line in list12:
       fp.write(line+"\n")