### Importing the necessary packages

In [1]:
import numpy as np 
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.cross_validation import train_test_split
from keras.layers import LSTM
seed = 42
np.random.seed(seed)
import collections
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
def data_check(file_name, train=True):
    file = open(file_name,'r')
    train_dict = dict()
    if train:
        label = []
        no_of_data = [] # for creating the dictionary, gather all data
        train_x = [] #
        for line in file:
            lx = line.split('\t')
            ly = lx[1].rstrip().split(',')
            label.append(lx[0]) # labels appending
            train_x.append(ly) # used for numbering array
            no_of_data.extend(ly) # used for counting unique numbers
        
        #for calculating the line length
        max_len = max([len(i) for i in train_x])
        min_len = min([len(i) for i in train_x])
        mean_len = np.mean([len(i) for i in train_x])
        std_len = np.std([len(i) for i in train_x])
        
        #print('list of lengths of sentences: ',[len(i) for i in train_x])
        length = [len(i) for i in train_x]
        plt.boxplot(length)
        print('max sentence length: ', max_len)
        print('min sentence length: ', min_len)
        print('mean:', mean_len)
        print('std:', std_len) 
        
        # for checking the frequency and unique words
        di = collections.Counter(no_of_data)
        print('unique words:',len(di.keys()))

        file.close()
        
        return di
    
    else:
        label = []
        no_of_data = [] # for creating the dictionary, gather all data
        train_x = [] #
        for line in file:
            #lx = line.split('\t')
            ly = line.rstrip().split(',')
            #label.append(lx[0]) # labels appending
            train_x.append(ly) # used for numbering array
            no_of_data.extend(ly) # used for counting unique numbers
        
        #for calculating the line length
        max_len = max([len(i) for i in train_x])
        min_len = min([len(i) for i in train_x])
        mean_len = np.mean([len(i) for i in train_x])
        std_len = np.std([len(i) for i in train_x])
        
        #print('list of lengths of sentences: ',[len(i) for i in train_x])
        print('max sentence length: ', max_len)
        print('min sentence length: ', min_len)
        print('mean:', mean_len)
        print('std:', std_len) 
        
        # for checking the frequency
        di = collections.Counter(no_of_data)
        print('unique words:',len(di.keys()))

        file.close()
        
    return di

In [3]:
di_train = data_check('training-data-large.txt')

max sentence length:  5042
min sentence length:  2
mean: 71.308072
std: 78.98943759543307
unique words: 65464


In [4]:
di_test = data_check('test-data-large.txt', train=False)

max sentence length:  2014
min sentence length:  2
mean: 80.15714
std: 80.37204978735083
unique words: 29972


In [5]:
print(di_train)

Counter({'Z4': 770363, 'Z15': 661562, 'X344420902': 539437, 'Y3143': 385711, 'Y1222': 358409, 'Y1940': 350276, 'Y2817': 345873, 'Y1555': 339130, 'Y2321': 326496, 'Y1522': 312553, 'X123474229': 305840, 'Y2307': 287402, 'Y1478': 273975, 'Y1080': 272582, 'Y2170': 268575, 'Y1132': 267734, 'Y3269': 262662, 'Y1702': 261746, 'X773579': 256575, 'Y1486': 256548, 'Y1705': 249640, 'Y2949': 245349, 'Y2680': 241446, 'Y1725': 238939, 'Y2793': 232856, 'Y3232': 232779, 'Y3021': 231224, 'Y2814': 220307, 'Y2072': 218860, 'Y3145': 216411, 'Y1468': 215733, 'Y2024': 215405, 'Y2667': 214361, 'Y1690': 213021, 'Y2640': 212897, 'X127436348': 212844, 'Y3346': 212840, 'Y1791': 207816, 'Y1353': 205573, 'Y3585': 205102, 'Y1238': 202494, 'Y2267': 202079, 'Y977': 201865, 'Y2257': 200815, 'Y2849': 198633, 'Y1456': 197377, 'Y2684': 196151, 'Y3233': 196096, 'X62716661': 196005, 'Y3250': 195687, 'Y2794': 195295, 'Y2': 193731, 'Y3141': 193011, 'Y1367': 192578, 'Y2557': 192177, 'Y1206': 192057, 'Y2641': 191909, 'Y1533': 1

In [6]:
count = 0
for k, v in di_train.items():
    if di_train[k] > 20:
        count+=1
print('top words to keep: ', count)

top words to keep:  12604


In [7]:
def data_process(file_name, train=True, top_words=99):
    file = open(file_name,'r')
    if train:
        label = []
        no_of_data = [] # for creating the dictionary, gather all data
        train_x = [] #
        for line in file:
            lx = line.split('\t')
            ly = lx[1].rstrip().split(',')
            label.append(lx[0]) # labels appending
            train_x.append(ly) # used for numbering array
            no_of_data.extend(ly) # used for counting unique numbers
        
        #for calculating the line length
        max_len = max([len(i) for i in train_x])
        min_len = min([len(i) for i in train_x])
        mean_len = np.mean([len(i) for i in train_x])
        std_len = np.std([len(i) for i in train_x])

        print('max sentence length: ', max_len)
        print('min sentence length: ', min_len)
        print('mean:', mean_len)
        print('std:', std_len)

        # creating label array of shape(number,) 1-D row
        train_label = np.asarray(label).reshape(-1,)
        print('label array shape:', train_label.shape)
        
        # for checking the frequency
        di = collections.Counter(no_of_data)
        #print('di', di)
        
        
        ddd = dict(di)
        dddd = sorted([[value,key] for (key,value) in ddd.items()], reverse=True)
        
        # starting most frequent word with 3
        for i in range(len(dddd)):
            dddd[i][0] = i+3

        d = dict(dddd)

        res = {v:k for k,v in d.items()}
        
        # here we are assigning dictionary itens from '3', 0 is for padding, 1 is for start, 2 is for except top_words
        for i,j in res.items():
            if res[i]>top_words+3:
                res[i] = 2
        file.close()
        
        train_data = np.asarray(train_x).reshape(-1,)
        for index,i in enumerate(train_data):
            for pos,k in enumerate(i):
                train_data[index][pos] = res[train_data[index][pos]]#train_dict[train_data[index][pos]]
            train_data[index].insert(0,1)
            
        return (train_data, train_label, res)

def data_preprocess_test(filename, label_dictionary):
    file = open(filename,'r')
    test_x = [] #
    for line in file:
        #lx = line.split('\t')
        ly = line.rstrip().split(',')
        test_x.append(ly) # used for labeling
            
    max_len = max([len(i) for i in test_x])
    min_len = min([len(i) for i in test_x])
    mean_len = np.mean([len(i) for i in test_x])
    std_len = np.std([len(i) for i in test_x])

    print('max sentence length: ',max_len)
    print('min sentence length: ', min_len)
    print('mean:', mean_len)
    print('std:', std_len)

    file.close()
    test_data = np.asarray(test_x).reshape(-1,)
    for index,i in enumerate(test_data):
        for pos,k in enumerate(i):
            test_data[index][pos] = label_dictionary.get(test_data[index][pos],0) 
        test_data[index] = [j for j in i if j!=0]
        test_data[index].insert(0,1)   
    return test_data

In [8]:
data, target, dictionary = data_process('training-data-large.txt', top_words=10000)
print(data)

max sentence length:  5042
min sentence length:  2
mean: 71.308072
std: 78.98943759543307
label array shape: (1000000,)
[list([1, 1791, 1790, 2715, 2779, 581])
 list([1, 1776, 1777, 1775, 2729, 579])
 list([1, 109, 540, 817, 5, 415, 362, 560, 68, 191, 3, 409]) ...
 list([1, 73, 5979, 7530, 109, 312, 192, 667, 5, 314, 68, 309, 2, 736, 218, 98, 83, 7, 43, 32, 235, 274, 78, 161, 15, 159, 12, 178, 39, 33, 30, 238, 10, 325, 23, 49, 26, 168, 369, 36, 164, 29, 262, 191, 8, 3, 4])
 list([1, 1800, 1799, 2782, 579])
 list([1, 21, 1652, 270, 456, 5, 3534, 87, 405, 472, 156, 106, 6, 388, 4422, 57, 381, 39, 685, 30, 27, 72, 25, 35, 37, 42, 24, 164, 293, 397, 322, 9, 3, 561, 357])]


In [9]:
test_data = data_preprocess_test('test-data-large.txt', dictionary)

max sentence length:  2014
min sentence length:  2
mean: 80.15714
std: 80.37204978735083


In [10]:
print(test_data)

[list([1, 21, 19, 87, 1274, 372, 206, 220, 76, 44, 46, 154, 52, 237, 634, 840, 28, 735, 120, 705, 249, 295, 31, 32, 225, 55, 6, 199, 64, 105, 62, 163, 246, 1439, 66, 364, 17, 81, 261, 500, 638, 724, 388, 264, 274, 517, 92, 161, 57, 138, 2, 14, 366, 353, 505, 112, 708, 11, 421, 251, 223, 1122, 178, 74, 175, 228, 39, 75, 533, 1009, 30, 53, 27, 72, 94, 439, 135, 575, 618, 347, 25, 338, 121, 49, 35, 37, 508, 102, 369, 336, 42, 125, 24, 29, 262, 141, 47, 1336, 191, 117, 250, 131, 101, 233, 34, 322, 9, 174, 165, 107, 849, 669, 1308, 3, 4, 689])
 list([1, 71, 21, 487, 38, 93, 843, 19, 87, 18, 2450, 79, 83, 80, 100, 183, 769, 50, 16, 28, 401, 32, 254, 6, 143, 122, 246, 17, 58, 2709, 2, 139, 196, 1271, 65, 201, 92, 15, 931, 60, 74, 888, 228, 39, 124, 96, 129, 553, 477, 449, 654, 1885, 426, 905, 1638, 454, 238, 140, 493, 523, 985, 10, 181, 613, 446, 263, 23, 848, 59, 42, 125, 24, 164, 116, 981, 265, 162, 3, 4, 528])
 list([1, 5, 128, 279, 319, 13, 375, 137, 68, 627, 93, 87, 18, 202, 169, 327, 20

In [11]:
max_len = 150 # maximum sequence length
data = sequence.pad_sequences(data, maxlen=max_len)
test_data = sequence.pad_sequences(test_data, maxlen=max_len)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1, random_state=42)

In [13]:
print(X_train.shape, X_test.shape)

(900000, 150) (100000, 150)


In [14]:
# create the model
top_words = 10000+4 # we created the words with '0' to '118+3'
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_len))
model.add(Flatten())

#model.add(LSTM(128))
#model.add(Dropout(0.2))

model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 32)           320128    
_________________________________________________________________
flatten_1 (Flatten)          (None, 4800)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               1200250   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
Total params: 1,520,629
Trainable params: 1,520,629
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
# Fit the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=32, verbose=2)


Train on 900000 samples, validate on 100000 samples
Epoch 1/2
 - 554s - loss: 0.2485 - acc: 0.8951 - val_loss: 0.2333 - val_acc: 0.9032
Epoch 2/2
 - 558s - loss: 0.2277 - acc: 0.9052 - val_loss: 0.2445 - val_acc: 0.8966


<keras.callbacks.History at 0x291ca2fe828>

In [16]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32, verbose=2)

Train on 900000 samples, validate on 100000 samples
Epoch 1/10
 - 542s - loss: 0.2124 - acc: 0.9113 - val_loss: 0.2431 - val_acc: 0.9015
Epoch 2/10
 - 553s - loss: 0.1919 - acc: 0.9194 - val_loss: 0.2671 - val_acc: 0.8982
Epoch 3/10
 - 564s - loss: 0.1698 - acc: 0.9285 - val_loss: 0.2836 - val_acc: 0.8951
Epoch 4/10
 - 557s - loss: 0.1508 - acc: 0.9360 - val_loss: 0.3311 - val_acc: 0.8919
Epoch 5/10
 - 560s - loss: 0.1361 - acc: 0.9423 - val_loss: 0.3882 - val_acc: 0.8850
Epoch 6/10
 - 556s - loss: 0.1242 - acc: 0.9472 - val_loss: 0.4217 - val_acc: 0.8866
Epoch 7/10
 - 580s - loss: 0.1148 - acc: 0.9513 - val_loss: 0.4476 - val_acc: 0.8824
Epoch 8/10
 - 557s - loss: 0.1072 - acc: 0.9543 - val_loss: 0.4762 - val_acc: 0.8824
Epoch 9/10
 - 557s - loss: 0.1009 - acc: 0.9572 - val_loss: 0.5136 - val_acc: 0.8812
Epoch 10/10
 - 557s - loss: 0.0952 - acc: 0.9595 - val_loss: 0.5526 - val_acc: 0.8760


<keras.callbacks.History at 0x291ca2fe7f0>

In [17]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32, verbose=2)

Train on 900000 samples, validate on 100000 samples
Epoch 1/10
 - 562s - loss: 0.0910 - acc: 0.9617 - val_loss: 0.5674 - val_acc: 0.8820
Epoch 2/10
 - 562s - loss: 0.0871 - acc: 0.9633 - val_loss: 0.5970 - val_acc: 0.8785
Epoch 3/10
 - 557s - loss: 0.0833 - acc: 0.9649 - val_loss: 0.6313 - val_acc: 0.8770
Epoch 4/10
 - 557s - loss: 0.0794 - acc: 0.9663 - val_loss: 0.6676 - val_acc: 0.8758
Epoch 5/10
 - 565s - loss: 0.0769 - acc: 0.9674 - val_loss: 0.6803 - val_acc: 0.8768
Epoch 6/10
 - 558s - loss: 0.0742 - acc: 0.9687 - val_loss: 0.7038 - val_acc: 0.8761
Epoch 7/10
 - 553s - loss: 0.0720 - acc: 0.9696 - val_loss: 0.7105 - val_acc: 0.8758
Epoch 8/10
 - 564s - loss: 0.0697 - acc: 0.9706 - val_loss: 0.7209 - val_acc: 0.8758
Epoch 9/10
 - 562s - loss: 0.0673 - acc: 0.9715 - val_loss: 0.7487 - val_acc: 0.8762
Epoch 10/10
 - 559s - loss: 0.0654 - acc: 0.9725 - val_loss: 0.7823 - val_acc: 0.8757


<keras.callbacks.History at 0x2910db03208>

In [18]:
test_output = model.predict(test_data, batch_size=32)

In [19]:
print(test_output.shape)

(100000, 1)


In [20]:
for i in range(test_output.shape[0]):
    if test_output[i] > 0.5:
        test_output[i] = 1.0
    else:
        test_output[i] = 0.0

In [21]:
np.savetxt('test_large.txt', test_output, delimiter=',')

In [None]:
text = np.loadtxt('test_small.txt')

In [None]:
print(text)

In [None]:
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")
 
# later...
 
# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
score = loaded_model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))


In [None]:
with open ("test.txt","w")as fp:
   for line in list12:
       fp.write(line+"\n")