## News Classification using Neural Network


### Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import simplejson as json

### Reading Data 

In [2]:
data_json = []
for line in open('/home/rohan/CMI/SEM_2/DMML/Assignment_3/Data/News-Classification-DataSet.json', 'r'):
    data_json.append(json.loads(line))

In [3]:
data_json[1]['annotation']['label']
data_json[1]['content']

['SciTech']

'SPACE.com - TORONTO, Canada -- A second\\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\\privately funded suborbital space flight, has officially announced the first\\launch date for its manned rocket.'

## Preprocessing
### Making two lists- content & label

In [4]:
import re
p=re.compile(r'[a-zA-Z]+')
content=[]
label=[]
for i in range(len(data_json)):
    content.append(data_json[i]['content'])
    label.append(p.findall(str(data_json[i]['annotation']['label']))[0])

In [5]:
len(content)
len(label)
content[0]
label[0]

7600

7600

"Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul."

'Business'

### Creating a dataframe to get a look on the data

In [6]:
df=pd.DataFrame(list(zip(content,label)),
              columns=['content','label'])
df.tail()

Unnamed: 0,content,label
7595,Ukrainian presidential candidate Viktor Yushch...,World
7596,With the supply of attractive pitching options...,Sports
7597,Like Roger Clemens did almost exactly eight ye...,Sports
7598,SINGAPORE : Doctors in the United States have ...,Business
7599,EBay plans to buy the apartment and home renta...,Business


### To remove stopwords- Creating a list of stopwords

In [7]:
import nltk
#nltk.download('punkt')
from nltk.corpus import stopwords
s=stopwords.words('english')
add=[",","'",".","-","--","(",")",'``','\\\\',"''",";","#","..."]
for i in add:
    s.append(i)
stopset=set(s)    

In [8]:
from keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts(content)
# print(t.word_counts)
# print(t.document_count)
# print(t.word_index)
# print(t.word_docs)

Using TensorFlow backend.


### Removing stopwords

In [9]:
stopset = set(s)
for i in stopset:
    if i in t.word_index:
        t.word_index.pop(i)


406

17095

392

526

18421

1371

19

587

10247

184

18

134

31

9334

47

14892

64

396

6

44

1158

155

2254

1048

46

686

29

357

4675

201

4507

2702

148

408

15858

2537

23

65

5069

337

11

7100

1

294

175

11490

12

10313

2660

15

1262

2

3628

4321

277

15546

8

13

20

6974

1731

22

312

67

9229

17

173

2327

631

40

142

825

34

681

41

251

2276

2805

9200

5

4301

249

784

27

963

7

232

1135

477

292

168

14063

311

36

77

10588

266

710

7736

26

282

604

158

1581

3137

1210

550

1055

433

4

159

723

93

1848

1667

16194

497

12810

206

7939

10

2371

835

837

3111

514

2559

9336

193

3210

9014

665

926

56

758

57

101

63

360

1608

112

54

24

82

32

259

104

14

7077

12323

4991

66

3

919

156

120

28

59

71

179

1764

113

154

73

### Encoding the content using tokenizer
### Vocabulary of 20421 words

In [10]:
#integer encode documents
encoded_content = t.texts_to_matrix(content)
print(encoded_content)
# print(encoded_content[1])
len(encoded_content[1])
encoded_content[100] # an example

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


20421

array([0., 0., 0., ..., 0., 0., 0.])

### An example 

In [11]:
#printing position where the words of the first content lies in vocabulary
position=[]
for i in range(len(encoded_content[0])):
    if encoded_content[0][i]==1.0:
        position.append(i)
print(position)        

[161, 191, 223, 362, 434, 1606, 2338, 2871, 4190, 5439, 8130, 11247, 11248]


In [12]:
print("Shape of X is :" ,len(label))

Shape of X is : 7600


### Label encoding of target class

In [13]:
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()

label_code = lb_make.fit_transform(label)

In [14]:
set(label_code)

{0, 1, 2, 3}

### Train-Test Split

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(encoded_content,label_code, random_state=17)


In [16]:
len(label)
len(X_train)
len(X_test)

7600

5700

1900

### One hot encoding of target class

In [17]:
def to_one_hot(labels, dimension=4):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results

# Our vectorized training labels
one_hot_train_labels = to_one_hot(y_train)
# Our vectorized test labels
one_hot_test_labels = to_one_hot(y_test)

#### Fitting models with different no of layers with different no of nodes and different activation functions

#### Traing Data : X_train , one_hot_train_labels(Target)
#### Test Data : X_test , one_hot_test_labels(Target)  

## Model 1

### k-fold cross-validation
#### Here k=10

In [18]:
from keras import models
from keras import layers
def model1():
    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(20421,)))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(4, activation='softmax'))
    model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    return model


In [19]:
from sklearn.model_selection import cross_val_score
from keras import models
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
neural_network = KerasClassifier(build_fn=model1, 
                                 epochs=10, 
                                 batch_size=50, 
                                 verbose=0)
cv_score_1 = cross_val_score(neural_network,X_train,one_hot_train_labels, cv=10)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [20]:
cv_score_1

array([0.84736841, 0.84561402, 0.85789474, 0.86666666, 0.85438597,
       0.85614034, 0.84561403, 0.8368421 , 0.85614033, 0.84736842])

In [21]:
from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(20421,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(4, activation='softmax'))

In [22]:
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [23]:
neural_1 =  model.fit(X_train,one_hot_train_labels,epochs=10,batch_size=50)
results1 = model.evaluate(X_test,one_hot_test_labels)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
results1

[0.6732649003028085, 0.8531578947368421]

In [25]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [26]:
result1 = model.predict(X_test)

pred_label_1 = []
for array in result1:
    pred_label_1.append(np.argmax(array))

In [27]:
print(result1)

[[2.3522509e-02 9.6115917e-01 1.1349554e-03 1.4183379e-02]
 [1.7779470e-05 9.9454135e-01 1.3863995e-04 5.3022448e-03]
 [3.2793771e-05 7.6516527e-01 3.8267329e-04 2.3441927e-01]
 ...
 [8.6307931e-01 1.3454072e-01 8.9345273e-04 1.4865154e-03]
 [4.3025960e-12 1.9771926e-09 1.8883786e-09 1.0000000e+00]
 [1.2215490e-04 9.9867803e-01 3.4988503e-04 8.4990537e-04]]


In [29]:
loss_model_1, acc_model_1 = model.evaluate(X_test, one_hot_test_labels)



In [30]:
prec_model_1 = precision_score(y_test,pred_label_1, average=None)[0]
f1_model_1 = f1_score(list(y_test), pred_label_1, average = None)[0]

In [31]:
print("Accuracy of model 1 : ", acc_model_1)
print("Precision of model 1 : ", prec_model_1)
print("F1 score of model 1  : ", f1_model_1)

Accuracy of model 1 :  0.8531578947368421
Precision of model 1 :  0.8121827411167513
F1 score of model 1  :  0.7872078720787207


## Model 2

### k-fold cross-validation 
#### Here k=10

In [32]:
from keras import models
from keras import layers
def model2():
    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(20421,)))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(4, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    return model

In [33]:
neural_network = KerasClassifier(build_fn=model2, 
                                 epochs=10, 
                                 batch_size=512, 
                                 verbose=0)
cv_score_2 = cross_val_score(neural_network,X_train,one_hot_train_labels, cv=10)

In [34]:
model2 = models.Sequential()
model2.add(layers.Dense(16, activation='relu', input_shape=(20421,)))
model2.add(layers.Dense(16, activation='relu'))
model2.add(layers.Dense(4, activation='sigmoid'))

In [35]:
model2.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [36]:
neural_2 = model2.fit(X_train,one_hot_train_labels,epochs=10,batch_size=512)
results2 = model2.evaluate(X_test,one_hot_test_labels)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
results2

[0.5445045305553236, 0.8605263157894737]

In [38]:
result2 = model2.predict(X_test)

pred_label_2 = []
for array in result2:
    pred_label_2.append(np.argmax(array))

In [39]:
loss_model_2, acc_model_2 = model2.evaluate(X_test, one_hot_test_labels)

prec_model_2 = precision_score(y_test,pred_label_2, average=None)[0]
f1_model_2 = f1_score(list(y_test), pred_label_2, average = None)[0]



In [40]:
print("Accuracy of model 2 : ", acc_model_2)
print("Precision of model 2 : ", prec_model_2)
print("F1 score of model 2  : ", f1_model_2)

Accuracy of model 2 :  0.8605263157894737
Precision of model 2 :  0.8036951501154734
F1 score of model 2  :  0.8169014084507042


## Model 3

In [41]:
def model3():
    model = models.Sequential()
    model.add(layers.Dense(40, activation='relu', input_shape=(20421,)))
    model.add(layers.Dense(40, activation='relu'))
    model.add(layers.Dense(4, activation='softmax'))
    model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    return model


In [42]:
neural_network = KerasClassifier(build_fn=model3, 
                                 epochs=10, 
                                 batch_size=50, 
                                 verbose=0)
cv_score_3 = cross_val_score(neural_network,X_train,one_hot_train_labels, cv=10)

In [43]:
model3 = models.Sequential()
model3.add(layers.Dense(40, activation='relu', input_shape=(20421,)))
model3.add(layers.Dense(40, activation='relu'))
model3.add(layers.Dense(4, activation='softmax'))

In [44]:
model3.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [91]:
neural_3 = model3.fit(X_train,one_hot_train_labels,epochs=10,batch_size=50)
results3=model3.evaluate(X_test,one_hot_test_labels)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [92]:
results3

[0.5588376650547511, 0.9173684210526316]

In [93]:
result3 = model3.predict(X_test)

pred_label_3 = []
for array in result3:
    pred_label_3.append(np.argmax(array))

In [94]:
loss_model_3, acc_model_3 = model3.evaluate(X_test, one_hot_test_labels)

prec_model_3 = precision_score(y_test,pred_label_3, average=None)[0]
f1_model_3 = f1_score(list(y_test), pred_label_3, average = None)[0]



In [95]:
print("Accuracy of model 3 : ", acc_model_3)
print("Precision of model 3 : ", prec_model_3)
print("F1 score of model 3  : ", f1_model_3)

Accuracy of model 3 :  0.9173684210526316
Precision of model 3 :  0.7712895377128953
F1 score of model 3  :  0.7638554216867469


### Model 4

In [50]:
def model4():
    model = models.Sequential()
    model.add(layers.Dense(40, activation='relu', input_shape=(20421,)))
    model.add(layers.Dense(4, activation='softmax'))
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

In [51]:
neural_network = KerasClassifier(build_fn=model4, 
                                 epochs=10, 
                                 batch_size=50, 
                                 verbose=0)
cv_score_4 = cross_val_score(neural_network,X_train,one_hot_train_labels, cv=10)

In [52]:
model4 = models.Sequential()
model4.add(layers.Dense(40, activation='relu', input_shape=(20421,)))
model4.add(layers.Dense(4, activation='softmax'))

In [53]:
model4.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [54]:
neural_4 = model4.fit(X_train,one_hot_train_labels,epochs=10,batch_size=50)
results4=model4.evaluate(X_test,one_hot_test_labels)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd5e8316ef0>



In [55]:
results4

[0.26806785332627203, 0.9285526315789474]

In [56]:
result4 = model4.predict(X_test)

pred_label_4 = []
for array in result4:
    pred_label_4.append(np.argmax(array))

In [57]:
loss_model_4, acc_model_4 = model4.evaluate(X_test, one_hot_test_labels)

prec_model_4 = precision_score(y_test,pred_label_4, average=None)[0]
f1_model_4 = f1_score(list(y_test), pred_label_4, average = None)[0]



In [58]:
print("Accuracy of model 4 : ", acc_model_4)
print("Precision of model 4 : ", prec_model_4)
print("F1 score of model 4  : ", f1_model_4)

Accuracy of model 4 :  0.9285526315789474
Precision of model 4 :  0.7752293577981652
F1 score of model 4  :  0.7906432748538011


### Model 5

In [59]:
def model5():
    model = models.Sequential()
    model.add(layers.Dense(40, activation='relu', input_shape=(20421,)))
    model.add(layers.Dense(4, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

In [60]:
neural_network = KerasClassifier(build_fn=model5, 
                                 epochs=10, 
                                 batch_size=50, 
                                 verbose=0)
cv_score_5 = cross_val_score(neural_network,X_train,one_hot_train_labels, cv=10)

  % delta_t_median)
  % delta_t_median)
  % delta_t_median)
  % delta_t_median)


In [61]:
model5 = models.Sequential()
model5.add(layers.Dense(40, activation='relu', input_shape=(20421,)))
model5.add(layers.Dense(4, activation='sigmoid'))

In [62]:
model5.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [63]:
model5.fit(X_train,one_hot_train_labels,epochs=10,batch_size=50)
results5=model5.evaluate(X_test,one_hot_test_labels)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd67d1860b8>



In [64]:
results5

[0.2380709560137046, 0.9301315789473684]

In [65]:
result5 = model5.predict(X_test)

pred_label_5 = []
for array in result5:
    pred_label_5.append(np.argmax(array))

In [66]:
loss_model_5, acc_model_5 = model5.evaluate(X_test, one_hot_test_labels)

prec_model_5 = precision_score(y_test,pred_label_5, average=None)[0]
f1_model_5 = f1_score(list(y_test), pred_label_5, average = None)[0]



In [67]:
print("Accuracy of model 5 : ", acc_model_5)
print("Precision of model 5 : ", prec_model_5)
print("F1 score of model 5  : ", f1_model_5)

Accuracy of model 5 :  0.9301315789473684
Precision of model 5 :  0.7837837837837838
F1 score of model 5  :  0.8064889918887602


## Model 6

In [76]:
from keras import optimizers
from keras import losses
from keras.wrappers.scikit_learn import KerasClassifier

In [77]:
def model6():
    model = models.Sequential()
    model.add(layers.Dense(40, activation='relu', input_shape=(20421,)))
    model.add(layers.Dense(4, activation='softmax'))
    
    model.compile(optimizer=optimizers.RMSprop(lr=0.01), loss=losses.binary_crossentropy, metrics = ["acc"] )
    return model

In [78]:
neural_network = KerasClassifier(build_fn=model6, 
                                 epochs=10, 
                                 batch_size=50, 
                                 verbose=0)
cv_score_6 = cross_val_score(neural_network,X_train,one_hot_train_labels, cv=10)

  % delta_t_median)


In [79]:
model_6 = model6()
neural_6 = model_6.fit(X_train,one_hot_train_labels,epochs=10,batch_size=50)
results6 = model_6.evaluate(X_test,one_hot_test_labels)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [80]:
results6

[0.6236510901819718, 0.9180263157894737]

In [81]:
result6 = model_6.predict(X_test)

pred_label_6 = []
for array in result6:
    pred_label_6.append(np.argmax(array))

In [82]:
loss_model_6, acc_model_6 = model_6.evaluate(X_test, one_hot_test_labels)

prec_model_6 = precision_score(y_test,pred_label_6, average=None)[0]
f1_model_6 = f1_score(list(y_test), pred_label_6, average = None)[0]



In [83]:
print("Accuracy of model 6 : ", acc_model_6)
print("Precision of model 6 : ", prec_model_6)
print("F1 score of model 6  : ", f1_model_6)

Accuracy of model 6 :  0.9180263157894737
Precision of model 6 :  0.7822784810126582
F1 score of model 6  :  0.7592137592137592


### Model 7

In [68]:
def model7():
    model = models.Sequential()
    model.add(layers.Dense(32, activation='sigmoid', input_shape=(20421,)))
    model.add(layers.Dense(32, activation='sigmoid'))
    model.add(layers.Dense(4, activation='softmax'))
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

In [69]:
neural_network = KerasClassifier(build_fn=model7, 
                                 epochs=10, 
                                 batch_size=50, 
                                 verbose=0)
cv_score_7 = cross_val_score(neural_network,X_train,one_hot_train_labels, cv=10)

In [70]:
model7 = models.Sequential()
model7.add(layers.Dense(32, activation='sigmoid', input_shape=(20421,)))
model7.add(layers.Dense(32, activation='sigmoid'))
model7.add(layers.Dense(4, activation='softmax'))

In [71]:
model7.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [72]:
model7.fit(X_train,one_hot_train_labels,epochs=10,batch_size=50)
results7=model7.evaluate(X_test,one_hot_test_labels)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd5e2ef7320>



In [73]:
result7 = model7.predict(X_test)

pred_label_7 = []
for array in result7:
    pred_label_7.append(np.argmax(array))

In [74]:
loss_model_7, acc_model_7 = model7.evaluate(X_test, one_hot_test_labels)

prec_model_7 = precision_score(y_test,pred_label_7, average=None)[0]
f1_model_7 = f1_score(list(y_test), pred_label_7, average = None)[0]



In [75]:
print("Accuracy of model 7 : ", acc_model_7)
print("Precision of model 7 : ", prec_model_7)
print("F1 score of model 7  : ", f1_model_7)

Accuracy of model 7 :  0.8721052631578947
Precision of model 7 :  0.7982062780269058
F1 score of model 7  :  0.823121387283237


## Saving Output in a file

In [112]:
f = open("Output","w")

In [113]:
for i in range(1,8):


    cv_score = vars()["cv_score_" + str(i)]
    acc_model = vars()["acc_model_" + str(i)]
    prec_model = vars()["prec_model_" + str(i)]
    f1_model = vars()["f1_model_" + str(i)]


    print("\t\t MODEL",i, file = f)
    print("\n\n", file = f)

    print("Ten fold cross validation of model {} is : ".format(i), list(map(lambda x: round(x * 100,2), cv_score)), file = f)
    print("\n", file = f)

    print("Accuracy is : ", acc_model, file = f)
    print("Precision is : ", prec_model, file = f)
    print("F1 score is : ", f1_model, file = f)
    print("\n\n", file = f)


In [114]:
f.close()