In [68]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dropout
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [69]:
dataset = pd.read_csv('news/train.csv')
dataset = dataset.dropna()
dataset.reset_index(inplace=True)

In [70]:
X = dataset.iloc[:,0:4]
y = dataset.iloc[:,5]

In [71]:
X

Unnamed: 0,index,id,title,author
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy
...,...,...,...,...
18280,20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson
18281,20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman
18282,20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams
18283,20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary


In [72]:
y

0        1
1        0
2        1
3        1
4        1
        ..
18280    0
18281    0
18282    0
18283    1
18284    1
Name: label, Length: 18285, dtype: int64

In [73]:
lemmatizer = WordNetLemmatizer()
corpus = []

for i in range(0, len(X)):
    
    review = re.sub('[^a-zA-Z]', ' ', X['title'][i]).lower().split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [74]:
voc_size = 5000
onehot = [one_hot(words,voc_size)for words in corpus]

In [75]:
sent_length = 20
embedded_docs = pad_sequences(onehot,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 1497  544 4353]
 [   0    0    0 ... 1052  189 3694]
 [   0    0    0 ... 1789 1134 3204]
 ...
 [   0    0    0 ... 1469 2214 1688]
 [   0    0    0 ... 3384 2045 4457]
 [   0    0    0 ... 1705 3895  377]]


In [76]:
X = np.array(embedded_docs)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [77]:
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 20, 40)            200000    
                                                                 
 dropout_6 (Dropout)         (None, 20, 40)            0         
                                                                 
 lstm_5 (LSTM)               (None, 100)               56400     
                                                                 
 dropout_7 (Dropout)         (None, 100)               0         
                                                                 
 dense_5 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [78]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x173c300a970>

In [79]:
y_pred = np.argmax(model.predict(X_test), axis=1)



In [80]:
accuracy_score(y_test,y_pred)

0.5665285832642917

In [81]:
confusion_matrix(y_test,y_pred)

array([[3419,    0],
       [2616,    0]], dtype=int64)