In [35]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [29]:
dataset = pd.read_csv('/content/Dataset.csv')
dataset = dataset.set_index('id', drop = True)

print(dataset.shape)
dataset.head()

(1146, 3)


Unnamed: 0_level_0,title,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Avrupa Birliği Şam'daki temsilciliğini yeniden...,Kallas; Avrupa Parlamentosu Genel Kurulunda dü...,0
1,İsrail'in saldırıları nedeniyle Filistinliler ...,Görgü tanıklarından alınan bilgiye göre; İsrai...,0
2,Von der Leyen'den Suriye mesajı: Türkiye'nin b...,Von der Leyen; Cumhurbaşkanı Recep Tayyip Erdo...,0
3,Dışişleri'nden İsrail'e Gazze tepkisi: Soykırı...,Dışişleri Bakanlığı'ndan yapılan yazılı açıkla...,0
4,Güney Kore'de; başkanlık bilgisayarlarına eriş...,​​Yonhap'ta çıkan haberde; polis ve Yolsuzluk ...,0


In [30]:
print('missing values counts\n', dataset.isnull().sum())

length = []
[length.append(len(str(text))) for text in dataset['text']]
dataset['length'] = length
print('data length\n', dataset.head())

print('min data length', min(dataset['length']), ', max data length', max(dataset['length']), ', average data length', round(sum(dataset['length'])/len(dataset['length'])))

print('count of less then 50 character', len(dataset[dataset['length'] < 50]))

# dropping the outliers
dataset = dataset.drop(dataset['text'][dataset['length'] < 50].index, axis = 0)
print('min data length', min(dataset['length']), ', max data length', max(dataset['length']), ', average data length', round(sum(dataset['length'])/len(dataset['length'])))
print(dataset.shape)

missing values counts
 title      0
text     330
label      0
dtype: int64
data length
                                                 title  \
id                                                      
0   Avrupa Birliği Şam'daki temsilciliğini yeniden...   
1   İsrail'in saldırıları nedeniyle Filistinliler ...   
2   Von der Leyen'den Suriye mesajı: Türkiye'nin b...   
3   Dışişleri'nden İsrail'e Gazze tepkisi: Soykırı...   
4   Güney Kore'de; başkanlık bilgisayarlarına eriş...   

                                                 text  label  length  
id                                                                    
0   Kallas; Avrupa Parlamentosu Genel Kurulunda dü...      0     946  
1   Görgü tanıklarından alınan bilgiye göre; İsrai...      0     971  
2   Von der Leyen; Cumhurbaşkanı Recep Tayyip Erdo...      0    6321  
3   Dışişleri Bakanlığı'ndan yapılan yazılı açıkla...      0    1128  
4   ​​Yonhap'ta çıkan haberde; polis ve Yolsuzluk ...      0    2225  
min data length

In [31]:
max_features = 2500

# Tokenizing the text - converting the words, letters into counts or numbers.
# We dont need to explicitly remove the punctuations. we have an inbuilt option in Tokenizer for this purpose
tokenizer = Tokenizer(num_words = max_features, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = True, split = ' ')
tokenizer.fit_on_texts(texts = dataset['text'])
X = tokenizer.texts_to_sequences(texts = dataset['text'])

# now applying padding to make them even shaped.
X = pad_sequences(sequences = X, maxlen = max_features, padding = 'pre')

print('X shape', X.shape)
y = dataset['label'].values
print('Y shape', y.shape)

# splitting the data training data for training and validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

X shape (816, 2500)
Y shape (816,)


In [32]:
# LSTM Neural Network
lstm_model = Sequential(name = 'lstm_nn_model')
lstm_model.add(layer = Embedding(input_dim = max_features, output_dim = 120, name = '1st_layer'))
lstm_model.add(layer = LSTM(units = 120, dropout = 0.2, recurrent_dropout = 0.2, name = '2nd_layer'))
lstm_model.add(layer = Dropout(rate = 0.5, name = '3rd_layer'))
lstm_model.add(layer = Dense(units = 120,  activation = 'relu', name = '4th_layer'))
lstm_model.add(layer = Dropout(rate = 0.5, name = '5th_layer'))
lstm_model.add(layer = Dense(units = len(set(y)),  activation = 'sigmoid', name = 'output_layer'))
# compiling the model
lstm_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

lstm_model_fit = lstm_model.fit(X_train, y_train, epochs = 10)

Epoch 1/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 5s/step - accuracy: 0.8004 - loss: 0.5985
Epoch 2/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 5s/step - accuracy: 0.9580 - loss: 0.1049
Epoch 3/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 5s/step - accuracy: 0.9725 - loss: 0.0777
Epoch 4/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 5s/step - accuracy: 0.9760 - loss: 0.0451
Epoch 5/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 5s/step - accuracy: 0.9941 - loss: 0.0225
Epoch 6/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 5s/step - accuracy: 0.9964 - loss: 0.0145
Epoch 7/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 5s/step - accuracy: 1.0000 - loss: 0.0050
Epoch 8/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 5s/step - accuracy: 1.0000 - loss: 0.0017
Epoch 9/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━

In [37]:
lstm_prediction = lstm_model.predict(X_test)
lstm_prediction_vec = np.argmax(lstm_prediction, axis=1)

print("lstm_prediction", lstm_prediction_vec)

accuracy = accuracy_score(y_test, lstm_prediction_vec)
precision = precision_score(y_test, lstm_prediction_vec, average='weighted')
recall = recall_score(y_test, lstm_prediction_vec, average='weighted')
f1 = f1_score(y_test, lstm_prediction_vec, average='weighted')
confisiun_matrix = confusion_matrix(y_test, lstm_prediction_vec)

classification_rep = classification_report(y_test, lstm_prediction_vec,)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"Classification Report:\n {classification_rep}")
print(f"Confisiun Matrix:\n {confisiun_matrix}")

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step
lstm_prediction [0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1-Score: 0.98
F1-Score: 0.98
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       137
           1       0.90      1.00      0.95        27

    accuracy                           0.98       164
   macro avg       0.95      0.99      0.97       164
weighted avg       0.98      0.98      0.98       164

Confisiun Matrix:
 [[134   3]
 [  0  27]]
