# Data Preprocessing

In [1]:
#Importing Library
!pip install tensorflow
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Embedding, Bidirectional
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Importing Data
df_train = pd.read_csv("/content/F3_FineGrained_Fake_News_Detection_train.csv")
x_train = df_train["statement"].values
df_test = pd.read_csv("/content/F3_FineGrained_Fake_News_Detection_test.csv")
x_test = df_test["statement"].values
label_map={'mostly-true':4,'barely-true':2,'half-true':3,'false':1, 'true':5,'pants-fire':0}
temp = []
y_train = df_train['label'].values
for i in range(len(y_train)):
    if y_train[i] == "false":
        temp.append([0,1,0,0,0,0])
    elif y_train[i] == "mostly-true":
        temp.append([0,0,0,0,1,0])
    elif y_train[i] == "barely-true":
        temp.append([0,0,1,0,0,0])
    elif y_train[i] == "true":
        temp.append([0,0,0,0,0,1])
    elif y_train[i] == "half-true":
        temp.append([0,0,0,1,0,0])
    elif y_train[i] == "pants-fire":
        temp.append([1,0,0,0,0,0])
y_train = np.array(temp)

In [3]:
# Data Cleaning
def lowercasing(statement):
  s2 = []
  for k in statement:
    s2.append(k.lower())
  
  return s2

def stemming(statement):
  s2 = []
  ps = nltk.stem.PorterStemmer()
  for k in statement:
    tokens = nltk.word_tokenize(k)
    t2 = []
    for word in tokens:
      word = ps.stem(word)
      t2.append(word)
    t2 = " ".join(t2)
    s2.append(t2)
  return s2

x_train = lowercasing(x_train)
x_train = stemming(x_train)
x_test = lowercasing(x_test)
x_test = stemming(x_test)

# Vectorization

In [4]:
# Hyperparameters
VOCAB_SIZE = 1000
SENT_LENGTH = 20
VECTOR_SIZE = 40

In [5]:
#Vectorizing Training Data
one_hot_repr = [one_hot(words, VOCAB_SIZE) for words in df_train["statement"]]
embedded_docs_train1 = pad_sequences(one_hot_repr, padding='pre', maxlen=SENT_LENGTH)
one_hot_repr = [one_hot(str(words), VOCAB_SIZE) for words in df_train["subject"]]
embedded_docs_train2 = pad_sequences(one_hot_repr, padding='pre', maxlen=SENT_LENGTH)
one_hot_repr = [one_hot(str(words), VOCAB_SIZE) for words in df_train["speaker"]]
embedded_docs_train3 = pad_sequences(one_hot_repr, padding='pre', maxlen=SENT_LENGTH)
one_hot_repr = [one_hot(str(words), VOCAB_SIZE) for words in df_train["party affiliation"]]
embedded_docs_train4 = pad_sequences(one_hot_repr, padding='pre', maxlen=SENT_LENGTH)
embedded_docs = np.concatenate([embedded_docs_train1, embedded_docs_train2, embedded_docs_train3, embedded_docs_train4], axis=1)

#Train Test Split
embedded_docs_train, embedded_docs_val, y_train, y_val = train_test_split(embedded_docs, y_train, test_size=0.1, random_state=42)

#Vectorizing Test Data
one_hot_repr = [one_hot(words, VOCAB_SIZE) for words in df_test["statement"]]
embedded_docs_test1 = pad_sequences(one_hot_repr, padding='pre', maxlen=SENT_LENGTH)
one_hot_repr = [one_hot(str(words), VOCAB_SIZE) for words in df_test["subject"]]
embedded_docs_test2 = pad_sequences(one_hot_repr, padding='pre', maxlen=SENT_LENGTH)
one_hot_repr = [one_hot(str(words), VOCAB_SIZE) for words in df_test["speaker"]]
embedded_docs_test3 = pad_sequences(one_hot_repr, padding='pre', maxlen=SENT_LENGTH)
one_hot_repr = [one_hot(str(words), VOCAB_SIZE) for words in df_test["party affiliation"]]
embedded_docs_test4 = pad_sequences(one_hot_repr, padding='pre', maxlen=SENT_LENGTH)
embedded_docs_test = np.concatenate([embedded_docs_test1, embedded_docs_test2, embedded_docs_test3, embedded_docs_test4], axis=1)

print(embedded_docs_train.shape, y_train.shape)
print(embedded_docs_val.shape, y_val.shape)
print(embedded_docs_test.shape,)

(6451, 80) (6451, 6)
(717, 80) (717, 6)
(3072, 80)


# Model

In [6]:
model = Sequential()
model.add(Embedding(VOCAB_SIZE, VECTOR_SIZE, input_length=SENT_LENGTH*4))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.2))
model.add(Dense(6, activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 40)            40000     
                                                                 
 dropout (Dropout)           (None, 80, 40)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 200)              112800    
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense (Dense)               (None, 6)                 1206      
                                                                 
Total params: 154,006
Trainable params: 154,006
Non-trainable params: 0
__________________________________________________

In [7]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(embedded_docs_train, y_train, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7feb69539070>

# Evaluation

In [8]:
y_pred = tf.argmax(model.predict(embedded_docs_val), axis=1).numpy()
y_val = np.argmax(y_val, axis=1)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.18      0.13      0.15        63
           1       0.26      0.39      0.31       135
           2       0.18      0.14      0.15       103
           3       0.23      0.16      0.19       160
           4       0.25      0.34      0.29       136
           5       0.32      0.26      0.29       120

    accuracy                           0.25       717
   macro avg       0.24      0.23      0.23       717
weighted avg       0.24      0.25      0.24       717



In [9]:
y_test_pred = tf.argmax(model.predict(embedded_docs_test), axis=1).numpy()
df_test["label"] = y_test_pred
final_res = df_test[['label', 'id']]
final_res.head()



Unnamed: 0,label,id
0,1,0
1,3,1
2,4,2
3,4,3
4,5,4


In [10]:
final_res.to_csv("F3.csv", index=False)