# Data Preprocessing

In [54]:
# Importing Libraries
!pip install sentence_transformers
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
import tensorflow as tf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
# Importing Data
# Read both CSV files, then take the raw "statement" column of each as a
# NumPy array.
df_train = pd.read_csv("/content/F3_FineGrained_Fake_News_Detection_train.csv")
df_test = pd.read_csv("/content/F3_FineGrained_Fake_News_Detection_test.csv")
x_train = df_train["statement"].values
x_test = df_test["statement"].values

# Maps each of the six label strings to an integer class id in 0..5.
label_map = {
    'mostly-true': 4,
    'barely-true': 2,
    'half-true': 3,
    'false': 1,
    'true': 5,
    'pants-fire': 0,
}
y = df_train["label"].values
# A label that is not a key of label_map raises KeyError here.
y_train = np.array([label_map[name] for name in y])

In [56]:
# Data Cleaning
def lowercasing(statement):
  """Return a new list holding the lower-cased form of each string in `statement`."""
  return [text.lower() for text in statement]

def stemming(statement):
  """Porter-stem every token of every string in `statement`.

  Each string is split with `nltk.word_tokenize`, each token is passed through
  a single shared `PorterStemmer`, and the stems are re-joined with one space.
  Returns a new list of strings in the same order as the input.
  """
  stemmer = nltk.stem.PorterStemmer()
  return [
      " ".join(stemmer.stem(token) for token in nltk.word_tokenize(text))
      for text in statement
  ]

# Build the cleaned statement lists directly from the data frames rather than
# from the previous value of x_train / x_test. Stemming text that has already
# been stemmed can change it again, so feeding the cell its own output made a
# re-run produce different results; reading from df_train / df_test each time
# keeps the cell idempotent.
# NOTE: the vectorization cell below encodes the raw df_train / df_test columns
# and rebinds x_train / x_test, so these cleaned lists are not what the model
# is trained on.
x_train = stemming(lowercasing(df_train["statement"].values))
x_test = stemming(lowercasing(df_test["statement"].values))

# Vectorization

In [57]:
#Vectorization
# NOTE: embeddings are computed from the raw df_train / df_test columns, not
# from the lowercased/stemmed lists produced by the cleaning cell, and
# x_train / x_test are rebound here to the concatenated embedding matrices.
CONEXTUAL_MODEL_TYPE = SentenceTransformer('all-mpnet-base-v2')

# Text columns that are embedded separately and concatenated feature-wise.
FEATURE_COLUMNS = ("statement", "subject", "speaker", "party affiliation")

def embed_columns(frame, columns=FEATURE_COLUMNS):
  """Encode each of `columns` in `frame` with the sentence-transformer model.

  Each column is handed to `encode` as a plain Python list so the encoder
  never indexes into a pandas Series. Returns a list with one embedding array
  per column, in the order given by `columns`.

  NOTE(review): cell values are passed through unchanged; confirm how missing
  values in these columns should be represented before encoding.
  """
  return [CONEXTUAL_MODEL_TYPE.encode(frame[name].tolist()) for name in columns]

x_train1, x_train2, x_train3, x_train4 = embed_columns(df_train)
x_train = np.concatenate((x_train1, x_train2, x_train3, x_train4),axis=1)

x_test1, x_test2, x_test3, x_test4 = embed_columns(df_test)
x_test = np.concatenate((x_test1, x_test2, x_test3, x_test4),axis=1)

In [51]:
#Train Test Split
# This cell rebinds x_train / y_train to the 90% training portion. Running it
# a second time would therefore split the already-reduced arrays again and
# silently shrink the training set. Refuse to run unless both inputs still
# cover every row of df_train.
assert len(x_train) == len(df_train) and len(y_train) == len(df_train), (
    "x_train / y_train no longer match df_train (already split?); re-run the "
    "data import and vectorization cells before running this cell again."
)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)
print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)
print(x_test.shape)

(5805, 3072) (5805,)
(646, 3072) (646,)
(3072, 3072)


# Model

In [58]:
# Feed-forward classifier: two ReLU hidden layers (512 and 64 units), each
# followed by 20% dropout, then a 6-way softmax output layer.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(512, activation=tf.nn.relu))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(6, activation=tf.nn.softmax))

In [59]:
# Targets are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
# Evaluate on the held-out split after every epoch so that overfitting shows up
# during training instead of only in the later evaluation cell.
model.fit(x_train, y_train, epochs=20, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fd230067a00>

# Evaluation

In [43]:
# Predicted class = index of the largest softmax output for each validation row.
val_probabilities = model.predict(x_val)
y_pred = np.argmax(val_probabilities, axis=1)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.42      0.25      0.32        63
           1       0.34      0.32      0.33       135
           2       0.21      0.28      0.24       103
           3       0.30      0.33      0.31       160
           4       0.18      0.14      0.16       136
           5       0.28      0.31      0.29       120

    accuracy                           0.27       717
   macro avg       0.29      0.27      0.28       717
weighted avg       0.28      0.27      0.27       717



In [60]:
# Predicted class id for every test row (arg-max over the softmax outputs).
test_probabilities = model.predict(x_test)
y_test_pred = np.argmax(test_probabilities, axis=1)
# Store the predictions on df_test, then keep only the two output columns.
df_test["label"] = y_test_pred
final_res = df_test.loc[:, ['label', 'id']]
final_res.head()



Unnamed: 0,label,id
0,3,0
1,3,1
2,2,2
3,1,3
4,4,4


In [61]:
# Write the label/id frame to F3.csv without the pandas index column.
final_res.to_csv(path_or_buf="F3.csv", index=False)