# Data Preprocessing

In [None]:
# Importing Libraries
!pip install sentence_transformers
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Importing Data
# Integer code assigned to each label string ('pants-fire' -> 0 ... 'true' -> 5).
label_map = {
    'mostly-true': 4,
    'barely-true': 2,
    'half-true': 3,
    'false': 1,
    'true': 5,
    'pants-fire': 0,
}

df_train = pd.read_csv("/content/F3_FineGrained_Fake_News_Detection_train.csv")
df_test = pd.read_csv("/content/F3_FineGrained_Fake_News_Detection_test.csv")

# Statement text of each split as arrays.
x_train = df_train["statement"].values
x_test = df_test["statement"].values

# Encode the training labels; a label missing from label_map raises KeyError.
y = df_train["label"].values
y_train = np.array([label_map[label] for label in y])

In [None]:
# Data Cleaning
def lowercasing(statement):
  """Return a new list with the lower-cased form of every string in `statement`.

  `statement` is any iterable of strings; the input is not modified and the
  output keeps the input order.
  """
  return [text.lower() for text in statement]

def stemming(statement):
  """Porter-stem every token of every string in `statement`.

  Each string is split with `nltk.word_tokenize`, every token is passed through
  a `PorterStemmer`, and the stems are joined back with single spaces.

  Returns a new list containing one stemmed string per input string, in the
  same order as the input.
  """
  stemmer = nltk.stem.PorterStemmer()
  return [
    " ".join(stemmer.stem(token) for token in nltk.word_tokenize(text))
    for text in statement
  ]

# Lowercase first, then stem, for both splits.
# NOTE(review): in the cells visible in this notebook, the Vectorization step
# encodes the raw df_train/df_test columns and then reassigns x_train/x_test,
# so these cleaned lists never reach the models -- confirm whether that is intended.
x_train = stemming(lowercasing(x_train))
x_test = stemming(lowercasing(x_test))

# Vectorization

In [None]:
#Vectorization
# Columns embedded for every example, in the order their embeddings are concatenated.
EMBEDDED_COLUMNS = ["statement", "subject", "speaker", "party affiliation"]

CONEXTUAL_MODEL_TYPE = SentenceTransformer('all-mpnet-base-v2')


def encode_columns(frame, columns=EMBEDDED_COLUMNS, model=None):
  """Embed each of `columns` in `frame` separately.

  Args:
    frame: DataFrame holding the columns to embed.
    columns: column names to encode, in output order.
    model: object exposing `encode(list_of_texts)`; defaults to the
      notebook-level `CONEXTUAL_MODEL_TYPE`.

  Returns:
    A list with one embedding array per entry of `columns`, in the same order.
  """
  if model is None:
    model = CONEXTUAL_MODEL_TYPE
  # Pass plain Python lists (the documented input type of encode()) instead of
  # pandas Series, so the call does not depend on how a Series is indexed.
  return [model.encode(frame[column].tolist()) for column in columns]


# NOTE(review): the embeddings are built from the raw DataFrame columns; the
# x_train/x_test values that existed before this cell are replaced below.
x_train1, x_train2, x_train3, x_train4 = encode_columns(df_train)
x_train = np.concatenate((x_train1, x_train2, x_train3, x_train4), axis=1)

x_test1, x_test2, x_test3, x_test4 = encode_columns(df_test)
x_test = np.concatenate((x_test1, x_test2, x_test3, x_test4), axis=1)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
#Train Test Split
# Hold out 10% of the rows as a validation set; the fixed seed keeps the split repeatable.
# NOTE: x_train / y_train are rebound to the remaining 90%, so re-running this
# cell on its own splits the already-reduced data again.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.1,
    random_state=42,
)
# Shapes of the training split, then of the validation split.
print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)

(6451, 3072) (6451,)
(717, 3072) (717,)


# Models

In [None]:
# KNN
# k-nearest-neighbours with k=50 and distance-based vote weighting,
# fitted on the training split and scored on the validation split.
classifier = KNeighborsClassifier(weights='distance', n_neighbors=50).fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.48      0.22      0.30        63
           1       0.26      0.33      0.29       135
           2       0.21      0.13      0.16       103
           3       0.26      0.36      0.30       160
           4       0.26      0.33      0.29       136
           5       0.26      0.15      0.19       120

    accuracy                           0.27       717
   macro avg       0.29      0.25      0.26       717
weighted avg       0.27      0.27      0.26       717



In [None]:
# Decision Tree
# Single decision tree with default hyperparameters.
# random_state is fixed because the tree builder permutes features at each
# split; without a seed, ties between equally good splits can resolve
# differently from run to run and the report below would not be reproducible.
classifier = tree.DecisionTreeClassifier(random_state=42)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.27      0.29      0.28        63
           1       0.27      0.24      0.26       135
           2       0.21      0.24      0.23       103
           3       0.16      0.16      0.16       160
           4       0.24      0.24      0.24       136
           5       0.17      0.17      0.17       120

    accuracy                           0.22       717
   macro avg       0.22      0.22      0.22       717
weighted avg       0.22      0.22      0.22       717



In [None]:
# Random Forest
# 200 entropy-criterion trees.
# random_state is fixed so that the bootstrap samples and per-split feature
# subsets -- and therefore the report below -- are the same on every run.
classifier = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.48      0.16      0.24        63
           1       0.25      0.38      0.30       135
           2       0.22      0.13      0.16       103
           3       0.27      0.30      0.28       160
           4       0.27      0.37      0.31       136
           5       0.34      0.19      0.24       120

    accuracy                           0.27       717
   macro avg       0.30      0.25      0.26       717
weighted avg       0.29      0.27      0.26       717



In [None]:
# SVM
# Support-vector classifier with an RBF kernel (all other settings left at
# their defaults), fitted on the training split and scored on validation.
classifier = SVC(kernel='rbf').fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.21      0.30        63
           1       0.28      0.46      0.35       135
           2       0.29      0.19      0.23       103
           3       0.33      0.34      0.33       160
           4       0.31      0.40      0.35       136
           5       0.31      0.17      0.22       120

    accuracy                           0.31       717
   macro avg       0.35      0.29      0.30       717
weighted avg       0.33      0.31      0.30       717



In [None]:
# Gaussian Naive Bayes
# (This cell was previously labelled "Linear Regression"; the model fitted
# here is sklearn's GaussianNB classifier.)
classifier = GaussianNB()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.16      0.29      0.21        63
           1       0.24      0.27      0.26       135
           2       0.24      0.28      0.26       103
           3       0.27      0.05      0.08       160
           4       0.27      0.43      0.33       136
           5       0.25      0.20      0.22       120

    accuracy                           0.24       717
   macro avg       0.24      0.25      0.23       717
weighted avg       0.25      0.24      0.22       717

