# Data Preprocessing

In [11]:
# Importing Libraries
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from sklearn import tree
# from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df_train = pd.read_csv("/content/F3_FineGrained_Fake_News_Detection_train.csv")
df_test = pd.read_csv("/content/F3_FineGrained_Fake_News_Detection_test.csv")
x_train = df_train["statement"].values
x_test = df_test["statement"].values
label_map={'mostly-true':4,'barely-true':2,'half-true':3,'false':1, 'true':5,'pants-fire':0}
y = df_train["label"].values
y_train = []
for i in range(len(y)):
    y_train.append(label_map[y[i]])
y_train = np.array(y_train)
# le = LabelEncoder()
# y_train = le.fit_transform(df_train["label"])
print(x_train.shape, y_train.shape)

(7168,) (7168,)


In [3]:
def lowercasing(statement):
  s2 = []
  for k in statement:
    s2.append(k.lower())
  
  return s2

def stemming(statement):
  s2 = []
  ps = nltk.stem.PorterStemmer()
  for k in statement:
    tokens = nltk.word_tokenize(k)
    t2 = []
    for word in tokens:
      word = ps.stem(word)
      t2.append(word)
    t2 = " ".join(t2)
    s2.append(t2)
  return s2

x_train = lowercasing(x_train)
x_train = stemming(x_train)

x_test = lowercasing(x_test)
x_test = stemming(x_test)

# Vectorization

In [4]:
# TF-IDF
vectorizer = TfidfVectorizer(max_features = 500)
x_train = vectorizer.fit_transform(x_train).toarray()
x_test = vectorizer.fit_transform(x_test).toarray()

In [5]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)
print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)

(6451, 500) (6451,)
(717, 500) (717,)


# Model

In [6]:
# KNN Model
classifier = KNeighborsClassifier(n_neighbors=50, weights='distance')
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        63
           1       0.24      0.39      0.30       135
           2       0.27      0.15      0.19       103
           3       0.23      0.27      0.25       160
           4       0.28      0.32      0.30       136
           5       0.25      0.23      0.24       120

    accuracy                           0.25       717
   macro avg       0.21      0.22      0.21       717
weighted avg       0.23      0.25      0.24       717



In [9]:
# dt model
classifier = tree.DecisionTreeClassifier()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.12      0.11      0.12        63
           1       0.22      0.24      0.23       135
           2       0.21      0.22      0.22       103
           3       0.23      0.26      0.25       160
           4       0.25      0.23      0.24       136
           5       0.19      0.17      0.18       120

    accuracy                           0.22       717
   macro avg       0.21      0.20      0.20       717
weighted avg       0.21      0.22      0.21       717



In [10]:
# Random Forest
classifier = RandomForestClassifier(n_estimators=200, criterion='entropy')
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.08      0.14        63
           1       0.23      0.36      0.28       135
           2       0.18      0.13      0.15       103
           3       0.20      0.28      0.23       160
           4       0.23      0.21      0.22       136
           5       0.24      0.16      0.19       120

    accuracy                           0.22       717
   macro avg       0.26      0.20      0.20       717
weighted avg       0.24      0.22      0.21       717



In [14]:
# SVM
classifier = SVC(kernel='rbf')
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.02      0.03        63
           1       0.27      0.42      0.33       135
           2       0.28      0.17      0.21       103
           3       0.25      0.34      0.29       160
           4       0.17      0.18      0.18       136
           5       0.30      0.19      0.23       120

    accuracy                           0.25       717
   macro avg       0.30      0.22      0.21       717
weighted avg       0.27      0.25      0.23       717



In [13]:
# Linear Regression
classifier = GaussianNB()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.11      0.71      0.18        63
           1       0.21      0.05      0.08       135
           2       0.25      0.13      0.17       103
           3       0.34      0.15      0.21       160
           4       0.25      0.14      0.18       136
           5       0.21      0.11      0.14       120

    accuracy                           0.17       717
   macro avg       0.23      0.22      0.16       717
weighted avg       0.24      0.17      0.16       717



In [None]:
y_test_pred = classifier.predict(x_test)
y_test_pred

array([1, 3, 4, ..., 1, 3, 3])

In [None]:
df_test["label"] = y_test_pred
final_res = df_test[['label', 'id']]
final_res.head()

Unnamed: 0,label,id
0,1,0
1,3,1
2,4,2
3,3,3
4,2,4


In [None]:
final_res.to_csv("F3.csv", index=False)