In [1]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

import nltk
from nltk import bigrams
import nltk
from nltk import trigrams

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            (label, text) = parse_data_line(line)
            raw_data.append((text, label))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label) in raw_data[:num_training_samples]:
        train_data.append((to_feature_vector(pre_process(text)),label))
    for (text, label) in raw_data[num_training_samples:]:
        test_data.append((to_feature_vector(pre_process(text)),label))

# Question 1: Input and Basic preprocessing (10 marks)

In [4]:
def convert_label(label):
    """Converts the multiple classes into two,
    making it a binary distinction between fake news and real."""
    #return label
    # Converting the multiclass labels to binary label
    labels_map = {
        'true': 'REAL',
        'mostly-true': 'REAL',
        'half-true': 'REAL',
        'false': 'FAKE',
        'barely-true': 'FAKE',
        'pants-fire': 'FAKE'
    }
    return labels_map[label]


def parse_data_line(data_line):
    # Should return a tuple of the label as just FAKE or REAL and the statement
    # e.g. (label, statement)
    return (convert_label(data_line[1]), data_line[2])

In [5]:
from nltk.tokenize import word_tokenize
from string import punctuation
import nltk
import re
nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer


# REMOVE STOP WORD
#def pre_process(text):
#  vectorizer = CountVectorizer(stop_words = 'english')
#  analyze = vectorizer.build_analyzer()
#  tokens=analyze(text)
#  return tokens


# # # To lemmatize the data
#import nltk
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#from nltk.stem import WordNetLemmatizer
# # # LEMMATIZING
#wnl = WordNetLemmatizer()
#def pre_process(text):
#  vectorizer = CountVectorizer(stop_words = 'english')
#  analyze = vectorizer.build_analyzer()
#  tokens=analyze(text)
#  tokens = [wnl.lemmatize(t) for t in tokens]
# # #Should return a list of tokens
#  return tokens


#BIGRAMS
import nltk
from nltk import bigrams
def pre_process(text):
  no_symbols = re.sub(r'[^\w]', ' ', text.lower())
  tokens = no_symbols.split()
  bitokens = list(bigrams(tokens))
  return bitokens

#TRIGRAMS
#import nltk
#from nltk import trigrams
#def pre_process(text):
#    #normalisation and tokenising 
#    no_symbols = re.sub(r'[^\w]', ' ', text.lower())
#    tokens = no_symbols.split()
#    tritokens = list(trigrams(tokens))
#    return tritokens




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
#[word.lower() for word in word_tokenize(text) if word not in punctuation]

# Question 2: Basic Feature Extraction (20 marks)

In [7]:
global_feature_dict = {} # A global dictionary of features

def to_feature_vector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    # DESCRIBE YOUR METHOD IN WORDS
    result = {}
    for token in tokens:
        if token not in result:
            result[token] = 0
        result[token] += 1
        if token not in global_feature_dict:
            global_feature_dict[token] = 0
        global_feature_dict[token] += 1
    return result

In [8]:
# TRAINING AND VALIDATING OUR CLASSIFIER

def train_classifier(data):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(data)

# Question 3: Cross-validation (20 marks)

In [9]:
#solution
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score



def cross_validate(dataset, folds):
    results=[]
    cv_results = []
    accuracy = []
    fold_size = int(len(dataset)/folds) + 1
    for i in range(0,len(dataset),int(fold_size)):
        # insert code here that trains and tests on the 10 folds of data in the dataset
        print("Fold start on items %d - %d" % (i, i+fold_size))
        # FILL IN THE METHOD HERE
        train_set = dataset[:i] + dataset[i+fold_size:]
        valid_set = dataset[i:i+fold_size]
        samples, labels = map(list, zip(*valid_set))
        classifier = train_classifier(train_set)
        #y_predict
        predictions = predict_labels(samples, classifier)
        results += predictions
        #y_true = label 
        y_true = [x[1] for x in valid_set]
        cv_results.append(precision_recall_fscore_support(y_true, predictions, average='weighted'))
        accuracy.append(accuracy_score(y_true, predictions))

    train_set_label = [sample[1] for sample in dataset]    
    print(classification_report(train_set_label, results))    
# Average calculation of values oer 10 fold runs
    cv_results = np.array(cv_results)
    cv_results = [np.mean(cv_results[:,0]), np.mean(cv_results[:,1]), np.mean(cv_results[:,2])]

    accuracy = np.asarray(accuracy)
    accuracy = np.mean(accuracy)

    print('The overall precision is {}'
          '\nrecall score is {}'
          '\nf1 score is {}'
          '\naccuracy is {}'.format(cv_results[0],cv_results[1],cv_results[2],accuracy))

    predicted_label_data = []
    for i in range(len(dataset)):
        list_dataset = list(dataset[i])
        list_dataset = list_dataset + [results[i]]
        predicted_label_data.append(list_dataset)

    return cv_results, predicted_label_data





In [10]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predict_labels(samples, classifier):
    """Assuming preprocessed samples, return their predicted labels from the classifier model."""
    return classifier.classify_many(samples)

def predict_label_from_raw(sample, classifier):
    """Assuming raw text, return its predicted label from the classifier model."""
    return classifier.classify(to_feature_vector(preProcess(reviewSample)))

In [11]:
# MAIN

# loading reviews
# initialize global lists that will be appended to by the methods below
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset


# references to the data files
data_file_path = 'fake_news.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')


split_and_preprocess_data(0.8)

# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 10241 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 10241 rawData, 8192 trainData, 2049 testData
Training Samples: 
8192
Features: 
84109


In [12]:
cv_results, predicted_label_data = cross_validate(train_data, 10)

Fold start on items 0 - 820
Training Classifier...
Fold start on items 820 - 1640
Training Classifier...
Fold start on items 1640 - 2460
Training Classifier...
Fold start on items 2460 - 3280
Training Classifier...
Fold start on items 3280 - 4100
Training Classifier...
Fold start on items 4100 - 4920
Training Classifier...
Fold start on items 4920 - 5740
Training Classifier...
Fold start on items 5740 - 6560
Training Classifier...
Fold start on items 6560 - 7380
Training Classifier...
Fold start on items 7380 - 8200
Training Classifier...
              precision    recall  f1-score   support

        FAKE       0.51      0.48      0.49      3562
        REAL       0.62      0.65      0.63      4630

    accuracy                           0.57      8192
   macro avg       0.56      0.56      0.56      8192
weighted avg       0.57      0.57      0.57      8192

The overall precision is 0.5726008664300004
recall score is 0.5748582241980055
f1 score is 0.5728748238267123
accuracy is 0.5748