In [None]:
import csv # csv reader
import warnings
warnings.filterwarnings("ignore")
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np
import nltk
from collections import Counter
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
wnl = WordNetLemmatizer()

# Fetch every NLTK resource this notebook reads so that it also runs on a
# machine where the data has not been installed yet. When a package is already
# present, nltk.download only reports that it is up to date.
nltk.download('wordnet')
nltk.download('punkt')
# The stopword list below is read from the 'stopwords' corpus; without this
# download the lookup raises LookupError on a fresh environment.
nltk.download('stopwords')
stopword_set = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\iq\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def load_data(path):
    """Load a tab-separated dataset and append its rows to ``raw_data``.

    Every non-empty, non-header row is passed to ``parse_data_line`` and the
    result is stored in the module-level ``raw_data`` list as a
    ``(text, label)`` tuple.

    Args:
        path: Location of the UTF-8 encoded, tab-separated file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' leaves line-ending handling to the csv module, as the csv
    # documentation recommends for files passed to csv.reader.
    with open(path, encoding="utf8", newline='') as f:
        for line in csv.reader(f, delimiter='\t'):
            # csv.reader yields an empty list for a blank line; indexing it
            # would raise IndexError, so such rows are skipped.
            if not line:
                continue
            if line[0] == "Id":  # skip header
                continue
            (label, text) = parse_data_line(line)
            raw_data.append((text, label))

def split_and_preprocess_data(percentage):
    """Preprocess ``raw_data`` and distribute it over ``train_data`` and ``test_data``.

    The leading ``percentage`` share of ``raw_data`` (truncated to a whole
    number of samples) is appended to ``train_data``; everything after it is
    appended to ``test_data``. Each stored item is a
    ``(feature_vector, label)`` pair built with ``pre_process`` and
    ``to_feature_vector``.
    """
    cutoff = int(percentage * len(raw_data))

    # Samples before the cutoff form the training portion ...
    train_data.extend(
        (to_feature_vector(pre_process(text)), label)
        for (text, label) in raw_data[:cutoff]
    )

    # ... and the remainder forms the held-out test portion.
    test_data.extend(
        (to_feature_vector(pre_process(text)), label)
        for (text, label) in raw_data[cutoff:]
    )

In [None]:
def convert_label(label):
    """Collapse the six-way truthfulness label into a binary one.

    'true', 'mostly-true' and 'half-true' become 'REAL'; 'false',
    'barely-true' and 'pants-fire' become 'FAKE'. Any other label raises
    ``KeyError``.
    """
    real_labels = ('true', 'mostly-true', 'half-true')
    fake_labels = ('false', 'barely-true', 'pants-fire')

    # Build the lookup table from the two groups.
    binary_label = {name: 'REAL' for name in real_labels}
    binary_label.update({name: 'FAKE' for name in fake_labels})

    return binary_label[label]


def parse_data_line(line):
    """Extract ``(label, statement)`` from one parsed row of the dataset.

    Column 1 holds the original label, which is reduced to FAKE/REAL through
    ``convert_label``; column 2 holds the statement text and is returned
    unchanged.
    """
    raw_label, statement = line[1], line[2]
    return (convert_label(raw_label), statement)

In [None]:
def pre_process(text):
    """Tokenise ``text`` by splitting it on runs of whitespace.

    No case folding, punctuation stripping or other normalisation happens
    here. Returns the non-empty tokens as a list, in their original order.
    """
    return [token for token in text.split() if token]

In [None]:
from collections import Counter
global_feature_dict = {} # A global dictionary of features
from collections import Counter
from nltk import ngrams
import math

def to_feature_vector(tokens):
    """Turn a list of tokens into a bag-of-words feature dictionary.

    Tokens are lower-cased and counted. Every feature produced is also
    accumulated in the module-level ``global_feature_dict``, so that
    ``len(global_feature_dict)`` reflects the number of distinct features
    extracted so far instead of staying at zero.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        dict mapping each lower-cased token to its number of occurrences
        in ``tokens``.
    """
    feature_vector = dict(Counter(token.lower() for token in tokens))

    # Record the features in the shared registry (running occurrence totals).
    for feature, count in feature_vector.items():
        global_feature_dict[feature] = global_feature_dict.get(feature, 0) + count

    return feature_vector

In [None]:
# TRAINING AND VALIDATING OUR CLASSIFIER

def train_classifier(data, random_state=0):
    """Train a linear SVM on featurised, labelled samples.

    Args:
        data: Sequence of ``(feature_dict, label)`` pairs.
        random_state: Seed forwarded to ``LinearSVC`` so that repeated runs
            train the same model. Pass ``None`` to leave the estimator
            unseeded.

    Returns:
        The value returned by ``SklearnClassifier(pipeline).train(data)``,
        i.e. the classifier used for prediction elsewhere in this notebook.
    """
    print("Training Classifier...")
    pipeline = Pipeline([
        ('svc', LinearSVC(max_iter=1000, C=1.3, random_state=random_state)),
    ])
    return SklearnClassifier(pipeline).train(data)

# Question 3: Cross-validation (20 marks)

In [None]:
#solution
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

def cross_validate(dataset, folds):
    """Run k-fold cross-validation and return the averaged scores.

    The dataset is cut into exactly ``folds`` contiguous, non-empty folds
    whose sizes differ by at most one. Each fold is used once as the test
    set while a classifier is trained on the remaining items.

    Args:
        dataset: List of ``(feature_dict, label)`` pairs.
        folds: Number of folds; must be between 2 and ``len(dataset)``.

    Returns:
        ``[mean precision, mean recall, mean F1, mean accuracy]`` over the
        folds, where precision, recall and F1 are the scores reported for
        the 'FAKE' class.

    Raises:
        ValueError: If ``folds`` is smaller than 2 or larger than the
            number of samples.
    """
    folds = int(folds)
    num_samples = len(dataset)
    if not 2 <= folds <= num_samples:
        raise ValueError(
            "folds must be between 2 and the number of samples (%d), got %d"
            % (num_samples, folds)
        )

    # Boundary k sits at k*n//folds. This always yields the requested number
    # of folds and never points past the end of the data, unlike a fixed
    # step of int(n/folds)+1.
    bounds = [(k * num_samples) // folds for k in range(folds + 1)]

    precisions = []
    recalls = []
    f1_scores = []
    accuracies = []

    for start, stop in zip(bounds, bounds[1:]):
        print("Fold start on items %d - %d" % (start, stop))
        test_set = dataset[start:stop]
        train_set = dataset[:start] + dataset[stop:]

        classifier = train_classifier(train_set)

        true_labels = [item[1] for item in test_set]  # ground truth
        predicted = predict_labels([item[0] for item in test_set], classifier)

        report = classification_report(true_labels, predicted, output_dict=True)

        accuracies.append(accuracy_score(true_labels, predicted))
        precisions.append(report['FAKE']['precision'])
        recalls.append(report['FAKE']['recall'])
        f1_scores.append(report['FAKE']['f1-score'])

    return [np.mean(precisions), np.mean(recalls), np.mean(f1_scores), np.mean(accuracies)]

In [None]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predict_labels(samples, classifier):
    """Return the labels ``classifier`` predicts for already-featurised ``samples``."""
    predictions = classifier.classify_many(samples)
    return predictions

def predict_label_from_raw(sample, classifier):
    """Predict the label of one raw text sample.

    The text goes through the same ``pre_process`` and ``to_feature_vector``
    steps that are applied to the training data before it is classified.

    Args:
        sample: Raw statement text.
        classifier: Trained classifier exposing ``classify``.

    Returns:
        The label predicted by ``classifier`` for ``sample``.
    """
    tokens = pre_process(sample)
    return classifier.classify(to_feature_vector(tokens))

In [None]:
# MAIN
# Driver cell: loads the dataset, splits it 80/20 and reports the sizes of the
# three module-level lists at each stage.

# Module-level lists that load_data and split_and_preprocess_data append to.
# They are reset here, so re-running this cell starts from empty lists.
raw_data = []          # (text, label) tuples parsed from the dataset file
train_data = []        # (feature_vector, label) pairs for the training share of raw_data
test_data = []         # (feature_vector, label) pairs for the remaining share of raw_data


# Path to the tab-separated dataset, relative to the working directory
data_file_path = 'fake_news.tsv'

# Report the (still empty) list sizes, then parse the dataset into raw_data
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path) 

# Split the raw dataset into training data and test data (80/20).
# Cross-validation is meant to be run on the 80% training share only.
# The sizes printed here are the ones before the split.
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')
# Progress marker printed before the split starts
print('I am before split_and_preprocess_data')
split_and_preprocess_data(0.8)

# Progress marker printed after the split finishes
print('I am after split_and_preprocess_data')
# Sizes after the split. The feature count is read from global_feature_dict,
# so it only reflects what has been recorded in that dictionary.
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 10241 rawData, 0 trainData, 0 testData
Preparing training and test data...
I am before split_and_preprocess_data
I am after split_and_preprocess_data
After split, 10241 rawData, 8192 trainData, 2049 testData
Training Samples: 
8192
Features: 
0


In [None]:
# Cross-validate on the training split only, requesting 10 folds. The displayed
# result is [mean precision, mean recall, mean F1, mean accuracy], where
# precision, recall and F1 are the per-fold scores of the 'FAKE' class.
cross_validate(train_data, 10)

Fold start on items 0 - 820
Training Classifier...
Fold start on items 820 - 1640
Training Classifier...
Fold start on items 1640 - 2460
Training Classifier...
Fold start on items 2460 - 3280
Training Classifier...
Fold start on items 3280 - 4100
Training Classifier...
Fold start on items 4100 - 4920
Training Classifier...
Fold start on items 4920 - 5740
Training Classifier...
Fold start on items 5740 - 6560
Training Classifier...
Fold start on items 6560 - 7380
Training Classifier...
Fold start on items 7380 - 8200
Training Classifier...


[0.5039986113960203,
 0.5015893961483191,
 0.5023835099418511,
 0.5683779887059954]

### Questions 5 (20%) and 6 (20%) (recommend starting a new notebook)

In [None]:
# Finally, check the accuracy of the classifier by training on all the training data
# and testing on the held-out test set.
# Will only work once all functions are complete.
# Note: accuracy_score is imported in the cross-validation cell, so that cell
# must have been run before this one.
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(test_data[0])   # have a look at the first test data instance
    classifier = train_classifier(train_data)  # train on the full training split
    test_true = [t[1] for t in test_data]   # ground-truth labels (second element of each pair)
    test_pred = predict_labels([x[0] for x in test_data], classifier)  # predicted labels for the test feature vectors
    final_scores = precision_recall_fscore_support(test_true, test_pred, average='weighted') # weighted-average scores
    print("Done training!")
    # Only the first three entries of final_scores are printed (labelled precision, recall, F score)
    print("Precision: %f\nRecall: %f\nF Score:%f" % final_scores[:3])
    print(f'Accuracy Acheived : {round(accuracy_score(test_true, test_pred),4)}')

({'the': 2, 'bush': 1, 'tax': 1, 'cuts': 1, 'helped': 1, 'to': 1, 'create': 1, 'a': 1, 'substantial': 1, 'part': 1, 'of': 1, 'deficit.': 1}, 'REAL')
Training Classifier...
Done training!
Precision: 0.569793
Recall: 0.569546
F Score:0.569665
Accuracy Acheived : 0.5695
