## Vectorising Features into TF-IDF Values

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# read and store the csv file dataset as a dataframe
df = pd.read_csv(r"..\dataset\2\fake_and_real_news.csv")
print(df.shape)
df.head()

(9900, 2)


Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [3]:
# imbalanced dataset does affect training,
# however, this is almost 50:50 so i will consider it negligible.
df.label.value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [4]:
# mapping label categories to numbers
# fake: 0, real: 1
# df.label.map({
#     "Fake": 0,
#     "Real": 1,
# })
# takes in a dictionary, and produces an effect on dataframe

# create a new column to the current dataframe
df["df_label"] = df.label.map({
    "Fake": 0,
    "Real": 1,
})

df.head()
print(df["Text"])

0        Top Trump Surrogate BRUTALLY Stabs Him In The...
1       U.S. conservative leader optimistic of common ...
2       Trump proposes U.S. tax overhaul, stirs concer...
3        Court Forces Ohio To Allow Millions Of Illega...
4       Democrats say Trump agrees to work on immigrat...
                              ...                        
9895     Wikileaks Admits To Screwing Up IMMENSELY Wit...
9896    Trump consults Republican senators on Fed chie...
9897    Trump lawyers say judge lacks jurisdiction for...
9898     WATCH: Right-Wing Pastor Falsely Credits Trum...
9899     Sean Spicer HILARIOUSLY Branded As Chickensh*...
Name: Text, Length: 9900, dtype: object


In [5]:
# create a vectorizer object for tf-idf
"""
.fit_transform(data: str) -> tf_idf_value_matrix
# text_data is fit and transformed as features
# each feature has a TF-IDF value associated with it.
# the resulting value matrix is returned.
"""

vectorizer = TfidfVectorizer()
transformed_output = vectorizer.fit_transform(df["Text"])

# print(vectorizer.vocabulary_)
print(transformed_output)

# lower the tf_idf value, higher the occurence of that feature

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2171752 stored elements and shape (9900, 58445)>
  Coords	Values
  (0, 52736)	0.08528328193860288
  (0, 53356)	0.24524663418922665
  (0, 50744)	0.09251973107666457
  (0, 10048)	0.09392101862235107
  (0, 49438)	0.12871702155365564
  (0, 25352)	0.11471603956019787
  (0, 26799)	0.015539354339607682
  (0, 51929)	0.07504302286307662
  (0, 7267)	0.0363100044045118
  (0, 24816)	0.13020485802624113
  (0, 39074)	0.20959773740142504
  (0, 55605)	0.041286127767650706
  (0, 28064)	0.03525468440873214
  (0, 32092)	0.05478450462654908
  (0, 6542)	0.07517839739693681
  (0, 52211)	0.044047628252456714
  (0, 44093)	0.02984434205652043
  (0, 41173)	0.03979535097146953
  (0, 10763)	0.05086183225006348
  (0, 17439)	0.03882026654266726
  (0, 27981)	0.12045052559736223
  (0, 32163)	0.06532150123396688
  (0, 50663)	0.08050833780921683
  (0, 19741)	0.09161237789961373
  (0, 22102)	0.02042598845225193
  :	:
  (9899, 58115)	0.03696372778235382
  (989

## Preprecessing Text

In [6]:
# preprocessing the text
import spacy

# load the model once (takes a few seconds)
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])  # disable unused components

def preprocess_spacy(text):
    doc = nlp(text)
    tokens = [
        token.lemma_.lower() 
        for token in doc 
        if not token.is_stop and not token.is_punct
    ]
    return " ".join(tokens)

In [7]:
# add the preprocessed text as a column to dataframe
df["preprocessed_text"] = df["Text"].apply(preprocess_spacy)

In [8]:
df.head()

Unnamed: 0,Text,label,df_label,preprocessed_text
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,trump surrogate brutally stabs pathetic vide...
1,U.S. conservative leader optimistic of common ...,Real,1,u.s. conservative leader optimistic common gro...
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,trump propose u.s. tax overhaul stir concern d...
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,court forces ohio allow millions illegally p...
4,Democrats say Trump agrees to work on immigrat...,Real,1,democrats trump agree work immigration bill wa...


## Training and Testing Dataset Split

In [29]:
from sklearn.model_selection import train_test_split

transformed_output = vectorizer.fit_transform(df["preprocessed_text"])
X = transformed_output
y = df["df_label"]

# splits dataset into two parts:
# training data (80%) and testing data (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


## Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# decision boundary, separates "Real" and "Fake" news based on word patterns
model = LogisticRegression()
model.fit(X_train, y_train)
# if words like, hoax or scam -> likely fake
# if words like, official or study -> likely real

In [11]:
# now, the LogisticRegression model is trained
# use the test dataset to predict the output from the trained model
y_pred = model.predict(X_test)
print(y_pred)

[0 1 0 ... 0 1 0]


In [12]:
# evaluating performance
# compare predicted values with actual values
print(accuracy_score(y_test, y_pred))

0.9893939393939394


In [13]:
print(classification_report(y_test, y_pred))
# precision: % of predicted fake news, that was actually fake
# recall: % of actual fake news correctly detected

# f1-score: balance between precision and recall

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       970
           1       0.99      0.99      0.99      1010

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



## Random Forest

A powerful ensemble learning model which can work well with TF-IDF vectors for *text classification* tasks like *sentiment analysis*, *spam detection*, or topic categorization.

Main dataset is divided into batches of random datasets, then a decision tree is created for each of those batches. Because of this random sampling, the model is called random forest.

The `inputs_n` dataset is passed to each of those sub-decision trees, the most common output is then selected among all the outputs generated. 

In [14]:
# using random forest classifier to achieve the same
from sklearn.ensemble import RandomForestClassifier

In [20]:
# initialise and train the model on the training dataset
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)
y_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [22]:
# generate an accuracy and classification report
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9959595959595959
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       970
           1       1.00      0.99      1.00      1010

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

