In [1]:
# install necessary libraries

!pip install gradio
!pip install contractions

Collecting gradio
  Downloading gradio-4.8.0-py3-none-any.whl (16.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.104.1-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.7.1 (from gradio)
  Downloading gradio_client-0.7.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.9/302.9 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from gradio)
  Downloading httpx-0.25.2-py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import gradio as gr
import torch
import torch.nn as nn
import torch.optim as optim
import re
import nltk
import numpy as np
import pandas as pd
import torchtext
import itertools

from tqdm import tqdm
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.nn.modules.loss import _WeightedLoss

from torch.utils.data import DataLoader, TensorDataset


from nltk.corpus import stopwords
from contractions import contractions_dict
from nltk.stem import WordNetLemmatizer
from collections import Counter


In [3]:
# connect to Google Drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
stop_words = set(stopwords.words("english"))
print(stop_words)

{'be', 'where', "couldn't", 'hasn', 'too', 'below', 'has', 'ma', "shan't", 'him', "aren't", 'yourselves', "didn't", 'isn', 't', "it's", 'd', 'here', 'his', 'each', 'didn', 'ain', 'both', 'themselves', 'why', 'me', 'you', 'what', 'my', 'had', 'that', 'very', 'most', "needn't", "wouldn't", 'the', 'needn', 'out', 'by', "you've", 'down', 'mightn', 'during', 'nor', 'when', 'now', 'so', 'for', 'am', 'i', 'again', 'don', 'should', 'to', "won't", 'at', 'under', 'they', 'how', 'further', 'because', 'y', "you'll", 'hers', 'have', 'but', 'about', "she's", 'weren', 'other', 'them', "wasn't", "should've", 'are', 'few', 'only', 'been', 'm', 'of', 'from', 'all', 'own', 'just', 'these', 'whom', 'your', 'after', 'over', 'haven', 'does', "mightn't", 'with', 'no', 'theirs', 'up', 'shouldn', 'won', 'itself', 'who', 'any', 'were', 'was', 'into', 'yours', 'once', 'it', 'their', "doesn't", 'wouldn', 'he', 'doing', 'o', 'll', 'she', 'do', 'there', 're', 'did', 'its', 'if', 'then', 'ours', 'same', "mustn't", '

In [6]:
# expands contractions, converts text to lowercase, removes URLs and parentheses, eliminates double quotes, lemmatizes words, and removes stopwords on a dataframe

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# expanding contractions
def expand(text):
    for c, e in contractions_dict.items():
        text = text.replace(c, e)
    return text

def clean_text(text):
    text = expand(text)
    # print(text)
    text = text.lower()
    # print(text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # print(text)
    text = re.sub(r'\([^)]*\)', '', text)
    # print(text)
    text = re.sub('"', '', text)
    # print(text)
    words = [lemmatizer.lemmatize(w) for w in re.findall(r'\b\w+\b', text) if w not in stop_words]

    return " ".join(words).strip()


In [7]:
# Load Vocabulary that we created from the Google Drive

def load_vocab(vocab_path):
    vocab = torch.load(vocab_path)
    return vocab

vocab_path = '/content/drive/MyDrive/Models/vocab.pth'
vocabulary = load_vocab(vocab_path)

In [8]:
class HybridCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_labels, num_filters, filter_sizes, hidden_size, lstm_layers, dropout_prob):
        super(HybridCNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.conv_layers = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=fs)
            for fs in filter_sizes
        ])

        self.lstm = nn.LSTM(
            input_size=num_filters * len(filter_sizes),
            hidden_size=hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
            dropout=dropout_prob if lstm_layers > 1 else 0
        )

        self.fc = nn.Linear(hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.permute(0, 2, 1)
        conv_outputs = [F.relu(conv(embedded)) for conv in self.conv_layers]
        pooled_outputs = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conv_outputs]
        cat_output = torch.cat(pooled_outputs, dim=1)
        lstm_out, _ = self.lstm(cat_output.unsqueeze(1))
        lstm_out = lstm_out[:, -1, :]

        fc_out = self.fc(lstm_out)
        output = self.dropout(fc_out)

        return output


vocab_size = len(vocabulary)
embedding_dim = 200
num_labels = 2
num_filters = 100
filter_sizes = [2, 3, 4]
hidden_size = 128
lstm_layers = 1
dropout_prob = 0.5

model = HybridCNN(vocab_size, embedding_dim, num_labels, num_filters, filter_sizes, hidden_size, lstm_layers, dropout_prob)
model.load_state_dict(torch.load('/content/drive/MyDrive/Models/best_hybridcnn_model2.pth'))

<All keys matched successfully>

In [9]:
# map the tokenized representations of given news to numerical indices using our vocabulary
def number_tokens(tokens):
    return [vocabulary[token] if token in vocabulary else vocabulary['<unk>'] for token in tokens]

# preprocess the gives news text and predict the label
def predict(text):
    processed_text = clean_text(text)

    # print(processed_text)
    tokenizer = get_tokenizer("basic_english")
    # print(tokenizer)
    tokens = tokenizer(processed_text)
    # print(tokens)
    numericalized_text = number_tokens(tokens)
    # print(numericalize_tokens)

    padded_text = pad_sequence([torch.tensor(numericalized_text)], batch_first=True)
    # print(padded_text)

    # Make prediction using the trained model
    model.eval()
    with torch.no_grad():
        output = model(padded_text)

    # print(output)
    prob = F.softmax(output, dim=1)
    prediction = torch.argmax(prob).item()
    # print(predicted_class)
    label = "True News" if prediction == 0 else "Fake News"
    return label

In [10]:
# web interface using gradio

iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(),
    outputs=gr.Label(num_top_classes=2),
    title="Fake News Detection"
)

iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://abce26426bbbb9a98e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


