# Testing model
Testing our trained models (the SVM fact-vs-opinion classifier and the LSTM real-vs-fake classifier) on a real-world news article.

In [52]:
import numpy as np
import pandas as pd 
import string
import re 

# nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

#sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Tensors
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import gensim

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [53]:
# Connect to Google Drive (Colab only).
# NOTE(review): the mount call below is commented out, yet later cells read
# pickled models from /content/drive/MyDrive/saves/. Presumably the drive was
# already mounted in the session that produced the saved outputs; on a fresh
# runtime the mount has to be re-enabled — TODO confirm.

from google.colab import drive
# drive.mount('/content/drive')

# Fact vs. Opinion classification (SVM)

In [54]:
# Sample input: a genuine news article taken from CNBC.
# Source: https://www.cnbc.com/2022/11/17/microsoft-satya-nadella-is-very-bullish-on-asia-china-and-india.html
article_title = 'TECH Microsoft’s Satya Nadella says he is ‘very, very bullish’ on Asia, especially China and India'
article_body = 'The CEO of Microsoft says he is bullish about Asia, especially China and India, as Microsoft plans to build more data centers around the world. “Absolutely. We’re very, very bullish about what’s happening in Asia,” Satya Nadella, chairman and CEO of Microsoft, told CNBC’s Tanvir Gill in an interview Thursday, adding that Microsoft is investing in at least 11 regions. “We’re absolutely committed to all of these countries and in China too. Today, we primarily work to support multinational companies that operate in China and multinational companies out of China.” He also added that India has been a “massive growth market” after emerging from the pandemic. “Microsoft’s presence in India was about mostly multinational companies operating in India. But for now, it’s completely changed,” he said.'

# One list entry per column, i.e. a single-row table.
data = {
    'title': [article_title],
    'text': [article_body],
    'subject': ['Business'],
    'date': ['NOV 16 2022'],
}

# Wrap the article in a DataFrame and display it.
df = pd.DataFrame(data)
df

Unnamed: 0,title,text,subject,date
0,TECH Microsoft’s Satya Nadella says he is ‘ver...,The CEO of Microsoft says he is bullish about ...,Business,NOV 16 2022


In [55]:
# Vectorise the article body with TF-IDF (unigrams only, at most 3600 features).
#
# NOTE(review): the vectorizer is fitted on the single test article, so its
# vocabulary and IDF weights are derived from this one document rather than
# from the training corpus. The classifier loaded later presumably expects
# the feature space of the training-time vectorizer; that vectorizer should
# be persisted with the model and applied here via .transform() — TODO confirm.
text = df['text']
Tfidf_vect = TfidfVectorizer(max_features=3600, ngram_range=(1, 1))

# fit_transform() learns the vocabulary and returns the document-term matrix
# in one pass; the separate fit() call that used to precede it was redundant.
text_tfidf = Tfidf_vect.fit_transform(text)

## Load SVM model

In [None]:
# Location of the pickled fact-vs-opinion classifier on the mounted Drive.
filepath = "/content/drive/MyDrive/saves/factvsopi_model.pkl"

# Load saved model.
# SECURITY: read_pickle unpickles arbitrary Python objects and can execute
# code embedded in the file; only load model files from a trusted source.
SVM = pd.read_pickle(filepath)
result = SVM.predict(text_tfidf)

# Report every prediction individually instead of relying on the truthiness
# of the whole result array, which raises ValueError as soon as more than one
# article is classified.
# Assumes a truthy label marks the "opinion" class and a falsy one the "fact"
# class — TODO confirm against the label encoding used at training time.
for label in result:
    if label:
        print("This is opinion based news => Classified as Fake")
    else:
        print("This is fact based news")

# LSTM classification

In [57]:
# Merge title and body into one text field and drop the columns that are not
# needed. The guard makes the cell safe to re-run: once "title" has been
# dropped, a second execution would otherwise raise a KeyError.
if "title" in df.columns:
    df["text"] = df["title"] + " " + df["text"]
    df = df.drop(["subject", "date", "title"], axis=1)

# Tokenise each article: split into sentences, lower-case, keep \w+ tokens and
# discard English stop words as well as single-character tokens.
stop_words = set(nltk.corpus.stopwords.words("english"))
# Named differently from the Keras `tokenizer` below so the two unrelated
# objects no longer share (and overwrite) one name.
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

X = []
for par in df["text"].values:
    tmp = []
    for sent in nltk.sent_tokenize(par):
        tokens = word_tokenizer.tokenize(sent.lower())
        # \w+ matches never contain whitespace, so no strip() is required.
        tmp.extend(w for w in tokens if w not in stop_words and len(w) > 1)
    X.append(tmp)

# Dimension of vectors we are generating
EMBEDDING_DIM = 100

# Creating Word Vectors by Word2Vec Method (takes time...)
# gensim 4.0 renamed the `size` argument to `vector_size`; choose the keyword
# that matches the installed version so the call works on both 3.x and 4.x.
dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
# NOTE(review): w2v_model is not used by any cell visible here — confirm it is
# still needed at inference time.
w2v_model = gensim.models.Word2Vec(
    sentences=X, window=5, min_count=1, **{dim_kwarg: EMBEDDING_DIM}
)

# NOTE(review): this Tokenizer is fitted on the test article only, so the
# integer ids it assigns are unrelated to the vocabulary the LSTM was trained
# with. The training-time tokenizer should presumably be saved with the model
# and loaded here instead — TODO confirm.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

X = tokenizer.texts_to_sequences(X)

maxlen = 700

# Pad (or truncate) every article to exactly `maxlen` token ids
X = pad_sequences(X, maxlen=maxlen)



In [58]:
# Location of the pickled LSTM classifier on the mounted Drive.
filepath = "/content/drive/MyDrive/saves/lstm_model.pkl"

# Load saved model.
# SECURITY: read_pickle unpickles arbitrary Python objects and can execute
# code embedded in the file; only load model files from a trusted source.
model = pd.read_pickle(filepath)

# Threshold the model output at 0.5 to obtain integer class labels.
y_pred = (model.predict(X) >= 0.5).astype("int")

# Report every prediction individually; testing the truthiness of the whole
# array only works when it holds exactly one element.
# Assumes the model emits a single score per article — TODO confirm.
for label in y_pred.ravel():
    if label:
        print("Classified as Real news!!")
    else:
        print("Classified as Fake news!!")


Classified as Real news!!
