In [1]:
import json
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
def train_model(x_train, x_test, y_train, y_test, model):
    pipe1 = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('model', model)])

    trained_model = pipe1.fit(x_train, y_train)
    prediction = trained_model.predict(x_test)
    
    accuracy = round(accuracy_score(y_test, prediction)*100, 2)
    results[f"{model.__class__.__name__}-same_set"].append(trained_model)
    results[f"{model.__class__.__name__}-same_set"].append(accuracy)
    print(f"Accuracy of {model.__class__.__name__} Classifier: {accuracy}%")
    print(f"Confusion Matrix of {model.__class__.__name__} Classifier:")
    print(confusion_matrix(y_test, prediction))
    print(f"CLassification Report of {model.__class__.__name__} Classifier:")
    print(classification_report(y_test, prediction))
    return trained_model

def test_model(article_bodies, article_labels, model, dataset_no, model_name):
    prediction = model.predict(article_bodies)
    
    accuracy = round(accuracy_score(article_labels, prediction)*100, 2)
    if f"{model_name.__class__.__name__}-data_set{dataset_no}" not in results:
        results[f"{model_name.__class__.__name__}-data_set{dataset_no}"] = [accuracy]
    else:
        results[f"{model_name.__class__.__name__}-data_set{dataset_no}"].append(accuracy)
    print(f"Accuracy of {model_name.__class__.__name__} Classifier: {accuracy}%")
    print(f"Confusion Matrix of {model_name.__class__.__name__} Classifier:")
    print(confusion_matrix(article_labels, prediction))
    print(f"CLassification Report of {model_name.__class__.__name__} Classifier:")
    print(classification_report(article_labels, prediction))
    
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [3]:
models = [LogisticRegression(), LinearSVC(), MultinomialNB(), SGDClassifier(), RandomForestClassifier()]

In [138]:
datasets = []
for i in range(5):
    data = pd.read_csv(f"dataset{i}.csv")
    datasets.append(data)

In [142]:
for i in range(len(datasets)):
    datasets[i] = datasets[i].sample(frac = 1)
    datasets[i].reset_index(inplace = True)
    datasets[i].drop(["index"], axis = 1, inplace = True)
    datasets[i]["text"] = datasets[i]["text"].apply(wordopt)

                                                text  label
0  so  anyway  what s the deal  is trump just try...      1
1  on thursday  the u  s  military published a re...      0
2    by bar    tens of thousands have demonstrate...      1
3  facebook will stop  fake news  using a similar...      0
4  imidacloprid sulfoxaflor aldi has made a name ...      1
                                                text  label
0   reuters    u s  senate republicans are making...      1
1  when the majority of students  regardless of c...      0
2  well  that didn t take long  look for the firs...      0
3  winston salem  n c   reuters    a state judge ...      1
4  over the course of five days  tulsa deputies w...      0
                                                text  label
0  the empire files  inside palestine s refugee c...      0
1  professor shares insights on paranormal  canni...      0
2   before it s news  what the video is about  em...      0
3  this video is really disturbing     n

In [4]:
results = {}
for model in models:
    results[f"{model.__class__.__name__}-same_set"] = []
print(results)

{'LogisticRegression-same_set': [], 'LinearSVC-same_set': [], 'MultinomialNB-same_set': [], 'SGDClassifier-same_set': [], 'RandomForestClassifier-same_set': []}


In [1]:
for i in range(len(datasets)):
    x_train, x_test, y_train, y_test = train_test_split(datasets[i].text, datasets[i].label, test_size=0.2, random_state=1)
    for model in models:
        print(f"For dataset {i}:")
        trained_model = train_model(x_train, x_test, y_train, y_test, model)
        for j in range(len(datasets)):
            if i==j:
                  continue
            print(f"Testing on dataset {j}:")
            test_model(datasets[j].text, datasets[j].label, trained_model, i, model)

In [5]:
dataset = pd.read_csv("dataset4.csv")
x_train, x_test, y_train, y_test = train_test_split(dataset.text, dataset.label, test_size=0.2, random_state=1)
for model in models:
    trained_model = train_model(x_train, x_test, y_train, y_test, model)

Accuracy of LogisticRegression Classifier: 96.68%
Confusion Matrix of LogisticRegression Classifier:
[[251  15]
 [  2 244]]
CLassification Report of LogisticRegression Classifier:
              precision    recall  f1-score   support

           0       0.99      0.94      0.97       266
           1       0.94      0.99      0.97       246

    accuracy                           0.97       512
   macro avg       0.97      0.97      0.97       512
weighted avg       0.97      0.97      0.97       512

Accuracy of LinearSVC Classifier: 99.22%
Confusion Matrix of LinearSVC Classifier:
[[263   3]
 [  1 245]]
CLassification Report of LinearSVC Classifier:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       266
           1       0.99      1.00      0.99       246

    accuracy                           0.99       512
   macro avg       0.99      0.99      0.99       512
weighted avg       0.99      0.99      0.99       512

Accuracy of M

In [6]:
model = results['LinearSVC-same_set'][0]

In [7]:
import newspaper
fake = ['https://www.theonion.com/wealthy-teen-nearly-experiences-consequence-1819570166',
       'https://www.theonion.com/why-cant-i-sell-any-of-these-fucking-bibles-1819583497',
       'https://www.theonion.com/bush-our-long-national-nightmare-of-peace-and-prosperi-1819565882',
       'https://www.theonion.com/fuck-everything-were-doing-five-blades-1819584036',
       'https://www.theonion.com/evangelical-scientists-refute-gravity-with-new-intellig-1819567984',
       'https://www.theonion.com/black-guy-asks-nation-for-change-1819569703',
       'https://www.theonion.com/hijackers-surprised-to-find-selves-in-hell-1819566162',
       'https://www.theonion.com/pope-francis-worried-about-job-security-after-butting-h-1819578788',
       'https://www.theonion.com/fun-toy-banned-because-of-three-stupid-dead-kids-1819565691',
       'https://www.theonion.com/area-man-passionate-defender-of-what-he-imagines-consti-1819571149']

In [38]:
true = ['https://www.hindustantimes.com/world-news/explosives-set-off-to-bring-down-rest-of-collapsed-condo-in-florida-101625456836623.html',
       'https://www.hindustantimes.com/world-news/blast-at-thai-factory-shakes-bangkok-airport-area-evacuated-101625460300340.html',
       'https://www.hindustantimes.com/cities/patna-news/2-more-areas-in-bihar-s-champaran-achieve-100-first-dose-vaccination-101625458808289.html',
       'https://www.hindustantimes.com/india-news/govt-presents-draft-bill-to-check-human-trafficking-suggestions-invited-101625458954199.html',
       'https://www.hindustantimes.com/india-news/cowin-to-go-global-today-50-countries-show-interest-101625457771714.html',
       'https://www.thehindu.com/news/international/scale-details-of-massive-kaseya-ransomware-attack-emerge/article35141077.ece?homepage=true',
       'https://www.thehindu.com/news/international/blast-at-thai-factory-shakes-bangkok-airport-area-evacuated/article35141393.ece?homepage=true',
       'https://www.thehindu.com/news/international/taliban-captures-several-districts-in-afghanistan/article35134805.ece?homepage=true',
       'https://www.ndtv.com/india-news/stan-swamy-84-year-old-tribal-rights-activist-arrested-under-anti-terrorism-law-last-year-dies-after-prolonged-illness-2479602?trendingnow',
       'https://www.ndtv.com/india-news/pranab-mukherjees-son-abhijit-mukherjee-to-join-trinamool-congress-today-sources-2479627?trendingnow',
       'https://www.ndtv.com/india-news/12-bjp-mlas-in-maharashtra-disqualified-for-abusing-speaker-devendra-fadnavis-says-allegations-false-2479634?trendingnow',
       'https://www.ndtv.com/india-news/covaxin-brazil-deal-brazil-contradicts-bharat-biotech-claim-on-emergency-approval-for-covaxin-2479694?trendingnow',
       'https://www.ndtv.com/delhi-news/central-market-rui-mandi-shut-2-more-delhi-markets-shut-for-violation-of-covid-norms-2479438?trendingnow',
       'https://www.ndtv.com/world-news/afghanistan-over-1-000-afghan-troops-flee-taliban-into-tajikistan-2479677?trendingnow',
       'https://allthatsinteresting.com/jared-vaughn',
       'https://allthatsinteresting.com/zodiac-killer-cipher-solved',
       'https://allthatsinteresting.com/first-americans',
       'https://allthatsinteresting.com/raffaela-weyman',
       'https://allthatsinteresting.com/jarvensuo',
       'https://www.news.com.au/lifestyle/real-life/true-stories/millions-left-to-charities-after-queenstown-mans-cancer-death/news-story/1f0ae0974cc7c7e8c7e495202d13f1de',
       'https://www.news.com.au/lifestyle/real-life/tragic-reason-14yearold-boy-was-in-burning-house-that-killed-him/news-story/feadffabe363a8b4b4bc793e6574ffe7',
       'https://www.news.com.au/lifestyle/real-life/true-stories/woman-freaks-out-over-grim-shower-find-in-her-home/news-story/800a1325ffc053876a7451c0d1c9c709',
       'https://www.news.com.au/world/coronavirus/global/covid19-uk-strain-more-transmissible-than-delta-hits-uk/news-story/4680c0a91a079cc00143cfdddddb20ba',
       'https://www.news.com.au/travel/travel-updates/travel-stories/this-chinese-town-is-a-tourist-hotspot-except-its-all-fake/news-story/45c0f982136cf8482c1c686c761e62d0',
       'https://www.news.com.au/world/south-america/high-numbers-of-children-dying-from-covid-in-brazil-as-nations-death-toll-rises/news-story/7e7ff610a566fda2af950d8f3506f789']
print(len(true))

25


In [49]:
# fake_results = []
real_results = []
# fake_articles = []
real_articles = []
# for url in fake:
#     article = newspaper.Article(url = url)
#     article.download()
#     try:
#         article.parse()
#     except:
#         print('yo')
#         pass
#     text = article.text
#     text = wordopt(text)
#     fake_articles.append(text)
#     fake_results.append(model.predict([text])[0])
    
    
for url in true:
    article = newspaper.Article(url = url)
    article.download()
    print(article.html)
#     try:
    article.parse()
#     except:
#         print('yo2')
#         pass
    text = article.text
    text = wordopt(text)
    real_articles.append(text)
    real_results.append(model.predict([text])[0])




ArticleException: Article `download()` failed with HTTPSConnectionPool(host='www.hindustantimes.com', port=443): Read timed out. on URL https://www.hindustantimes.com/world-news/explosives-set-off-to-bring-down-rest-of-collapsed-condo-in-florida-101625456836623.html

In [44]:
print(fake_results)
print(real_results)
# print(real_articles)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1]


In [20]:
test_true_news = "Demolition crews set off explosives late Sunday to bring down the damaged remaining portion of a collapsed South Florida condo building, a key step to resuming the search for victims as rescuers possibly gain access to new areas of the rubble. A number of explosions could be heard and then the building started to fall, sending up massive plumes of dust into the air in the surrounding neighborhoods. Crews were to begin clearing some of the new debris so rescuers could start making their way into parts of the underground garage that is of particular interest. Once there, they were hoping to get a clearer picture of voids that may exist in the rubble and could possibly harbor survivors. The precarious, still-standing portion of a collapsed structure was rigged with explosive charges and set for demolition, after suspending the search-and-rescue mission. Through the night, rescuers were awaiting the 'all-clear' after the demolition so they could dive back into the task of trying to locate any survivors buried under the rubble. Officials had previously said that the search could resume from 15 minutes to an hour after the detonation. 'We are standing by. We are ready to go in, no matter the time of night,' Levine Cava told a news conference earlier Sunday night. No one has been rescued alive since the first hours after the June 24 collapse. Rescuers are hoping the demolition will give them access for the first time to parts of the garage area that are a focus of interest. Once a new pathway into the initial rubble is secure, “we will go back to the debris pile, and we’ll begin our search and rescue efforts,” Miami-Dade Fire Chief Albert Cominsky said. The decision to demolish the Surfside building came after concerns mounted that the damaged structure was at risk of falling, endangering the crews below and preventing them from operating in some areas. Parts of the remaining building shifted on Thursday, prompting a 15-hour suspension in the work. An approaching storm added urgency to the concerns."
test_true_news = wordopt(test_true_news)
print(model.predict([test_true_news]))

[1]


In [18]:
print(true)

['https://www.hindustantimes.com/world-news/explosives-set-off-to-bring-down-rest-of-collapsed-condo-in-florida-101625456836623.html', 'https://www.hindustantimes.com/world-news/blast-at-thai-factory-shakes-bangkok-airport-area-evacuated-101625460300340.html', 'https://www.hindustantimes.com/cities/patna-news/2-more-areas-in-bihar-s-champaran-achieve-100-first-dose-vaccination-101625458808289.html', 'https://www.hindustantimes.com/india-news/govt-presents-draft-bill-to-check-human-trafficking-suggestions-invited-101625458954199.html', 'https://www.hindustantimes.com/india-news/cowin-to-go-global-today-50-countries-show-interest-101625457771714.html', 'https://www.thehindu.com/news/international/scale-details-of-massive-kaseya-ransomware-attack-emerge/article35141077.ece?homepage=true', 'https://www.thehindu.com/news/international/blast-at-thai-factory-shakes-bangkok-airport-area-evacuated/article35141393.ece?homepage=true', 'https://www.thehindu.com/news/international/taliban-captures-