In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [3]:
df = pd.read_csv("WELFake_Dataset_preprocessed.csv")

In [4]:
df.shape

(61937, 2)

In [5]:
x = df['text']
y = df['label']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state=42)

In [8]:
# Fill missing values
x_train = x_train.fillna("")
x_test = x_test.fillna("")

# Ensure all data is string type
x_train = x_train.astype(str)
x_test = x_test.astype(str)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

LOGISTIC REGRESSION

In [10]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(xv_train, y_train)
pred_lr = LR.predict(xv_test)
print(LR.score(xv_test, y_test))
print (classification_report(y_test, pred_lr))


0.9492250565062964
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      6813
           1       0.95      0.94      0.94      5575

    accuracy                           0.95     12388
   macro avg       0.95      0.95      0.95     12388
weighted avg       0.95      0.95      0.95     12388



DECISION TREE CLASSIFIER

In [11]:
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)
pred_dt = DT.predict(xv_test)
print(DT.score(xv_test, y_test))
print (classification_report(y_test, pred_lr))

0.9040200193735873
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      6813
           1       0.95      0.94      0.94      5575

    accuracy                           0.95     12388
   macro avg       0.95      0.95      0.95     12388
weighted avg       0.95      0.95      0.95     12388



GRADIENT BOOSTING CLASSIFIER

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

GB = GradientBoostingClassifier(random_state = 0)
GB.fit(xv_train, y_train)
pred_gb = GB.predict(xv_test)
print(GB.score(xv_test, y_test))
print(classification_report(y_test, pred_gb))

0.9290442363577656
              precision    recall  f1-score   support

           0       0.95      0.92      0.93      6813
           1       0.91      0.93      0.92      5575

    accuracy                           0.93     12388
   macro avg       0.93      0.93      0.93     12388
weighted avg       0.93      0.93      0.93     12388



RANDOM FOREST CLASSIFIER

In [13]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(random_state = 0)
RF.fit(xv_train, y_train)
pred_rf = RF.predict(xv_test)
print(RF.score(xv_test, y_test))
print (classification_report(y_test, pred_rf))

0.9121730707135938
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      6813
           1       0.93      0.87      0.90      5575

    accuracy                           0.91     12388
   macro avg       0.92      0.91      0.91     12388
weighted avg       0.91      0.91      0.91     12388



TEST

In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure stopwords and lemmatizer are downloaded
nltk.download('stopwords')
nltk.download('wordnet')

def clean_text(text):
    # 1. Convert text to lowercase
    text = text.lower()
    
    # 2. Remove HTML tags (if any)
    text = re.sub(r'<.*?>', '', text)
    
    # 3. Remove special characters, numbers, and keep only alphabets and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # 4. Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # 5. Remove stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # 6. Lemmatization (to get the root form of words)
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
def output_label(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Real News"

def manual_testing(news):
        
    # Display the input news
    print(f"Input News: {news}\n")
    
    # Create a DataFrame for the testing news
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    
    # Clean the text
    new_def_test['text'] = new_def_test["text"].apply(clean_text)
    
    # Transform the text using vectorization
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    
    # Make predictions using the models
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GB = GB.predict(new_xv_test)
    pred_RF = RF.predict(new_xv_test)
    
    # Print the predictions
    print("\n\nLR Prediction: {} \nDT Prediction: {} \nGBC Prediction: {} \nRFC Prediction: {}".format(
        output_label(pred_LR[0]),
        output_label(pred_DT[0]),
        output_label(pred_GB[0]),
        output_label(pred_RF[0])
    ))


In [16]:
news = str(input()) 
manual_testing(news)

Input News: RIO DE JANEIRO/SAO PAULO (Reuters) - Billionaire Marcelo Odebrecht, the highest-profile executive imprisoned in Brazil s massive graft scandal, was released from jail on Tuesday to continue his sentence for corruption under house arrest, according to a federal court. The former chief executive officer of Odebrecht SA [ODBES.UL], Latin America s largest construction firm, was arrested in 2015 during an investigation dubbed Car Wash that exposed billions of dollars in kickbacks to politicians and executives at state-run companies in exchange for inflated contracts. Odebrecht was set to travel to Sao Paulo to begin his house arrest under electronic surveillance on Tuesday, according to the federal court in Parana. A representative for the former executive said he remained committed to collaborating with authorities under a leniency deal.  Odebrecht was first sentenced to 19 years in prison in one of the many cases related to Car Wash. That was reduced to 10 years after he sign

In [17]:
news = str(input()) 
manual_testing(news)




LR Prediction: Real News 
DT Prediction: Real News 
GBC Prediction: Real News 
RFC Prediction: Real News


In [18]:
import pickle

# Save the trained models and vectorizer
with open('lr_model.pkl', 'wb') as f:
    pickle.dump(LR, f)

with open('dt_model.pkl', 'wb') as f:
    pickle.dump(DT, f)

with open('gb_model.pkl', 'wb') as f:
    pickle.dump(GB, f)

with open('rf_model.pkl', 'wb') as f:
    pickle.dump(RF, f)

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorization, f)


USING SAVED MODELS TO RUN 

In [19]:
import pickle
import pandas as pd

# Step 1: Load the saved models and vectorizer
with open('lr_model.pkl', 'rb') as f:
    LR = pickle.load(f)

with open('dt_model.pkl', 'rb') as f:
    DT = pickle.load(f)

with open('gb_model.pkl', 'rb') as f:
    GB = pickle.load(f)

with open('rf_model.pkl', 'rb') as f:
    RF = pickle.load(f)

with open('vectorizer.pkl', 'rb') as f:
    vectorization = pickle.load(f)

# Step 2: Define the manual_testing function to test new input
def manual_testing(news):
    # Display the input news
    print(f"Input News: {news}\n")
    
    # Prepare the test data as a DataFrame
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    
    # Clean the text (apply your cleaning function here)
    new_def_test['text'] = new_def_test["text"].apply(clean_text)  # Assuming you have a clean_text function
    
    # Vectorize the input text using the loaded vectorizer
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    
    # Step 3: Make predictions using the loaded models
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GB = GB.predict(new_xv_test)
    pred_RF = RF.predict(new_xv_test)
    
    # Step 4: Output the predictions
    print("\n\nLR Prediction: {} \nDT Prediction: {} \nGBC Prediction: {} \nRFC Prediction:{}".format(
        output_label(pred_LR[0]),
        output_label(pred_DT[0]),
        output_label(pred_GB[0]),
        output_label(pred_RF[0])
    ))

# Step 5: Input a news article and call the testing function
news = str(input("Enter news text: "))
manual_testing(news)





LR Prediction: Real News 
DT Prediction: Real News 
GBC Prediction: Real News 
RFC Prediction:Real News
