<a href="https://colab.research.google.com/github/priyanshivmehta/NewsPrediction/blob/main/RealFakeNews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
from google.colab import files

In [12]:
# Ask for a file upload into the Colab runtime and keep the call's result in
# `uploaded` (the recorded run uploaded archive.zip).
uploaded = files.upload()

Saving archive.zip to archive (1).zip


In [27]:
import zipfile
import os
# Unpack the uploaded archive into ./dataset, then show what it contained.
# NOTE(review): the recorded upload output says the file was saved as
# "archive (1).zip" -- confirm "archive.zip" is the copy you mean to extract.
archive_path, extract_dir = "archive.zip", "dataset"
with zipfile.ZipFile(archive_path, "r") as archive:
  archive.extractall(extract_dir)
os.listdir(extract_dir)

['True.csv', 'Fake.csv']

In [28]:
import pandas as pd
# Read the two CSVs extracted from the archive and report their dimensions.
fake_path, real_path = "dataset/Fake.csv", "dataset/True.csv"
fake, real = pd.read_csv(fake_path), pd.read_csv(real_path)
print("Fake news shape:", fake.shape)
print("Real news shape:", real.shape)

Fake news shape: (23481, 4)
Real news shape: (21417, 4)


In [15]:
# Preview the first rows of each frame under its own heading.
for heading, frame in (("Fake news sample:", fake), ("Real news sample:", real)):
  print(heading)
  print(frame.head(),"\n")

Fake news sample:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017   

Real news sample:
                                               title  \
0  As U.S. budget fight looms, Republicans fli

In [16]:
# Target column: rows of the fake-news frame get 1, rows of the real-news frame get 0.
fake["label"], real["label"] = 1, 0

In [17]:
# Stack both frames into one dataset and shuffle the rows so the two classes
# are interleaved. The shuffle is seeded (matching the random_state=42 used by
# the other stochastic steps in this notebook) so that a Restart & Run All
# reproduces the same row order and therefore the same downstream results.
data = (
    pd.concat([fake, real], axis=0, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)  # discard the pre-shuffle positions
)
print(data.head())

                                               title  \
0   Trump Just Ridiculously Claimed Palestine And...   
1  Malaysian police say they foiled attack on SEA...   
2  Trump faces diplomatic hurdles during 28 hours...   
3  60 Minutes Uses “Fake News” Story To Destroy C...   
4  U.S. condemns Venezuelan elections as neither ...   

                                                text       subject  \
0  According to Donald Trump, Israel and Palestin...          News   
1  KUALA LUMPUR (Reuters) - Malaysian police thwa...     worldnews   
2  JERUSALEM (Reuters) - U.S. presidential trips ...  politicsNews   
3  Things got tense Sunday on 60 Minutes as CBS E...     left-news   
4  WASHINGTON (Reuters) - The United States on Mo...  politicsNews   

                 date  label  
0         May 3, 2017      1  
1  September 5, 2017       0  
2       May 19, 2017       0  
3        Mar 27, 2017      1  
4   October 16, 2017       0  


In [18]:
# Count how many rows carry each label value.
label_counts = data['label'].value_counts()
print("Label distribution:\n", label_counts)

Label distribution:
 label
1    23481
0    21417
Name: count, dtype: int64


In [19]:
import nltk
# Fetch the NLTK resources used later: the stopword corpus and the punkt
# tokenizer data. The value of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [20]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
# English stopwords held in a set, and a Porter stemmer instance; both are
# module-level names that preprocess() reads.
stop_words=set(stopwords.words('english'))
stemmer=PorterStemmer()

In [21]:
import nltk
# Also fetch 'punkt_tab' (and stopwords again, already present per the output).
# NOTE(review): presumably word_tokenize in the installed NLTK release looks up
# punkt_tab rather than punkt -- confirm against the NLTK version in use.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
def preprocess(text):
  """Normalise raw text into a space-joined string of stemmed tokens.

  Steps: lowercase, delete every character listed in ``string.punctuation``,
  tokenize with ``word_tokenize``, drop tokens present in ``stop_words`` and
  stem the remaining tokens with ``stemmer``.

  Args:
    text: The document to clean. ``None`` and float NaN (the value pandas
      uses for a missing cell) are treated as an empty document; any other
      non-string value is converted with ``str()`` first.

  Returns:
    The cleaned tokens joined by single spaces ('' when nothing is left).
  """
  # A missing cell would otherwise fail on .lower() with AttributeError.
  # `text != text` is true only for NaN.
  if text is None or (isinstance(text, float) and text != text):
    return ''
  text = str(text).lower()
  # Delete punctuation in a single pass instead of rebuilding the string
  # character by character; the result is the same for every input string.
  text = text.translate(str.maketrans('', '', string.punctuation))
  words = word_tokenize(text)
  return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

In [23]:
# Headline and body are joined with a single space, stored as 'full_text',
# and the cleaned version is stored alongside it as 'clean_text'.
title_and_body = data['title'] + " " + data['text']
data['full_text'] = title_and_body
data['clean_text'] = title_and_body.apply(preprocess)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
# TF-IDF over unigrams and bigrams, with the vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
)

In [34]:
# Learn the vocabulary from the cleaned text and vectorize it, then expand the
# result into a dense array for the steps that follow.
tfidf_matrix = tfidf.fit_transform(data['clean_text'])
X = tfidf_matrix.toarray()

In [35]:
import joblib
# Save the fitted vectorizer to tfidf.pkl so the same vocabulary can be
# reloaded later for prediction.
joblib.dump(tfidf, 'tfidf.pkl')

['tfidf.pkl']

In [36]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the label column, then encode that same column into y.
labels = data['label']
encoder = LabelEncoder().fit(labels)
y = encoder.transform(labels)

In [37]:
from imblearn.over_sampling import SMOTE
# Oversample with SMOTE (seeded) over the full feature matrix and labels.
# NOTE(review): resampling happens here, before the train/test split that
# follows, so rows that end up in the test set can influence synthetic training
# rows (and synthetic rows can land in the test set). Consider splitting first
# and resampling only the training portion; the reported metrics are
# presumably optimistic as a result -- confirm.
smote=SMOTE(random_state=42)
X_balanced, y_balanced=smote.fit_resample(X, y)

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [40]:
# Hold out 20% of the resampled rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    random_state=42,
    test_size=0.2,
)

In [41]:
# Random forest with default hyperparameters and a fixed seed, trained on the
# training split. The value returned by fit() is what the cell displays.
model = RandomForestClassifier(random_state=42)
model.fit(X_train,y_train)

In [42]:
# Save the trained classifier to model.pkl.
joblib.dump(model, 'model.pkl')

['model.pkl']

In [43]:
# Predicted labels for the held-out rows; compared against y_test below.
y_pred=model.predict(X_test)

In [44]:
# Evaluate on the held-out split. The classification report and the confusion
# matrix are multi-line values, so each is printed on the line after its label
# (sep="\n"); printing them on the label's line shifts the first row and breaks
# the column alignment.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.997657830299159
Classification report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4630
           1       1.00      1.00      1.00      4763

    accuracy                           1.00      9393
   macro avg       1.00      1.00      1.00      9393
weighted avg       1.00      1.00      1.00      9393

Confusion Matrix: [[4622    8]
 [  14 4749]]


In [49]:
from google.colab import files
# Write the vectorizer to disk once more, then hand both artifacts to the
# browser for download (model first, then vectorizer).
joblib.dump(tfidf, 'tfidf.pkl')
for artifact in ('model.pkl', 'tfidf.pkl'):
  files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [46]:
import joblib

# Restore the classifier and the vectorizer from disk, in that order.
# joblib files are pickles: only load artifacts that you produced yourself.
model, tfidf = (joblib.load(artifact) for artifact in ('model.pkl', 'tfidf.pkl'))


In [47]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk

# NLTK resources needed by preprocess(): tokenizer data and the stopword list.
# 'punkt_tab' is downloaded here too. The training part of this notebook had to
# fetch it in addition to 'punkt', so a fresh runtime that starts from this
# cell would otherwise be missing it when word_tokenize runs.
# NOTE(review): which of punkt / punkt_tab is actually read depends on the
# installed NLTK release -- confirm and drop the unused one if desired.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Module-level names read by preprocess().
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    Steps: lowercase, delete every character listed in ``string.punctuation``,
    tokenize with ``word_tokenize``, drop tokens present in ``stop_words`` and
    stem the remaining tokens with ``stemmer``.

    Args:
        text: The document to clean. ``None`` and float NaN are treated as an
            empty document; any other non-string value is converted with
            ``str()`` first.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    # Missing input would otherwise fail on .lower() with AttributeError.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Delete punctuation in a single pass instead of rebuilding the string
    # character by character; the result is the same for every input string.
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
# End-to-end check on one hand-written sentence.
sample_text = "Accelerated climate action can be a driver of growth and development"

# Clean the sentence with preprocess() before vectorizing it.
clean_text = preprocess(sample_text)
print("Preprocessed:", clean_text)

# transform() expects an iterable of documents, hence the one-element list.
vectorized_text = tfidf.transform([clean_text])
print("Vectorized shape:", vectorized_text.shape)

# Row 0 is the only document in the batch.
pred_proba = model.predict_proba(vectorized_text)[0]
pred = model.predict(vectorized_text)[0]

# 0/1 follow the labels assigned to the real/fake frames during training.
label_map = {0: 'Real', 1: 'Fake'}
prediction_label = label_map[pred] if pred in label_map else 'Unknown'
# Confidence is the largest class probability, expressed as a percentage.
confidence = round(max(pred_proba) * 100, 2)

print("Prediction:", prediction_label)
print("Confidence:", confidence, "%")


Preprocessed: acceler climat action driver growth develop
Vectorized shape: (1, 5000)
Prediction: Fake
Confidence: 99.86 %
