In [42]:
import pandas as pd
import gdown
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

In [3]:
import nltk

# Fetch the NLTK stop-word corpus; the stemming step filters tokens against it.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Downloading dataset

In [4]:
# Google Drive id of the zipped dataset and the direct-download URL derived from it.
file_id = "1qvA4g6pebwpoebQVbMcBVegRshLTZkUB"
url = "https://drive.google.com/uc?id=" + file_id
output_zip = "fake_news_prediction.zip"

# Kept as the cell's last expression so the notebook echoes the value returned by gdown.
gdown.download(url, output_zip, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1qvA4g6pebwpoebQVbMcBVegRshLTZkUB
To: /content/fake_news_prediction.zip
100%|██████████| 12.0M/12.0M [00:00<00:00, 30.3MB/s]


'fake_news_prediction.zip'

In [5]:
# Shell escape: extract the downloaded archive into the "extracted file" directory (relative to the working directory).
!unzip "fake_news_prediction.zip" -d "extracted file"

Archive:  fake_news_prediction.zip
  inflating: extracted file/fake_or_real_news.csv  


In [6]:
# Read the CSV from the same relative directory the unzip cell extracted into,
# instead of a hardcoded absolute Colab path, so the notebook also runs when the
# working directory is not /content. The first CSV column is used as the index.
df = pd.read_csv("extracted file/fake_or_real_news.csv", index_col=0)
print(df.head())
df.shape

                                                   title  \
8476                        You Can Smell Hillary’s Fear   
10294  Watch The Exact Moment Paul Ryan Committed Pol...   
3608         Kerry to go to Paris in gesture of sympathy   
10142  Bernie supporters on Twitter erupt in anger ag...   
875     The Battle of New York: Why This Primary Matters   

                                                    text label  
8476   Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
10294  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
3608   U.S. Secretary of State John F. Kerry said Mon...  REAL  
10142  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
875    It's primary day in New York and front-runners...  REAL  


(6335, 3)

In [7]:
# Count missing values in each column (displayed as the cell's output).
df.isnull().sum()

Unnamed: 0,0
title,0
text,0
label,0


In [8]:
# Display the NLTK English stop-word list that is filtered out during stemming.
print(stopwords.words("english"))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Combining title and text column

In [9]:
# Build one text feature per article: the title, a single space, then the body text.
df["content"] = df["title"] + " " + df["text"]
print(df["content"])

8476     You Can Smell Hillary’s Fear Daniel Greenfield...
10294    Watch The Exact Moment Paul Ryan Committed Pol...
3608     Kerry to go to Paris in gesture of sympathy U....
10142    Bernie supporters on Twitter erupt in anger ag...
875      The Battle of New York: Why This Primary Matte...
                               ...                        
4490     State Department says it can't find emails fro...
8062     The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
8622     Anti-Trump Protesters Are Tools of the Oligarc...
4021     In Ethiopia, Obama seeks progress on peace, se...
4330     Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: content, Length: 6335, dtype: object


Stemming:

In [10]:
port_stem = PorterStemmer()

# Built once. stopwords.words('english') returns a list, so calling it for every
# token repeats the lookup and forces a linear scan per word; a frozenset holds
# the same words and gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
  """Normalize raw article text into a space-joined string of stemmed tokens.

  Steps: replace every character outside a-z/A-Z with a space, lowercase,
  split on whitespace, drop NLTK English stop words, then Porter-stem the
  remaining tokens.

  Args:
    content: Raw text (str).

  Returns:
    str: The stemmed tokens joined by single spaces; an empty string when no
    token survives the filtering.
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  tokens = letters_only.lower().split()
  stemmed_tokens = [
      port_stem.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS
  ]
  return ' '.join(stemmed_tokens)

In [11]:
# Replace each combined article string with its stemmed, stop-word-free form.
# Note: this overwrites the column in place, so re-running the cell stems the
# already-stemmed text again.
df["content"] = df["content"].apply(stemming)
print(df["content"])

8476     smell hillari fear daniel greenfield shillman ...
10294    watch exact moment paul ryan commit polit suic...
3608     kerri go pari gestur sympathi u secretari stat...
10142    berni support twitter erupt anger dnc tri warn...
875      battl new york primari matter primari day new ...
                               ...                        
4490     state depart say find email clinton specialist...
8062     p pb stand plutocrat pentagon p pb stand pluto...
8622     anti trump protest tool oligarchi inform anti ...
4021     ethiopia obama seek progress peac secur east a...
4330     jeb bush suddenli attack trump matter jeb bush...
Name: content, Length: 6335, dtype: object


In [12]:
# Separate the data and the label as arrays.
X, y = df["content"].values, df["label"].values

In [13]:
# Learn the TF-IDF vocabulary and weights from the stemmed texts, then replace X
# with its sparse TF-IDF matrix.
# NOTE(review): the vectorizer is fit on every row before the train/test split
# performed in a later cell, so held-out rows influence the learned features.
vectorizer = TfidfVectorizer().fit(X)
X = vectorizer.transform(X)

In [14]:
# Encode the string labels as integer class ids, then show the features and labels.
le = LabelEncoder().fit(y)
y = le.transform(y)
print(X)
print(y)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1608995 stored elements and shape (6335, 43893)>
  Coords	Values
  (0, 106)	0.02744692561612803
  (0, 212)	0.023215343408902665
  (0, 271)	0.05401467247976589
  (0, 328)	0.06002592819852068
  (0, 357)	0.013867829232434481
  (0, 451)	0.02027865711425031
  (0, 579)	0.07881859308416558
  (0, 618)	0.019768886095385775
  (0, 621)	0.018620522655052717
  (0, 623)	0.044409320210287366
  (0, 650)	0.015431274388779802
  (0, 682)	0.01903386479695475
  (0, 980)	0.026544507251586087
  (0, 1001)	0.02062514951758212
  (0, 1024)	0.036933195721984564
  (0, 1035)	0.014754554925576969
  (0, 1087)	0.028677611130454733
  (0, 1091)	0.018037345728748506
  (0, 1199)	0.022419909144057226
  (0, 1208)	0.010623606552681372
  (0, 1552)	0.025000182391534717
  (0, 1670)	0.0258882182374615
  (0, 1718)	0.041002210517151014
  (0, 1749)	0.027519231619294537
  (0, 1788)	0.018796568192211562
  :	:
  (6334, 39797)	0.023327079157066333
  (6334, 40119)	0.039649102

In [15]:
# Hold out 20% of the rows for testing, stratified on y, with a fixed seed so
# the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
# Logistic-regression classifier constructed with no arguments (library defaults).
model = LogisticRegression()

In [17]:
# Train the classifier on the training split.
model.fit(X=X_train, y=y_train)

In [53]:
# Accuracy score on the training data.
X_train_prediction = model.predict(X_train)
# accuracy_score's signature is (y_true, y_pred): the ground truth goes first.
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print(f"Accuracy on the training data: {training_data_accuracy*100:.2f}%")

Accuracy on the training data: 95.15%


In [54]:
# Accuracy score on the held-out test data.
X_test_prediction = model.predict(X_test)
# accuracy_score's signature is (y_true, y_pred): the ground truth goes first.
test_data_accuracy = accuracy_score(y_test, X_test_prediction)
print(f'Accuracy on the test data: {test_data_accuracy*100:.2f}%')

Accuracy on the test data: 92.11%


Making a predictive system

In [39]:
# Spot-check one held-out sample: classify the test row at index 2.
X_new = X_test[2]
prediction = model.predict(X_new)
print(prediction)
# The branch treats encoded class 0 as "fake"; presumably LabelEncoder mapped
# FAKE -> 0 and REAL -> 1 — confirm with le.classes_.
print('The news is fake' if prediction[0] == 0 else 'The news is real')

[1]
The news is real


In [40]:
# Encoded ground-truth label of the test row at index 2, for comparison with the prediction above.
print(y_test[2])

1


In [38]:
# Raw (unprocessed) article text used to exercise the full pipeline end to end:
# it is stemmed, vectorized, and classified in the statements that follow.
X_check = """Iran reportedly makes new push for uranium concessions in nuclear talks  Iranian negotiators reportedly have made a last-ditch push for more concessions from the U.S. and five other world powers as talks on the fate of Iran's nuclear program come down to the final days before a crucial deadline.

The New York Times reported late Sunday that Tehran had backed away from a tentative promise to ship a large portion of its uranium stockpile to Russia, where it could not be used as part of any future weapons program. Western officials insisted to the paper that the uranium did not have to be sent overseas, but could be disposed of in other ways.

The new twist in the talks comes just two days before the deadline for both sides to agree on a framework for a permanent deal. The final deadline for a permanent deal would not arrive until the end of June.

However, if Iran insists on keeping its uranium in the country, it would undermine a key argument made in favor of the deal by the Obama administration. The Times reports that if the uranium had gone to Russia, it would have been converted into fuel rods, which are difficult to use in nuclear weapons. It is not clear what would happen to the uranium if it remained in Iran.

The Associated Press reported Sunday that Iran's position had shifted from from demanding that it be allowed to keep nearly 10,000 centrifuges enriching uranium, to agreeing to keep 6,000. Western officials involved in the talks told the Associated Press that Tehran may be ready to accept an even lower number.

The United States and its allies want a deal that extends the time Iran would need to make a nuclear weapon from the present two months to three months to at least a year. However, The Times reported Sunday that a paper published by Olli Heinonen, former head of inspections for the U.N.'s nuclear watchdog, estimated that Iran could still develop a nuclear weapon in seven or eight months with around 6,500 centrifuges.

Tehran says it wants to enrich uranium only for energy, science, industry and medicine. But many countries fear Iran could use the technology to make weapons-grade uranium.

Officials told the Associated Press that another main dispute involved the length of an agreement. Iran, they said, wants a total lifting of all caps on its activities after 10 years, while the U.S. and the five other nations at the talks â€” Russia, China, Britain, France and Germany â€” insist on progressive removal after a decade.

A senior U.S. official characterized the issue as lack of agreement on what happens in years 11 to 15. The official spoke on condition of anonymity in line with State Department rules on briefing about the closed-door talks.

Limits on Iran's research and development of centrifuges also were unresolved, the Western officials said.

Tehran has created a prototype centrifuge that it says enriches uranium 16 times faster than its present mainstay model. The U.S. and its partners want to constrain research that would increase greatly the speed of making enough weapons-grade uranium for a bomb, once limits on Iran's programs are lifted.

One official said Russia opposed the U.S. position that any U.N. penalties lifted in the course of a deal should be reimposed quickly if Tehran reneged on any commitments.

Both Western officials said Iran was resisting attempts to make inspections and other ways of verification as intrusive as possible.

There was tentative agreement on turning a nearly-finished reactor into a model that gives off less plutonium waste than originally envisaged. Plutonium, like enriched uranium, is a path to nuclear weapons.

Iran and the U.S. were discussing letting Iran run centrifuges at an underground bunker that has been used to enrich uranium. The machines would produce isotopes for peaceful applications, the officials said.

With the Tuesday deadline approaching and problems remaining, U.S. Secretary of State John Kerry canceled plans Sunday to return to the United States for an event honoring the late U.S. Sen. Edward Kennedy. French Foreign Minister Laurent Fabius and Frank-Walter Steinmeier, his German counterpart, scratched planned trips to Kazakhstan.

Kerry has been in discussions with Iranian Foreign Minister Mohammad Javad Zarif since Thursday.

The Associated Press contributed to this report.

Click for more from The New York Times."""
# Apply the same preprocessing used for the training data: stem the raw text,
# then project it with the already-fitted TF-IDF vectorizer.
X_check_stemmed = stemming(X_check)
X_check_vectorized = vectorizer.transform([X_check_stemmed])

prediction = model.predict(X_check_vectorized)
print(prediction)
# Encoded class 0 is reported as fake, anything else as real.
print('The news is Fake' if prediction[0] == 0 else 'The news is Real')

[1]
The news is Real


In [43]:
# Persist the trained classifier and the fitted TF-IDF vectorizer to disk.
# NOTE(review): the LabelEncoder is not saved in this cell, so anything loading
# these files must already know which integer stands for which label.
joblib.dump(model, "fake_news_prediction.joblib")
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")

['tfidf_vectorizer.joblib']