In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords # the for of in with
from nltk.stem.porter import PorterStemmer # loved loving == love
from sklearn.feature_extraction.text import TfidfVectorizer # loved = [0.0]
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


## Reading CSV file

In [2]:
news_df = pd.read_csv('train.csv')

In [3]:
news_df["label"].value_counts()[1]

37106

In [5]:
news_df.shape

(72134, 3)

In [4]:
news_df =news_df.drop(news_df.columns[0],axis=1)
news_df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


# 1. Preprocessing

In [6]:
news_df.isnull().sum()

Unnamed: 0,0
title,558
text,39
label,0


In [7]:
print(news_df.columns)

Index(['title', 'text', 'label'], dtype='object')


In [8]:
print(news_df['text'])

0        No comment is expected from Barack Obama Membe...
1           Did they post their votes for Hillary already?
2         Now, most of the demonstrators gathered last ...
3        A dozen politically active pastors came here f...
4        The RS-28 Sarmat missile, dubbed Satan 2, will...
                               ...                        
72129    WASHINGTON (Reuters) - Hackers believed to be ...
72130    You know, because in fantasyland Republicans n...
72131    Migrants Refuse To Leave Train At Refugee Camp...
72132    MEXICO CITY (Reuters) - Donald Trump’s combati...
72133    Goldman Sachs Endorses Hillary Clinton For Pre...
Name: text, Length: 72134, dtype: object


In [9]:
news_df = news_df.fillna(' ')

In [10]:
news_df.isnull().sum()

Unnamed: 0,0
title,0
text,0
label,0


In [11]:
news_df['content'] = news_df['title']+' '+news_df['title']

In [12]:
news_df.shape

(72134, 4)

In [13]:
news_df

Unnamed: 0,title,text,label,content
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1,,Did they post their votes for Hillary already?,1,
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...
...,...,...,...,...
72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,Russians steal research on Trump in hack of U....
72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,WATCH: Giuliani Demands That Democrats Apolog...
72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,Migrants Refuse To Leave Train At Refugee Camp...
72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,Trump tussle gives unpopular Mexican leader mu...


# Stemming:

Stemming is the process of reducing a word to its Root word

example: hung         hanged        hanging ======hang

# Steps:
lower case                 
splitting                             
removing stopwords                              
stemming                                   

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
def stemming(content):
    # Remove non-alphabetic characters
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()

    # Initialize an empty list for the stemmed words
    final_stemmed_words = []

    # Iterate through each word
    for word in stemmed_content:
        # Check if the word is not a stopword
        if word not in stop_words:
            # Stem the word and append to the final list
            final_stemmed_words.append(ps.stem(word))

    # Join the stemmed words back into a single string
    return ' '.join(final_stemmed_words)

In [16]:
news_df['content'] = news_df['content'].apply(stemming)

In [17]:
news_df['content']

Unnamed: 0,content
0,law enforc high alert follow threat cop white ...
1,
2,unbeliev obama attorney gener say charlott rio...
3,bobbi jindal rais hindu use stori christian co...
4,satan russia unv imag terrifi new supernuk wes...
...,...
72129,russian steal research trump hack u democrat p...
72130,watch giuliani demand democrat apolog trump ra...
72131,migrant refus leav train refuge camp hungari m...
72132,trump tussl give unpopular mexican leader much...


In [18]:
X = news_df['content'].values
y = news_df['label'].values

# Converting textual data into numerical data

In [19]:
vector = TfidfVectorizer()
vector.fit(X)
X = vector.transform(X)

In [20]:
print(X)

  (0, 407)	0.3190180925014663
  (0, 1802)	0.33473541566384035
  (0, 3679)	0.24871262252022117
  (0, 5509)	0.31820565801047196
  (0, 6425)	0.28932771754845743
  (0, 6730)	0.48553136502134386
  (0, 7887)	0.26746434949988324
  (0, 9699)	0.22829788917209384
  (0, 17260)	0.24871262252022117
  (0, 17363)	0.2542650376115143
  (0, 18648)	0.1297506867782943
  (0, 19106)	0.19134939529376566
  (2, 1049)	0.28404017886581956
  (2, 2673)	0.30809679188606154
  (2, 2919)	0.3639616996972358
  (2, 6880)	0.2652283770602196
  (2, 8020)	0.2692285294185893
  (2, 11864)	0.2231406266784195
  (2, 12011)	0.16878852994653004
  (2, 12744)	0.27904818164471595
  (2, 13591)	0.22687620695463123
  (2, 14591)	0.3580030298678158
  (2, 15094)	0.1609967301122813
  (2, 16446)	0.1999703023632961
  (2, 18034)	0.35962437110547785
  :	:
  (72130, 17778)	0.13227219506940732
  (72130, 18936)	0.25302499393443006
  (72131, 2566)	0.3967249021272091
  (72131, 8206)	0.46269177743112333
  (72131, 9752)	0.3384827653769501
  (72131, 109

# Splitting the data for training and testing

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=2)

In [22]:
X_train.shape

(57707, 19639)

### Training the model

In [23]:
model = LogisticRegression()
model.fit(X_train,Y_train)

### Predict on the training set

In [24]:
train_y_pred = model.predict(X_train)
train_y_pred_prob = model.predict_proba(X_train)

### Predict on the testing set

In [25]:
# Predict on the testing set
test_y_pred = model.predict(X_test)
test_y_pred_prob = model.predict_proba(X_test)

## Calculate accuracy

In [26]:
from sklearn.metrics import accuracy_score, log_loss
# Calculate accuracy for training and testing sets
train_accuracy = accuracy_score(Y_train, train_y_pred)
test_accuracy = accuracy_score(Y_test, test_y_pred)

# Calculate log loss for training and testing sets
train_loss = log_loss(Y_train, train_y_pred_prob)
test_loss = log_loss(Y_test, test_y_pred_prob)

# Print the accuracy and loss
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")
print(f"Training Loss: {train_loss:.2f}")
print(f"Testing Loss: {test_loss:.2f}")

Training Accuracy: 0.92
Testing Accuracy: 0.90
Training Loss: 0.24
Testing Loss: 0.27


## Building the detection system

In [None]:
input_data = X_test[30]
prediction = model.predict(input_data)

In [None]:
if prediction[0] == 0:
    print('The News Is Real')
else:
    print('The News is Fake')

The News Is Real


In [None]:
news_df['text'][2]

'demonstr gather last night exercis constitut protect right peac protest order rais issu creat chang loretta lynch aka eric holder skirt'

## Saving the model

In [None]:
import pickle
with open('model.pkl','wb') as f:
    pickle.dump(model,f)

In [None]:
pickle.dump(vector,open('vector.pkl','wb'))