# Problem Statement
This project aims to classify news articles as real or fake based on their content. Specifically, we will use machine learning to build a model to predict whether a given news article is real or fake based on its text.

#  Data Collection and Exploration

In [1]:
import pandas as pd
#importind the csv files
real_news = pd.read_csv('True.csv')
fake_news = pd.read_csv('Fake.csv')

In [2]:
real_news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [3]:
fake_news.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


# Inserting a column "CLASS" as target feature

In [4]:
real_news['CLASS']=1

In [5]:
fake_news['CLASS']=0

In [6]:
real_news.shape

(21417, 5)

In [7]:
fake_news.shape

(23481, 5)

# Merging True and Fake Dataframes

In [8]:
df_merge = pd.concat([real_news, fake_news], axis =0 )
df_merge.head(10)

Unnamed: 0,title,text,subject,date,CLASS
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017",1
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017",1
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017",1
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017",1
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017",1


In [9]:
df_merge.columns

Index(['title', 'text', 'subject', 'date', 'CLASS'], dtype='object')

# Remove unnecessary columns

In [10]:
df_merge.drop(columns=['title','subject','date'],inplace=True)

In [11]:
df_merge.isnull().sum()

text     0
CLASS    0
dtype: int64

# Random Shuffling the dataframe

In [12]:
df = df_merge.sample(frac = 1)

In [13]:
df.head()

Unnamed: 0,text,CLASS
5343,Jon Stewart completely annihilated Sean Hannit...,0
10235,WASHINGTON (Reuters) - A bipartisan group of U...,1
13356,Making America Unsafe Again It s the Obama leg...,0
17646,"TAMPA, Fla (Reuters) - The head of U.S. Centra...",1
11034,Typical! The New York Times tries to blame con...,0


In [14]:
df.reset_index(inplace = True)

In [15]:
df.head()

Unnamed: 0,index,text,CLASS
0,5343,Jon Stewart completely annihilated Sean Hannit...,0
1,10235,WASHINGTON (Reuters) - A bipartisan group of U...,1
2,13356,Making America Unsafe Again It s the Obama leg...,0
3,17646,"TAMPA, Fla (Reuters) - The head of U.S. Centra...",1
4,11034,Typical! The New York Times tries to blame con...,0


In [16]:
df.drop(["index"], axis = 1, inplace = True)

In [17]:
df.head()

Unnamed: 0,text,CLASS
0,Jon Stewart completely annihilated Sean Hannit...,0
1,WASHINGTON (Reuters) - A bipartisan group of U...,1
2,Making America Unsafe Again It s the Obama leg...,0
3,"TAMPA, Fla (Reuters) - The head of U.S. Centra...",1
4,Typical! The New York Times tries to blame con...,0


# Preprocess the text 

In [18]:
import re
import string
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocessing(text):
    # Initialize the stemmer and stopwords list
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    
    # Convert text to lowercase
    text = text.lower()
    
    # Remove text within square brackets
    text = re.sub('\[.*?\]', '', text)
    
    # Replace non-word characters (excluding whitespace) with spaces
    text = re.sub("\\W", " ", text)
    
    # Remove URLs
    text = re.sub('https?://\S+|www\.\S+', '', text)
    
    # Remove HTML/XML tags
    text = re.sub('<.*?>+', '', text)
    
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    
    # Remove newline characters
    text = re.sub('\n', '', text)
    
    # Remove words containing digits
    text = re.sub('\w*\d\w*', '', text)
    
    # Tokenize the text
    words = word_tokenize(text)
    
    # Remove stopwords
    filtered_words = [word for word in words if word not in stop_words]
    
    # Apply stemming to each word
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    
    # Join the stemmed words back into a single string
    text = ' '.join(stemmed_words)
    
    return text


In [19]:
df["text"] = df["text"].apply(preprocessing)

# Defining dependent and independent variables

In [20]:
X = df["text"]
y = df["CLASS"]

# Splitting Training and Testing

In [22]:
!pip install scikit-learn



In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

# Make a pipeline of Converting text into numeric and Model training

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

tfidf_vectorizer = TfidfVectorizer()

# Initialize RandomForestClassifier
RFC = RandomForestClassifier(random_state=0)

# Create a pipeline
pipeline = make_pipeline(tfidf_vectorizer, RFC)

# Fit the pipeline on the training data
pipeline.fit(X_train,y_train)

# Model Evaluation

In [26]:
y_pred = pipeline.predict(X_test)

In [27]:
pipeline.score(X_test,y_test)

0.9919821826280624

In [28]:
from sklearn.metrics import classification_report

In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4698
           1       0.99      0.99      0.99      4282

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



# Model Testing

In [30]:
def prediction(text):
    preprocessed_text = preprocessing(text)
    y_pred = pipeline.predict([preprocessed_text])
    if y_pred[0]== 1:
        print()
        print()
        print('********REAL********')
    else:
        print()
        print()
        print('********FAKE********')
        

In [32]:
news = str(input())
prediction(news)



********REAL********


In [33]:
news = str(input())
prediction(news)



********FAKE********


#  Hyperparameter tuning using GridSearchCV.

In [None]:
from sklearn.model_selection import GridSearchCV


# Define the parameter grid for GridSearchCV
param_grid = {
    'tfidfvectorizer__max_features': [500, 1000, 1500],  # Number of features to use for TF-IDF
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],   # N-grams (unigrams and bigrams)
    'randomforestclassifier__n_estimators': [100, 200],   # Number of trees in the forest
    'randomforestclassifier__max_depth': [None, 10, 20],  # Maximum depth of the tree
    'randomforestclassifier__min_samples_split': [2, 5, 10] # Minimum samples required to split an internal node
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Retrieve the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters found: ", best_params)
print("Best model: ", best_model)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


# Saving  model

In [145]:
from joblib import dump

# Save the pipeline to a file
joblib.dump(pipeline, 'text_classification_pipeline.pkl')

NameError: name 'joblib' is not defined

# 5. Model Deployment

In [87]:
# from flask import Flask, request, render_template
# from joblib import load
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


In [96]:
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove punctuation and digits
    text = text.translate(str.maketrans('', '', string.punctuation + string.digits))

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stop words
    words = [word for word in words if word not in stop_words]

    # Stem or lemmatize the words
    words = [stemmer.stem(word) for word in words]
   
        # Join the words back into a string
    text = ' '.join(words)

    return text

 The latest report from the United Nations Intergovernmental Panel on Climate Change (IPCC) underscores the urgent need for global action to combat climate change. Released on August 29, 2024, the report warns that the world is not on track to meet the targets set by the Paris Agreement and that immediate, substantial reductions in greenhouse gas emissions are required to avoid the most severe impacts of climate change.  The report highlights that global temperatures are on course to rise by 2.5 degrees Celsius above pre-industrial levels by the end of the century, significantly higher than the 1.5 degrees Celsius limit set by the Paris Agreement. This increase in temperature could lead to more frequent and severe heatwaves, flooding, and extreme weather events, with devastating consequences for ecosystems and human societies.  Key findings of the report include the acceleration of ice loss from the polar regions, rising sea levels, and increasing frequency of extreme weather events. T

real


In [26]:
!pip install flask

Collecting flask
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting itsdangerous>=2.1.2 (from flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting blinker>=1.6.2 (from flask)
  Downloading blinker-1.8.2-py3-none-any.whl.metadata (1.6 kB)
Downloading flask-3.0.3-py3-none-any.whl (101 kB)
Downloading blinker-1.8.2-py3-none-any.whl (9.5 kB)
Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Installing collected packages: itsdangerous, blinker, flask
Successfully installed blinker-1.8.2 flask-3.0.3 itsdangerous-2.2.0
