# **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

# **Initialization**

In [None]:
# Load the fake-news and real-news article tables from CSV.
# NOTE(review): the relative "drive/MyDrive" paths look like a mounted Google Drive
# (Colab) — confirm the working directory before running elsewhere.
df_fake = pd.read_csv("drive/MyDrive/Fake.csv")
df_true = pd.read_csv("drive/MyDrive/True.csv")

In [None]:
# Preview the first rows of the frame loaded from Fake.csv.
df_fake.head()

In [None]:
# Preview the first rows of the frame loaded from True.csv.
df_true.head()

## **Inserting a column "class" as target feature**

In [None]:
# Target label: every row that came from Fake.csv gets class 0.
df_fake["class"] = 0

In [None]:
# Target label: every row that came from True.csv gets class 1.
df_true["class"] = 1

In [None]:
# Preview the fake-news frame again now that the "class" column has been added.
df_fake.head()

In [None]:
# (rows, columns) of the fake-news frame before any rows are held out.
df_fake.shape

In [None]:
# (rows, columns) of the true-news frame before any rows are held out.
df_true.shape

In [None]:
# Hold out the last rows of each frame for manual testing and remove them from
# the frames used for training.
# Both frames are trimmed positionally, so the cell no longer depends on True.csv
# having exactly 21417 rows (the old loop dropped index labels 21416..21407 by
# name and raised KeyError for any other file length).
# NOTE: like the original, this cell is not idempotent — re-running it trims
# another batch of rows.
N_MANUAL_TEST_ROWS = 10

# .copy() makes the hold-out frames independent of their parents so later column
# assignments on them do not hit SettingWithCopyWarning.
df_fake_manual_testing = df_fake.tail(N_MANUAL_TEST_ROWS).copy()
df_fake = df_fake.iloc[:-N_MANUAL_TEST_ROWS]

df_true_manual_testing = df_true.tail(N_MANUAL_TEST_ROWS).copy()
df_true = df_true.iloc[:-N_MANUAL_TEST_ROWS]

In [None]:
# Shapes of both frames after the manual-testing rows were removed.
df_fake.shape, df_true.shape

# **Creating Manual Testing Dataset**

In [None]:
# Re-assert the labels on the held-out rows. `assign` returns a fresh frame, so
# this never writes into a slice of the parent frame (the situation that makes
# plain `frame["class"] = ...` emit SettingWithCopyWarning on a `.tail()` result).
df_fake_manual_testing = df_fake_manual_testing.assign(**{"class": 0})
df_true_manual_testing = df_true_manual_testing.assign(**{"class": 1})

In [None]:
# Show up to 10 rows of the held-out fake-news frame.
df_fake_manual_testing.head(10)

In [None]:
# Show up to 10 rows of the held-out true-news frame.
df_true_manual_testing.head(10)

In [None]:
# Stack the held-out fake rows on top of the held-out true rows (row-wise is
# pd.concat's default axis) and write the result, index column included, to disk.
df_manual_testing = pd.concat([df_fake_manual_testing, df_true_manual_testing])
df_manual_testing.to_csv("manual_testing.csv")

# **Merging True and Fake dataframes**

In [None]:
# Combine the remaining fake and true rows into one frame; rows are appended
# (the default concat axis), fake rows first.
df_merge = pd.concat([df_fake, df_true])
df_merge.head(n=10)

Removing columns that aren't needed

In [None]:
# List the merged frame's columns before dropping the ones the model does not use.
df_merge.columns

In [None]:
# Drop the three columns that are not used as model input; the remaining frame
# is what the rest of the notebook calls `df`.
df = df_merge.drop(columns=["title", "subject", "date"])

In [None]:
# Count missing values per column.
df.isnull().sum()

Shuffling Dataset

In [None]:
# Shuffle all rows (frac=1 samples the whole frame without replacement).
# A fixed random_state makes the row order, and everything derived from it,
# reproducible across Restart & Run All.
df = df.sample(frac=1, random_state=42)

In [None]:
# Preview the shuffled rows; the index labels are still the pre-shuffle ones here.
df.head()

In [None]:
# Replace the shuffled index labels with a fresh 0..n-1 range.
# drop=True discards the old labels directly instead of first materialising them
# as an "index" column and then dropping that column in a second in-place call.
df = df.reset_index(drop=True)

In [None]:
# Preview the frame after the index was rebuilt.
df.head()

# **Text Processing Initial Step:**

In [None]:
def wordopt(text):
    """Normalise raw article text before it is vectorised.

    Steps, in order:
      1. lower-case the text;
      2. drop ``[...]`` bracketed spans;
      3. drop URLs (``http(s)://...`` and ``www....``);
      4. drop ``<...>`` tag-like spans;
      5. replace every remaining non-word character with a space;
      6. drop whatever ``string.punctuation`` characters survive step 5
         (in practice the underscore, which ``\\W`` does not match);
      7. drop every token that contains a digit.

    The URL and tag removal must run BEFORE step 5. Previously the ``\\W``
    substitution ran first and turned ``://``, ``.``, ``<`` and ``>`` into
    spaces, so those two patterns could never match and fragments such as
    ``https``, ``www`` or tag names leaked into the features.

    Args:
        text: the raw text (``str``).

    Returns:
        The cleaned text. Runs of spaces are not collapsed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters too, so this also turns them into spaces;
    # the former separate '\n' removal could never match and is gone.
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Clean every article body with wordopt; the "text" column is overwritten in place.
df["text"] = df["text"].apply(wordopt)

# **Splitting into Training and Testing [3:1]**

In [None]:
# Model input: the cleaned article text. Target: the 0/1 "class" label
# (0 was assigned to the Fake.csv rows, 1 to the True.csv rows).
x = df["text"]
y = df["class"]

In [None]:
# 75 % / 25 % train/test split. random_state pins the partition so the scores
# reported further down can be reproduced on a fresh kernel.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training texts only, then reuse that fitted
# object to transform the test texts, so the test split contributes nothing to
# the learned vocabulary/weights.
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

# **Logistic Regression**


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, trained on the
# TF-IDF matrix of the training texts.
LR = LogisticRegression()
LR.fit(xv_train, y_train)

In [None]:
# Predicted class labels for the held-out test matrix.
pred_lr = LR.predict(xv_test)

In [None]:
# Mean accuracy of the logistic-regression model on the test split.
LR.score(xv_test, y_test)

In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
print(classification_report(y_test, pred_lr))

# **Decision Tree Classification**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated runs build the same model, matching the
# GradientBoostingClassifier and RandomForestClassifier in this notebook,
# which are both created with random_state=0.
DT = DecisionTreeClassifier(random_state=0)
DT.fit(xv_train, y_train)

In [None]:
# Predicted class labels from the decision tree for the test matrix.
pred_dt = DT.predict(xv_test)

In [None]:
# Mean accuracy of the decision tree on the test split.
DT.score(xv_test, y_test)

In [None]:
# Per-class precision / recall / F1 for the decision-tree predictions.
print(classification_report(y_test, pred_dt))

# **Gradient Boosting Classifier**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with default hyper-parameters; random_state=0 fixes
# the estimator's seed.
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

In [None]:
# Predicted class labels from the gradient-boosting model for the test matrix.
pred_gbc = GBC.predict(xv_test)

In [None]:
# Mean accuracy of the gradient-boosting model on the test split.
GBC.score(xv_test, y_test)

In [None]:
# Per-class precision / recall / F1 for the gradient-boosting predictions.
print(classification_report(y_test, pred_gbc))

# **Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters; random_state=0 fixes the
# estimator's seed.
RFC = RandomForestClassifier(random_state = 0)
RFC.fit(xv_train, y_train)

In [None]:
# Predicted class labels from the random forest for the test matrix.
pred_rfc = RFC.predict(xv_test)

In [None]:
# Mean accuracy of the random forest on the test split.
RFC.score(xv_test, y_test)

In [None]:
# Per-class precision / recall / F1 for the random-forest predictions.
print(classification_report(y_test, pred_rfc))

# **Manual Testing**

In [None]:
def output_result(n):
  """Translate a predicted class label into display text.

  Args:
    n: the predicted label; 0 is the fake class, 1 the true class.

  Returns:
    "Fake News" for 0, "True News" for 1, and None for any other value.
  """
  if n == 0:
    return "Fake News"
  return "True News" if n == 1 else None

def manual_testing(news):
  """Classify one piece of news text with all four trained models and print the verdicts.

  The text is cleaned with wordopt, transformed with the fitted TF-IDF
  vectorizer, and passed to LR, DT, GBC and RFC in that order.

  Args:
    news: the raw news text.

  Returns:
    None; the four verdicts are printed.
  """
  frame = pd.DataFrame({"text": [news]})
  frame["text"] = frame["text"].apply(wordopt)
  features = vectorization.transform(frame["text"])

  # One verdict per model, in the order the template below names them.
  verdicts = [output_result(model.predict(features)[0]) for model in (LR, DT, GBC, RFC)]

  template = "\n\nLR Prediction: {} \nDT Prediction: {} \nGBC Prediction: {} \nRFC Prediction: {}"
  print(template.format(*verdicts))

In [None]:

# Ask for a news text on stdin and print each model's verdict for it.
# input() already returns a str, so no conversion is needed.
news = input("Enter the news:")
manual_testing(news)

In [None]:
import joblib

# Persist the four fitted classifiers as pickle files in the working directory.
joblib.dump(LR, 'lr_model.pkl')
joblib.dump(DT, 'dt_model.pkl')
joblib.dump(GBC, 'gbc_model.pkl')
joblib.dump(RFC, 'rfc_model.pkl')

# Persist the fitted TF-IDF vectorizer as well: new text has to be transformed
# with this same object (as manual_testing does) before it is given to the models.
joblib.dump(vectorization, 'vectorizer.pkl')


# **Summarizer**

In [None]:
# Install gensim, whose downloader module is used to fetch the GloVe vectors.
!pip install gensim


In [None]:
import gensim.downloader as api

# Load the "glove-wiki-gigaword-100" GloVe word vectors (100 dimensions)
# through gensim's downloader API.
glove_vectors = api.load("glove-wiki-gigaword-100")

In [None]:
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

import nltk
# Fetch the NLTK resources that preprocess_text relies on.
nltk.download('punkt')   # tokenizer models for word_tokenize / sent_tokenize
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('punkt_tab')  # NOTE(review): presumably what newer NLTK releases load instead of 'punkt' — confirm

def preprocess_text(text):
    """Split ``text`` into sentences and reduce each one to its content words.

    The text is sentence-tokenized FIRST, on the original casing and
    punctuation, and only then is each sentence lower-cased, stripped of
    punctuation, word-tokenized and filtered against the English stop-word
    list. Previously the punctuation was removed before ``sent_tokenize`` ran,
    which erased the sentence terminators the tokenizer relies on, so a
    multi-sentence text collapsed into a single "sentence".

    Args:
        text: the raw text (``str``).

    Returns:
        A tuple ``(sentences, cleaned_text)`` where ``sentences`` is a list with
        one list of remaining tokens per detected sentence (a sentence made up
        only of stop words yields an empty list), and ``cleaned_text`` is all of
        those tokens joined with single spaces.
    """
    stop_words = set(stopwords.words('english'))

    sentences = []
    for raw_sentence in sent_tokenize(text):
        # Normalise per sentence so the boundaries found above are preserved.
        normalised = re.sub(r'[^\w\s]', '', raw_sentence.lower())
        sentences.append(
            [word for word in word_tokenize(normalised) if word not in stop_words]
        )

    cleaned_text = " ".join([" ".join(sentence) for sentence in sentences])

    return sentences, cleaned_text

In [None]:
import numpy as np

def sentence_vector(sentence, glove_vectors):
    """Embed a tokenized sentence as the mean of its words' vectors.

    Args:
        sentence: an iterable of word tokens.
        glove_vectors: a word-vector lookup supporting ``word in glove_vectors``,
            ``glove_vectors[word]`` and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of the tokens that are present in
        ``glove_vectors``, or a zero vector of length ``vector_size`` when none
        of the tokens is present.
    """
    embeddings = []
    for token in sentence:
        # Tokens without an embedding are skipped.
        if token in glove_vectors:
            embeddings.append(glove_vectors[token])

    if embeddings:
        return np.mean(embeddings, axis=0)
    return np.zeros(glove_vectors.vector_size)

# `processed_sentences` was never assigned anywhere in the notebook, so this cell
# raised NameError on a fresh kernel. Build it from one of the held-out
# manual-testing articles so the demo below has real input.
processed_sentences, _ = preprocess_text(df_manual_testing["text"].iloc[0])

# One embedding per tokenized sentence.
sentence_vectors = [sentence_vector(sentence, glove_vectors) for sentence in processed_sentences]


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all sentence vectors.
cos_sim_matrix = cosine_similarity(sentence_vectors)
# Score each sentence by summing its row, i.e. its similarity to every sentence
# (its own diagonal entry included).
sentence_scores = np.sum(cos_sim_matrix, axis=1)

In [None]:
def generate_summary(text, glove_vectors, top_n=3):
    """Build an extractive summary from the most "central" sentences of ``text``.

    Each sentence returned by preprocess_text is embedded with sentence_vector,
    scored by the sum of its cosine similarities to all sentences, and the
    ``top_n`` highest-scoring sentences are returned, best first.

    Args:
        text: the raw text to summarise.
        glove_vectors: word-vector lookup passed through to sentence_vector.
        top_n: maximum number of sentences to keep.

    Returns:
        The selected sentences as one space-joined string. The sentences are
        the preprocessed token lists produced by preprocess_text, not the
        original wording. Returns ``''`` when the text yields no sentences or
        ``top_n`` is not positive.
    """
    sentences, _ = preprocess_text(text)

    # cosine_similarity rejects an empty input, and `[-0:]` would select EVERY
    # sentence rather than none, so both cases are answered up front.
    if not sentences or top_n <= 0:
        return ''

    sentence_vectors = [sentence_vector(sentence, glove_vectors) for sentence in sentences]
    cos_sim_matrix = cosine_similarity(sentence_vectors)
    sentence_scores = np.sum(cos_sim_matrix, axis=1)

    # argsort is ascending: take the last top_n indices and reverse them so the
    # highest-scoring sentence comes first.
    top_sentence_indices = sentence_scores.argsort()[-top_n:][::-1]
    summary = [sentences[i] for i in top_sentence_indices]
    return ' '.join([' '.join(sentence) for sentence in summary])




In [None]:
def manual_testing_with_summary():
    """Prompt for a news text, print every model's verdict, then print a summary.

    The classification part is delegated to manual_testing, which runs the same
    clean -> vectorize -> predict -> print sequence this function used to
    duplicate line for line (including the identical output template).

    Returns:
        None; everything is printed.
    """
    news = input("Please enter the news text: ")

    # Generate the summary before anything is printed, as before, so a
    # summarisation failure surfaces without partial output.
    summary = generate_summary(news, glove_vectors, top_n=3)

    # Print the predictions from all models
    manual_testing(news)

    # Print the summary of the news
    print("\nSummary of the news:\n", summary)



In [None]:
# Run the interactive classify-and-summarise flow once (prompts on stdin).
manual_testing_with_summary()

In [None]:
# Install Hugging Face transformers, which provides the summarization pipeline.
!pip install transformers

In [None]:
from transformers import pipeline

# Build a summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_sentence_bart(sentence: str):
    """Classify ``sentence`` with all four models and summarise it if GBC calls it true.

    The text is cleaned with wordopt, transformed with the fitted TF-IDF
    vectorizer and classified by LR, DT, GBC and RFC; all four verdicts are
    printed. Only the Gradient Boosting verdict decides whether a summary is
    produced.

    Args:
        sentence: the raw news text.

    Returns:
        * ``None`` when GBC does not classify the text as "True News";
        * ``sentence`` unchanged when it has fewer than 5 whitespace-separated
          words (too short to summarise);
        * otherwise the BART summary text.
    """
    new_def_test = pd.DataFrame({"text": [sentence]})
    new_def_test["text"] = new_def_test["text"].apply(wordopt)
    new_xv_test = vectorization.transform(new_def_test["text"])

    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GBC = GBC.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)

    print("\n\nLR Prediction: {} \nDT Prediction: {} \nGBC Prediction: {} \nRFC Prediction: {}".format(
        output_result(pred_LR[0]),
        output_result(pred_DT[0]),
        output_result(pred_GBC[0]),
        output_result(pred_RFC[0])
    ))

    # No summary for text GBC rejects. This used to be an implicit fall-through;
    # the None return is now explicit so callers know to handle it.
    if output_result(pred_GBC[0]) != "True News":
        return None

    if len(sentence.split()) < 5:
        return sentence  # Return the original sentence if it's too short to summarize

    # truncation=True clips inputs longer than the model's maximum input length
    # instead of letting the pipeline fail on long articles.
    # NOTE(review): min_length=25 tokens can exceed the length of inputs that are
    # only just over the 5-word threshold — consider scaling it to the input.
    summary = summarizer(sentence, max_length=50, min_length=25, do_sample=False, truncation=True)

    return summary[0]['summary_text']

In [None]:
input_sentence = input("Enter Sentence: ")
bart_summary = summarize_sentence_bart(input_sentence)

# summarize_sentence_bart only returns text when the Gradient Boosting model
# labels the input "True News"; otherwise it returns None, which used to be
# printed as the literal string "None".
if bart_summary is not None:
    print(bart_summary)
else:
    print("No summary generated: the Gradient Boosting model did not classify this text as True News.")