In [24]:
import os
import re
import string
import numpy as np
import pandas as pd
import dagshub
import mlflow
import joblib
import warnings
warnings.filterwarnings('ignore')

from mlflow import log_metric, log_param, log_artifacts
from mlflow import sklearn as mlflow_sklearn
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import  RandomForestClassifier
from xgboost import XGBClassifier

In [3]:
# Loading the data 
df = pd.read_csv(r"C:\Users\Varun\Downloads\Saurav\Outsource 360 internship\Project\Fake News Detector\News.csv")

In [4]:
df.head()

Unnamed: 0,title,text,subject,date,label
0,Suicide attack targets area southeast of Baghdad,BAGHDAD (Reuters) - Two attackers shot several...,worldnews,2017-11-27,true
1,FLASHBACK: WATCH TED CRUZ Promise To Support D...,"Yes, he did promise to support Donald Trump:",politics,2016-07-21,fake
2,Boris Johnson gives PM May advice on Brexit wh...,"MANCHESTER, England (Reuters) - British Foreig...",worldnews,2017-10-03,true
3,India struggles to rein in border flows of cat...,NEW DELHI (Reuters) - Stopping Rohingya refuge...,worldnews,2017-10-05,true
4,"In speech, Trump tries to turn from divisive t...",WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,2017-03-01,true


In [5]:
df["label"] = df["label"].map({'fake':0, 'true':1})

In [6]:
# Function to remove all the unwanted things from our text column

def clean_text(text):
    
    text = str(text)  # Ensure it's a string

    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)

    # Remove URLs
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # Remove email addresses
    text = re.sub(r"\S+@\S+", "", text)

    # Remove non-ASCII characters
    text = text.encode("ascii", errors="ignore").decode()

    # Convert to lowercase
    text = text.lower()

    # Remove leading/trailing and multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [7]:
# new feature that holds cleaned text (free from url,email,non ascii chars or html tags)
df["clean_text"] = df["text"].apply(clean_text)
df["clean_title"] = df["title"].apply(clean_text)

In [8]:
df.head(3)

Unnamed: 0,title,text,subject,date,label,clean_text,clean_title
0,Suicide attack targets area southeast of Baghdad,BAGHDAD (Reuters) - Two attackers shot several...,worldnews,2017-11-27,1,baghdad (reuters) - two attackers shot several...,suicide attack targets area southeast of baghdad
1,FLASHBACK: WATCH TED CRUZ Promise To Support D...,"Yes, he did promise to support Donald Trump:",politics,2016-07-21,0,"yes, he did promise to support donald trump:",flashback: watch ted cruz promise to support d...
2,Boris Johnson gives PM May advice on Brexit wh...,"MANCHESTER, England (Reuters) - British Foreig...",worldnews,2017-10-03,1,"manchester, england (reuters) - british foreig...",boris johnson gives pm may advice on brexit wh...


In [9]:
# Removing the raw text and title columns
df.drop(columns=["text","title"], inplace=True)

In [10]:
df.head(3)

Unnamed: 0,subject,date,label,clean_text,clean_title
0,worldnews,2017-11-27,1,baghdad (reuters) - two attackers shot several...,suicide attack targets area southeast of baghdad
1,politics,2016-07-21,0,"yes, he did promise to support donald trump:",flashback: watch ted cruz promise to support d...
2,worldnews,2017-10-03,1,"manchester, england (reuters) - british foreig...",boris johnson gives pm may advice on brexit wh...


In [11]:
# removing punctuation's from text and title column 
exclude = string.punctuation
# exclude is holding all the value's considered as punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
# function to remove punctuation from clean_text and clean_title column
def remove_punc(text):
    return text.translate(str.maketrans("","",exclude))

In [13]:
# remove punctuation from clean_title column
df["clean_title"] = df["clean_title"].apply(remove_punc)

In [14]:
# remove punctuation from clean_text column
df["clean_text"] = df["clean_text"].apply(remove_punc)

In [15]:
df.head()

Unnamed: 0,subject,date,label,clean_text,clean_title
0,worldnews,2017-11-27,1,baghdad reuters two attackers shot several ci...,suicide attack targets area southeast of baghdad
1,politics,2016-07-21,0,yes he did promise to support donald trump,flashback watch ted cruz promise to support do...
2,worldnews,2017-10-03,1,manchester england reuters british foreign se...,boris johnson gives pm may advice on brexit wh...
3,worldnews,2017-10-05,1,new delhi reuters stopping rohingya refugees ...,india struggles to rein in border flows of cat...
4,politicsNews,2017-03-01,1,washington reuters us president donald trump ...,in speech trump tries to turn from divisive to...


In [16]:
# Combining the text and title column and rearranging the columns
df["combined_text"] = df["clean_title"] + " " + df["clean_text"]

In [17]:
df = df[["combined_text","subject","date","label"]]

In [18]:
df.head(3)

Unnamed: 0,combined_text,subject,date,label
0,suicide attack targets area southeast of baghd...,worldnews,2017-11-27,1
1,flashback watch ted cruz promise to support do...,politics,2016-07-21,0
2,boris johnson gives pm may advice on brexit wh...,worldnews,2017-10-03,1


In [19]:
# Initializing the CountVectorizer
vectorizer = CountVectorizer(max_features = 1000)

In [20]:
# Transforming the text data to feature vectors that can be used as input to the model
X = vectorizer.fit_transform(df["combined_text"])
y = df["label"]

In [21]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify = y, test_size=0.2, random_state=42)

In [22]:
# Setting up dagshub 
import dagshub

dagshub.init(repo_owner='srvmann', repo_name='news-detector', mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/srvmann/news-detector.mlflow")

mlflow.set_experiment("Logistic Regression Baseline Model")

<Experiment: artifact_location='mlflow-artifacts:/8e1bb10c74c545babf118673e3eeeed7', creation_time=1759388840432, experiment_id='0', last_update_time=1759388840432, lifecycle_stage='active', name='Logistic Regression Baseline Model', tags={}>

In [25]:
with mlflow.start_run():

    # Log preprocessing parameters
    mlflow.log_param("vectorizer", "CountVectorizer")
    mlflow.log_param("max_features", 1000)
    mlflow.log_param("test_size",0.2)

    # Model building and training
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Log model parameters
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_param("solver", model.solver)

    # Model evaluation
    y_pred     = model.predict(X_test)
    accuracy   = accuracy_score(y_test, y_pred)
    precision  = precision_score(y_test, y_pred)
    recall     = recall_score(y_test, y_pred)   
    f1_value   = f1_score(y_test, y_pred)   

    # Log metrics
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1_value)

    # Save the model locally
    model_path = "log_reg_model.pkl"
    joblib.dump(model, model_path)

    # Log the model as an artifact (this works on DagsHub!)
    mlflow.log_artifact(model_path, artifact_path="models")

    # Save and log the notebook
    notebook_path = "exp1_baseLineModel.ipynb"
    os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
    mlflow.log_artifact(notebook_path, artifact_path="notebooks")

    # Print classification report
    print("Accuracy:", accuracy) 
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_value)


Accuracy: 0.9956560481176209
Precision: 0.9957953749124037
Recall: 0.9950980392156863
F1 Score: 0.9954465849387041
🏃 View run enchanting-hound-496 at: https://dagshub.com/srvmann/news-detector.mlflow/#/experiments/0/runs/36f66b0e06b44881abb7fca62545aee8
🧪 View experiment at: https://dagshub.com/srvmann/news-detector.mlflow/#/experiments/0
