In [None]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (322.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.3/322.3 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.27.5 xgboost-3.0.2


In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

nltk.download('punkt_tab')

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\regmi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\regmi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\regmi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\regmi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from sklearn import set_config
set_config(display='diagram')


In [4]:
data_news = pd.read_csv('../data/processed/cleaned_news.csv')

In [None]:
data_news.shape, data_news.isna().sum()

((38638, 9),
 title                      0
 text                       0
 subject                    0
 label                      0
 text_length                0
 title_length               0
 exclamations_mark_count    0
 questions_mark_count       0
 uppercase_words_count      0
 dtype: int64)

In [None]:
data_news.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,21191
1,17447


In [None]:
# creating a custom transformer
from sklearn.base  import BaseEstimator, TransformerMixin


class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stop_words = stopwords.words('english')
        self.freqwords = set([
            'trump', 'president', 'reuters', 'state', 'donald',
            'states', 'house', 'government', 'republican', 'united'
        ])
        self.stopwords_set = set(self.stop_words).union(self.freqwords)
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        # Lowercase
        text = text.lower()

        # Remove URLs
        text = re.sub(r'https?://\S+|www\.\S+', '', text)

        # Remove standalone numbers
        text = re.sub(r'\b\d+\b', '', text)

        # Remove HTML tags
        text = re.sub(r'<.*?>', '', text)

        # Remove punctuation
        text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)

        # Remove newlines and unicode punctuation
        text = re.sub(r'\n', ' ', text)
        text = re.sub(r'[’“”…]', '', text)

        # Remove emojis
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE)
        text = emoji_pattern.sub('', text)

        # Expand contractions
        contractions = {
            "isn't": "is not", "he's": "he is", "wasn't": "was not", "there's": "there is",
            "couldn't": "could not", "won't": "will not", "they're": "they are", "she's": "she is",
            "wouldn't": "would not", "haven't": "have not", "that's": "that is", "you've": "you have",
            "what's": "what is", "weren't": "were not", "we're": "we are", "hasn't": "has not",
            "you'd": "you would", "shouldn't": "should not", "let's": "let us", "they've": "they have",
            "you'll": "you will", "i'm": "i am", "we've": "we have", "it's": "it is", "don't": "do not",
            "that´s": "that is", "i´m": "i am", "it’s": "it is", "she´s": "she is", "i’m": "i am",
            "i’d": "i did", "there’s": "there is"
        }

        for contraction, expanded in contractions.items():
            text = re.sub(rf"\b{re.escape(contraction)}\b", expanded, text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Tokenize
        tokens = nltk.word_tokenize(text)

        # Remove stopwords
        tokens = [word for word in tokens if word not in self.stopwords_set]

        # Lemmatize
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]

        # Return processed string
        return ' '.join(tokens)


    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.preprocess_text(text) for text in X]


## TextPreprocessor

In [None]:
data_news['text_cleaned'] = TextPreprocessor().fit_transform(data_news['text'])


## Evaluating the performance of the navie model first

In [None]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
def train_and_evaluate_model(X,y,model=MultinomialNB(), model_name='MultinominalNB', vecotrizer='CountVectorizer'):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.20,
                                                    random_state=42)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    f1score = f1_score(predicted, y_test)
    print(f'{model_name} model accuracy is {f1score*100 : 04.2f}% using {vecotrizer}')
    print('------------------------------------------------')
    print('Classification Report:')
    print(classification_report(y_test, predicted))
    return model, predicted, f1score, X_test, y_test


In [None]:
# vecotrization

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import RegexpTokenizer

token = RegexpTokenizer(r'[a-zA-Z0-9]+')


tfid = CountVectorizer(
    stop_words=None,
    ngram_range=(1,1),
    tokenizer=token.tokenize,
    token_pattern=None
)

text_counts_1_2 = tfid.fit_transform(data_news['text_cleaned'])


In [None]:
_, _, _ , _, _ = train_and_evaluate_model(text_counts_1_2, data_news['label'])

MultinominalNB model accuracy is  94.98% using CountVectorizer
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      4238
           1       0.96      0.94      0.95      3490

    accuracy                           0.96      7728
   macro avg       0.96      0.95      0.95      7728
weighted avg       0.96      0.96      0.96      7728



## Conclusion
> Built a baseline Multinomial Naive Bayes model using CountVectorizer, achieving 95% F1-score.

> Let's build a more robust model by using metadata egineered features + ensemble learning + hyperparameter tunning

## Creating a pipline for optimizing vetorizer and classifier parameters

In [None]:
data_news.head()

Unnamed: 0,title,text,subject,label,text_length,title_length,exclamations_mark_count,questions_mark_count,uppercase_words_count,text_cleaned
0,APPLE’S CEO SAYS RELIGIOUS FREEDOM LAWS ARE ‘D...,The gay mafia has a new corporate Don. This i...,politics,1,1245,130,0,0,4,gay mafia new corporate article need read shee...
1,BENGHAZI PANEL CALLS HILLARY TO TESTIFY UNDER ...,Does anyone really think Hillary Clinton will ...,politics,1,3157,117,0,5,14,anyone really think hillary clinton come clean...
2,HILLARY RODHAM NIXON: A CANDIDATE WITH MORE BA...,The irony here isn t lost on us. Hillary is be...,politics,1,1994,76,0,1,0,irony lost u hillary compared wanted take nixo...
3,WATCH DIRTY HARRY REID ON HIS LIE ABOUT ROMNEY...,"In case you missed it Sen. Harry Reid (R-NV), ...",left-news,1,658,80,0,1,4,case missed sen harry reid rnv announced last ...
4,OH NO! GUESS WHO FUNDED THE SHRINE TO TED KENNEDY,Nothing like political cronyism to make your s...,politics,1,2488,49,0,4,5,nothing like political cronyism make stomach c...


In [None]:
# final_df = news_df.merge(new_text_df, left_index=True, right_index=True)

final_df = data_news.drop(columns=['title', 'text', 'title_length'])

# final_df.rename(columns={'no_sw_lem': 'text_cleaned'}, inplace=True)
# final_df.head()



X = final_df.drop(columns=['label' ,'subject'])
y = final_df.label



X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.20,
                                                    random_state=42
                                                    )

In [None]:
X['text_cleaned'] = X['text_cleaned'].astype(str)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38638 entries, 0 to 38637
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   text_length              38638 non-null  int64 
 1   exclamations_mark_count  38638 non-null  int64 
 2   questions_mark_count     38638 non-null  int64 
 3   uppercase_words_count    38638 non-null  int64 
 4   text_cleaned             38638 non-null  object
dtypes: int64(4), object(1)
memory usage: 1.5+ MB


In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((30910, 5), (7728, 5), (30910,), (7728,))

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30910 entries, 21251 to 8773
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   text_length              30910 non-null  int64 
 1   exclamations_mark_count  30910 non-null  int64 
 2   questions_mark_count     30910 non-null  int64 
 3   uppercase_words_count    30910 non-null  int64 
 4   text_cleaned             30910 non-null  object
dtypes: int64(4), object(1)
memory usage: 1.4+ MB


In [None]:
X_train.dropna(inplace=True, how='any')
X_test.dropna(inplace=True, how='any')
y_train.dropna(inplace=True, how='any')
y_test.dropna(inplace=True, how='any')

## PipeLine to merge text_data and metadata features together

In [None]:

trf1 = ColumnTransformer(
    transformers=[
        ('vectorizer', CountVectorizer(), 'text_cleaned')
    ],
    remainder='passthrough',  # keeps metadata columns

    force_int_remainder_cols=False # treat remainder columns by name, not index.
)

model_pipe = Pipeline(
    steps=[
        ('trf1', trf1),
        ('classifier', XGBClassifier(objective='binary:logistic', eval_metric='logloss'))
    ]
)

model_pipe  # Displays a diagram of the pipeline

In [None]:
model_pipe.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

score = cross_val_score(model_pipe, X_train, y_train, cv=5)

In [None]:
print("Cross-Validation Accuracy Scores:", score)
print(f"Mean Accuracy: { score.mean()*100:.2f}%")

Cross-Validation Accuracy Scores: [0.98511808 0.98495632 0.98317697 0.98722096 0.98754448]
Mean Accuracy: 98.56%


## Just after using the countvectorizer with meta data  features our model has accuracy up 98.56%. Let's use hpyerparameter for building a more robust model

## Tuning our model

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

param_dist = {
    'trf1__vectorizer__ngram_range': [(1, 1), (1, 2)],
    'trf1__vectorizer__max_features': randint(2000, 6000),
    'classifier__learning_rate': uniform(0.01, 0.3),
    'classifier__n_estimators': randint(100, 500),
    'classifier__max_depth': randint(3, 10)
}

random_search = RandomizedSearchCV(
    model_pipe,
    param_distributions=param_dist,
    n_iter=25,  # number of random combos
    cv=5,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

In [None]:
random_search.fit(X_train,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [None]:
best_params = random_search.best_params_
from pprint import pprint
pprint(best_params)

# saving these parameters
import pickle
with open('best_params.pkl', 'wb') as f:
    pickle.dump(best_params, f)

# {'classifier__learning_rate': np.float64(0.24555278841790407),
#  'classifier__max_depth': 5,
#  'classifier__n_estimators': 463,
#  'trf1__vectorizer__max_features': 5638,
#  'trf1__vectorizer__ngram_range': (1, 2)}

{'classifier__learning_rate': np.float64(0.24555278841790407),
 'classifier__max_depth': 5,
 'classifier__n_estimators': 463,
 'trf1__vectorizer__max_features': 5638,
 'trf1__vectorizer__ngram_range': (1, 2)}


In [None]:
best_model_pipeline = random_search.best_estimator_

In [None]:
best_model_pipeline.fit(X_train, y_train)

## Evaluating our model

In [None]:
from sklearn.metrics import f1_score, classification_report
y_pred = best_model_pipeline.predict(X_test)

f1score= f1_score(y_test, y_pred)*100

In [None]:
print(f"Accracy of the model is {f1score:.4f}%")
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accracy of the model is 98.9365%
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4238
           1       0.99      0.99      0.99      3490

    accuracy                           0.99      7728
   macro avg       0.99      0.99      0.99      7728
weighted avg       0.99      0.99      0.99      7728



In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(best_model_pipeline, X, y, cv=skf, scoring='f1')
print("F1 Scores per fold:", scores)

F1 Scores per fold: [0.98734177 0.98950395 0.98632503 0.98848921 0.9871924 ]


In [None]:
print("F1 Scores per fold:", scores)
print("------------------------------")
print("Mean F1 Score:", scores.mean())

F1 Scores per fold: [0.98734177 0.98950395 0.98632503 0.98848921 0.9871924 ]
------------------------------
Mean F1 Score: 0.9877704737894876


## Conclusion
After combining meta data features created using feature engineering, and fine tunning our xbgoost model using randomsearch cv 99% accuracy is achieved.

## let's create complete pipeline for our model

In [None]:
import numpy as np


text_preprocessor_pipeline = Pipeline(
    steps=[
        ('clean_text_rm_sw', TextPreprocessor()),
        ('vectorizer', CountVectorizer())
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ('text_preprocessor', text_preprocessor_pipeline, 'text')
    ],
    remainder='passthrough',  # keeps metadata columns
    force_int_remainder_cols=False, # treat remainder columns by name, not index.
    verbose_feature_names_out=False
)

deployment_model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(objective='binary:logistic',eval_metric='logloss'))
    ]
)

deployment_model_pipeline


In [None]:
deployment_model_pipeline.set_params(
    **{
        'classifier__learning_rate': best_params['classifier__learning_rate'],
        'classifier__max_depth': best_params['classifier__max_depth'],
        'classifier__n_estimators': best_params['classifier__n_estimators'],
        'preprocessor__text_preprocessor__vectorizer__max_features': best_params['trf1__vectorizer__max_features'],
        'preprocessor__text_preprocessor__vectorizer__ngram_range':  best_params['trf1__vectorizer__ngram_range']
    }
)

pprint("The best parameters are:")
pprint(best_params)

'The best parameters are:'
{'classifier__learning_rate': np.float64(0.24555278841790407),
 'classifier__max_depth': 5,
 'classifier__n_estimators': 463,
 'trf1__vectorizer__max_features': 5638,
 'trf1__vectorizer__ngram_range': (1, 2)}


In [46]:

X = data_news.drop(columns=['label', 'title', 'text_cleaned', 'title_length', 'subject'])
y = data_news.label



X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.20,
                                                    random_state=42
                                                    )

In [47]:
X.info(), y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38638 entries, 0 to 38637
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   text                     38638 non-null  object
 1   text_length              38638 non-null  int64 
 2   exclamations_mark_count  38638 non-null  int64 
 3   questions_mark_count     38638 non-null  int64 
 4   uppercase_words_count    38638 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 1.5+ MB
<class 'pandas.core.series.Series'>
RangeIndex: 38638 entries, 0 to 38637
Series name: label
Non-Null Count  Dtype
--------------  -----
38638 non-null  int64
dtypes: int64(1)
memory usage: 302.0 KB


(None, None)

In [48]:
# Let's us fit the final model
deployment_model_pipeline.fit(X_train, y_train)

In [49]:
# Evaluating our final model

y_pred = deployment_model_pipeline.predict(X_test)

f1score= f1_score(y_test,y_pred)*100

print(f'Deployment model accuracy is {f1score : 04.2f}%')
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, y_pred))



Deployment model accuracy is  98.94%
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4238
           1       0.99      0.99      0.99      3490

    accuracy                           0.99      7728
   macro avg       0.99      0.99      0.99      7728
weighted avg       0.99      0.99      0.99      7728



## Saving our deployment pipeline for api development

In [50]:
import joblib
joblib.dump(deployment_model_pipeline,'misinfo_detection_pipeline.pkl')

['misinfo_detection_pipeline.pkl']

In [52]:
model = joblib.load('misinfo_detection_pipeline.pkl')

In [54]:
model.feature_names_in_

array(['text', 'text_length', 'exclamations_mark_count',
       'questions_mark_count', 'uppercase_words_count'], dtype=object)