# News Article Classification Notebook

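A step-by-step guide: load and clean the sample news data, train a TF-IDF + logistic regression classifier, evaluate it on a held-out split, and classify new example articles.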

In [None]:
import pandas as pd
from utils import clean_text
from sklearn.pipeline import Pipeline

# Read the sample corpus, then attach a cleaned copy of the raw 'text'
# column under 'content_clean' for the modelling cells to use.
df = pd.read_csv('data/sample_news.csv')
cleaned_text = df['text'].apply(clean_text)
df = df.assign(content_clean=cleaned_text)
df.head()


## Build and train the model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# Two-stage text classifier: TF-IDF features feeding a logistic regression.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000)),
]
pipe = Pipeline(pipeline_steps)

# Binary target: 1 when the label reads 'real' (case-insensitive), 0 for
# every other value. This first fit uses all rows of df.
all_texts = df['content_clean'].values
all_targets = df['label'].apply(lambda x: int(str(x).lower() == 'real')).values
pipe.fit(all_texts, all_targets)


## Evaluate the model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
features = df['content_clean']
target = df['label'].apply(lambda x: int(str(x).lower() == 'real'))
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Refit the pipeline on the training portion only, so the held-out rows
# are unseen when they are scored below.
pipe.fit(X_train, y_train)

# Predicted classes for the held-out rows
y_pred = pipe.predict(X_test)

# Print each heading followed by its metric, computed on the test split.
for heading, metric in (
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))


## Predict new articles

In [None]:
# Example articles to classify.
# The first string previously contained mojibake ("NASAâ€™s": a UTF-8
# apostrophe decoded with the wrong codec), which corrupted both the printed
# text and the input handed to the model. A plain ASCII apostrophe is used
# so the literal survives any future re-encoding of the notebook.
articles = [
    "NASA's James Webb Telescope has captured unprecedented images of distant galaxies.",
    "A recent study claims that eating chocolate every day can cure all known diseases."
]

for art in articles:
    # Apply the same clean_text step that produced the training column.
    clean_art = clean_text(art)
    # predict expects an iterable of documents, hence the one-element list.
    pred = pipe.predict([clean_art])
    print(f"Article: {art}")
    # Class 1 was encoded from the 'real' label; everything else is 0.
    print("Prediction:", "Real" if pred[0] == 1 else "Fake")
    print("-" * 50)
