In [12]:
import pandas as pd

# Load the dataset and keep only rows where both the text and the label are
# present; the index is renumbered so it is contiguous after the drop.
df = (
    pd.read_csv("fakenews.csv")
    .dropna(subset=['text', 'label'])
    .reset_index(drop=True)
)

# Quick sanity checks: overall size and how many rows each label has.
print("shape:", df.shape)
label_counts = df['label'].value_counts()
print(label_counts)
df.head()



shape: (15, 2)
label
real    8
fake    7
Name: count, dtype: int64


Unnamed: 0,text,label
0,Breaking news: Scientists discover water on Mars,real
1,Shocking! Man grows wings after drinking energ...,fake
2,Government launches new space mission next month,real
3,Alien spaceship spotted over New York City,fake
4,New policy announced for electric vehicles,real


In [13]:
from sklearn.model_selection import train_test_split

# Features are the raw text, targets are the labels.
X, y = df['text'], df['label']

# Hold out 20% of the rows; stratifying on y keeps the label proportions
# similar in both partitions, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("train:", len(X_train), "test:", len(X_test))
train_label_counts = y_train.value_counts()
test_label_counts = y_test.value_counts()
print(train_label_counts, test_label_counts)


train: 12 test: 3
label
fake    6
real    6
Name: count, dtype: int64 label
real    2
fake    1
Name: count, dtype: int64


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with English stop words removed and the vocabulary capped
# at 2000 terms.
vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')

# The vocabulary is fitted on the training split only; the test split is
# transformed with that already-fitted vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

train_matrix_shape = X_train_vec.shape
print("X_train_vec shape:", train_matrix_shape)


X_train_vec shape: (12, 57)


In [15]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF training matrix; the iteration cap is
# raised to 1000 and the seed is fixed at 42.
logreg_params = {"max_iter": 1000, "random_state": 42}
model = LogisticRegression(**logreg_params)
model.fit(X_train_vec, y_train)


In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the held-out split with the fitted logistic-regression model.
y_pred = model.predict(X_test_vec)

# Compute each metric once, then report them in the same order as before.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification report:\n", report_text)
print("\nConfusion matrix:\n", confusion)


Accuracy: 1.0

Classification report:
               precision    recall  f1-score   support

        fake       1.00      1.00      1.00         1
        real       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3


Confusion matrix:
 [[1 0]
 [0 2]]


In [17]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Cross-validate on the full data set. Vectorizer and classifier are wrapped
# in one pipeline so the TF-IDF vocabulary is refitted inside every fold.
from sklearn.pipeline import make_pipeline

tfidf_step = TfidfVectorizer(stop_words='english', max_features=2000)
logreg_step = LogisticRegression(max_iter=1000, random_state=42)
pipe = make_pipeline(tfidf_step, logreg_step)

# 5 folds here; drop to cv=3 if a class has too few samples for 5 folds.
scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=5)
print("CV accuracy scores:", scores)
mean_cv_accuracy = np.mean(scores)
print("Mean CV accuracy:", mean_cv_accuracy)


CV accuracy scores: [0.66666667 0.33333333 1.         0.33333333 0.33333333]
Mean CV accuracy: 0.5333333333333334


In [18]:
import joblib
from pathlib import Path

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; otherwise a run from a fresh checkout without
# a "src" directory fails with FileNotFoundError.
Path("src").mkdir(parents=True, exist_ok=True)

# The classifier and the TF-IDF vectorizer are written as two separate
# artifacts; both have to be loaded to score new text.
joblib.dump(model, "src/fake_news_model.pkl")
joblib.dump(vectorizer, "src/tfidf_vectorizer.pkl")



['src/tfidf_vectorizer.pkl']

In [19]:
def predict_news(text):
    """Return the label predicted for a single piece of text.

    Relies on the notebook-level ``vectorizer`` and ``model`` objects: the
    text is wrapped in a one-element list, transformed with ``vectorizer``,
    and the first (only) entry of ``model.predict`` is returned.
    """
    features = vectorizer.transform([text])
    predictions = model.predict(features)
    return predictions[0]

# Smoke-test the helper on one hand-written headline.
sample_headline = "Government launches new space mission tomorrow"
print(predict_news(sample_headline))



real


In [20]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes baseline, trained and scored on the same TF-IDF
# matrices as the logistic-regression model.
nb_model = MultinomialNB().fit(X_train_vec, y_train)

nb_pred = nb_model.predict(X_test_vec)
nb_accuracy = accuracy_score(y_test, nb_pred)

print("Accuracy:", nb_accuracy)


Accuracy: 1.0


In [21]:
from sklearn.svm import LinearSVC

# Linear SVM baseline, trained and scored on the same TF-IDF matrices as the
# other models. The seed is pinned to 42, matching the train/test split and
# the logistic-regression model: LinearSVC accepts a random_state for its
# solver's data shuffling, and leaving it unset means results are not
# guaranteed to repeat between runs.
svm = LinearSVC(random_state=42)
svm.fit(X_train_vec, y_train)

svm_pred = svm.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, svm_pred))


Accuracy: 1.0
