In [3]:
import pandas as pd  # Import the pandas library for data manipulation
from sklearn.feature_extraction.text import TfidfVectorizer  # Import TF-IDF vectorizer for text feature extraction


In [4]:

# Load the news articles and their labels into a pandas DataFrame.
# 'news.csv' is a relative path, so it is resolved against the notebook's working directory.
dataframe = pd.read_csv('news.csv')


In [5]:
# Preview the first rows of the DataFrame (head() returns 5 rows by default).
# As the last expression in the cell, the result is rendered as the cell output.
dataframe.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [6]:
# Select the model input and the target from the DataFrame:
# - x: the body text of each news article
# - y: the corresponding label ('FAKE' or 'REAL')
x = dataframe['text']
y = dataframe['label']

In [7]:
# Import the helper that splits the data into training and testing sets for model evaluation
from sklearn.model_selection import train_test_split


In [8]:
# Hold out 20% of the articles for testing; random_state=0 fixes the shuffle so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


In [9]:
# Create a TF-IDF vectorizer to convert text into numerical features
# - stop_words='english': Remove common English words (e.g., the, a, an) that don't contribute much meaning
# - max_df=0.7: Ignore words that appear in more than 70% of the documents (filters out overly common words only;
#   rare words are not affected -- that is controlled by min_df, which is left at its default here)
tfvect = TfidfVectorizer(stop_words='english', max_df=0.7)

In [10]:
# Fit the TF-IDF vectorizer on the training data only (learns the vocabulary and IDF weights)
# and, in the same call, convert the training articles into a TF-IDF feature matrix
tfid_x_train = tfvect.fit_transform(x_train)

In [11]:
# Transform the testing data with the already-fitted vectorizer (no refitting), so the test
# features use the vocabulary and IDF weights learned from the training data
tfid_x_test = tfvect.transform(x_test)

In [13]:
# Create a machine learning model for fake news detection (Passive Aggressive Classifier is used here)
# - max_iter=50: Set the maximum number of passes over the training data to 50
# - random_state=0: Fix the seed used to shuffle the training data between passes, so that
#   re-running the notebook yields the same model, accuracy and confusion matrix
#   (same seed value as the train/test split)
from sklearn.linear_model import PassiveAggressiveClassifier

classifier = PassiveAggressiveClassifier(max_iter=50, random_state=0)

In [14]:
# Train the model on the TF-IDF training features and their labels
classifier.fit(tfid_x_train, y_train)

In [15]:
# Use the trained model to predict a label ('FAKE' or 'REAL') for every article in the held-out test set
y_pred = classifier.predict(tfid_x_test)

In [16]:
# Import the metric used to calculate the model's accuracy (fraction of correct predictions)
from sklearn.metrics import accuracy_score

In [17]:
# Compare the predictions with the true test labels and report the accuracy
# as a percentage rounded to two decimal places
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.45%


In [19]:
# Create a confusion matrix to see how many predictions were correct or incorrect for each class (fake and real)
# Rows are the true labels and columns are the predicted labels, both ordered as labels=['FAKE', 'REAL'],
# so the diagonal counts correct predictions and the off-diagonal cells count misclassifications
from sklearn.metrics import confusion_matrix
cf = confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])
print(cf)


[[572  43]
 [ 40 612]]


In [20]:
# Classify a single news article using the fitted TF-IDF vectorizer and the trained classifier
def fake_news_det(news):
    """Print the predicted label for one news article.

    The article is wrapped in a one-element list, converted to TF-IDF
    features with the notebook-level ``tfvect`` and passed to the
    notebook-level ``classifier``. The prediction array is printed;
    nothing is returned.

    Args:
        news: Text of the news article to classify.
    """
    # transform() expects a collection of documents, hence the one-element list
    features = tfvect.transform([news])

    # Show the prediction array exactly as the classifier returns it
    print(classifier.predict(features))

In [21]:
# Example usage: Classify a news article
# The text uses a real right single quotation mark (’). A mis-decoded "â€™" sequence
# (UTF-8 bytes read as cp1252) would be tokenized differently from the training articles.
fake_news_det('U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism.')

['REAL']


In [22]:
# Second example; the apostrophe is a real right single quotation mark (’) rather than
# the mis-decoded "â€™" sequence, so the text matches how the training articles are encoded.
fake_news_det("""Go to Article President Barack Obama has been campaigning hard for the woman who is supposedly going to extend his legacy four more years. The only problem with stumping for Hillary Clinton, however, is she’s not exactly a candidate easy to get too enthused about. """)

['FAKE']
