In [61]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [33]:
true = pd.read_csv("True.csv")
fake = pd.read_csv("Fake.csv")

In [34]:
fake.shape

(23481, 4)

In [35]:
true.shape

(21417, 4)

In [36]:
fake['target'] = 'fake'
true['target'] = 'true'

In [37]:
df_combined = pd.concat([true, fake], ignore_index=True)

In [38]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    cleaned_text = ' '.join(tokens)
    return cleaned_text


In [40]:
true['label'] = 0
fake['label'] = 1

In [41]:
combined_df = pd.concat([true, fake], axis=0)

In [42]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')  # You can customize max_features


In [55]:
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_df['text'])


In [57]:
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


In [59]:
print(tfidf_df.shape)

(44898, 5000)


In [63]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, combined_df['label'], test_size=0.2, random_state=42)


In [65]:
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

In [67]:
y_pred = logistic_regression_model.predict(X_test)

In [103]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming combined_df is already loaded and preprocessed
# Create the TF-IDF Vectorizer and fit it on the combined dataset
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_df['text'])

# Split the data into features and labels
X = tfidf_matrix
y = combined_df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Logistic Regression model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

# Predict on the test set
y_pred = logistic_regression_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 98.79%

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4330
           1       0.99      0.98      0.99      4650

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
[[4294   36]
 [  73 4577]]


In [71]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 98.79%


In [73]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4330
           1       0.99      0.98      0.99      4650

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [75]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Confusion Matrix:
[[4294   36]
 [  73 4577]]


In [105]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_df['text'])

# Save the TF-IDF vectorizer to a file
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)


In [106]:
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

# Save the logistic regression model to a file
with open('logistic_model.pkl', 'wb') as model_file:
    pickle.dump(logistic_regression_model, model_file)

In [113]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Load the trained model and vectorizer
with open('logistic_model.pkl', 'rb') as model_file:
    logistic_regression_model = pickle.load(model_file)

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

def predict_fake_news(text):
    text_transformed = tfidf_vectorizer.transform([text])
    prediction = logistic_regression_model.predict(text_transformed)
    return "Fake News" if prediction == 1 else "Real News"

if __name__ == "__main__":
    input_text = input("Enter the news text: ")
    result = predict_fake_news(input_text)
    print(f"The entered news is likely: {result}")


Enter the news text:  donald trump is the president


The entered news is likely: Fake News


In [117]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

# Load datasets
true = pd.read_csv("True.csv")
fake = pd.read_csv("Fake.csv")

# Add target labels
true['target'] = 'true'
fake['target'] = 'fake'

# Combine datasets
df_combined = pd.concat([true, fake], ignore_index=True)

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Text cleaning function
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    cleaned_text = ' '.join(tokens)
    return cleaned_text

# Apply text cleaning
df_combined['text'] = df_combined['text'].apply(clean_text)

# Assign labels
df_combined['label'] = df_combined['target'].apply(lambda x: 0 if x == 'true' else 1)

# Create TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df_combined['text'])

# Convert to DataFrame for easier manipulation (optional)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, df_combined['label'], test_size=0.2, random_state=42)

# Train the Logistic Regression model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

# Predict on the test set
y_pred = logistic_regression_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Save the model and vectorizer for future use
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

with open('logistic_model.pkl', 'wb') as model_file:
    pickle.dump(logistic_regression_model, model_file)

# Function to predict fake news on new data
def predict_fake_news(text):
    with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
        tfidf_vectorizer = pickle.load(vectorizer_file)
    
    with open('logistic_model.pkl', 'rb') as model_file:
        logistic_regression_model = pickle.load(model_file)
    
    text_transformed = tfidf_vectorizer.transform([text])
    prediction = logistic_regression_model.predict(text_transformed)
    return "Fake News" if prediction == 1 else "Real News"

# Example usage with a sample news article:
if __name__ == "__main__":
    sample_news = "The government has announced a new economic policy to boost the economy and create jobs. This policy is expected to reduce unemployment by 10% over the next year."
    result = predict_fake_news(sample_news)
    print(f"The entered news is likely: {result}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 98.64%

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4330
           1       0.99      0.98      0.99      4650

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
[[4290   40]
 [  82 4568]]
The entered news is likely: Fake News


