In [178]:
# Importing libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download stopwords
nltk.download('stopwords')

# Load dataset
fake_data = pd.read_csv('/users/rahul/Desktop/ML/news.csv')
fake_data = fake_data.fillna('')

# Combine author and title to form the content
fake_data['content'] = fake_data['author'] + ' ' + fake_data['title']

# Initialize stemmer
port_stem = PorterStemmer()

# Stemming function
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content]
    return ' '.join(stemmed_content)

# Apply stemming
fake_data['content'] = fake_data['content'].apply(stemming)

# Features and labels
x = fake_data['content']
y = fake_data['label']

# Vectorize text data
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english', ngram_range=(1, 2))
x = vectorizer.fit_transform(x)

# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

# Train using Logistic Regression
model = LogisticRegression()
model.fit(x_train, y_train)

# Evaluate on training and test data
x_train_pred = model.predict(x_train)
x_test_pred = model.predict(x_test)

train_accuracy = accuracy_score(y_train, x_train_pred)
test_accuracy = accuracy_score(y_test, x_test_pred)

print("Accuracy score of the training data:", train_accuracy)
print("Accuracy score of the test data:", test_accuracy)

# Cross-validation scores
cv_scores = cross_val_score(model, x, y, cv=5)
print("Cross-validated scores:", cv_scores)
print("Mean cross-validated accuracy:", cv_scores.mean())

# Predicting a single instance
x_new = x_test[3]
prediction = model.predict(x_new)

print("Prediction:", prediction)
print("Actual label:", y_test.iloc[1])

if prediction[0] == 0:
    print("The News is Real")
else:
    print("The News is Fake")

[nltk_data] Downloading package stopwords to /Users/rahul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy score of the training data: 0.9903846153846154
Accuracy score of the test data: 0.9875
Cross-validated scores: [0.98557692 0.98509615 0.9875     0.98485577 0.98533654]
Mean cross-validated accuracy: 0.985673076923077
Prediction: [0]
Actual label: 0
The News is Real
