In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be
# read later in the notebook.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import pandas as pd  # NOTE(review): redundant, pandas is already imported in the first cell

# Load the training CSV into a DataFrame.
# on_bad_lines='skip' makes pandas drop malformed lines instead of raising, so
# rows can disappear silently; compare the frame's length with the file if that matters.
# NOTE(review): '/content/train.csv' is an absolute path (presumably a Colab
# upload location); confirm before running in another environment.
news_dataset = pd.read_csv('/content/train.csv', on_bad_lines='skip')




In [None]:
# Data Pre-processing
# Fill every missing value in the frame with '' so the string concatenation
# below cannot produce NaN.
# NOTE(review): fillna('') is applied to all columns, 'label' included; confirm
# the label column has no nulls, otherwise it would end up with mixed types.
news_dataset = news_dataset.fillna('')  # Replacing null values with empty string
# Single text feature used by the rest of the notebook: "<author> <title>".
news_dataset['content'] = news_dataset['author'] + ' ' + news_dataset['title']  # Merging author and title


In [None]:
# Stemming
# One shared Porter stemmer instance, used by stemming() for every token.
port_stem = PorterStemmer()

In [None]:
# Build the stop-word lookup once. The previous version evaluated
# stopwords.words('english') inside the comprehension condition, i.e. once per
# token, and then searched the returned list linearly. A frozenset gives the
# same membership answers with a constant-time lookup.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a piece of text into a space-separated string of word stems.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lower-case the text and split it on whitespace,
      3. drop English stop words,
      4. reduce each remaining token to its Porter stem.

    Parameters
    ----------
    content : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving stems joined by single spaces ('' if nothing survives).
    """
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stem.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    )


In [None]:
# Replace each row's "author title" text with its cleaned, stemmed form.
# NOTE(review): 'content' is overwritten in place, so re-running this cell
# feeds already-stemmed text through stemming() again.
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [None]:
# Separating the data and label
# X: values of the 'content' column (still text at this point).
# Y: values of the 'label' column (the classification target).
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [None]:
# Splitting the dataset to training & test data, then converting text to numbers.
#
# The split is done on the raw text *before* TF-IDF is fitted, so the
# vocabulary and IDF weights are learned from the training rows only. Fitting
# the vectorizer on the whole dataset first (as this notebook did previously)
# leaks test-set statistics into the features and can bias the reported test
# scores.
#
# train_test_split chooses rows from the sample count, the stratify labels and
# random_state, so the same rows end up in each split as before.
# X itself is left untouched (raw text), which also makes this cell safe to re-run.
X_train_text, X_test_text, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=2
)

# Learn the TF-IDF vocabulary/weights on training text; reuse them for the test text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

In [None]:
# Logistic Regression Model Training
# Default hyper-parameters; fitted on the training features and labels.
logistic_model = LogisticRegression()
logistic_model.fit(X_train, Y_train)

In [None]:

# Predictions for Logistic Regression
# Despite the X_ prefix, these hold the model's predicted labels for the
# training split and the test split respectively.
X_train_pred_lr = logistic_model.predict(X_train)
X_test_pred_lr = logistic_model.predict(X_test)

In [None]:
# Logistic Regression scores: accuracy on the training split, then
# accuracy / precision / recall / F1 on the test split.
train_accuracy_lr = accuracy_score(Y_train, X_train_pred_lr)
test_accuracy_lr, precision_lr, recall_lr, f1_score_lr = (
    score_fn(Y_test, X_test_pred_lr)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the Logistic Regression scores, one metric per line.
print(
    f'Logistic Regression - Training Accuracy: {train_accuracy_lr}',
    f'Logistic Regression - Test Accuracy: {test_accuracy_lr}',
    f'Logistic Regression - Precision: {precision_lr}',
    f'Logistic Regression - Recall: {recall_lr}',
    f'Logistic Regression - F1 Score: {f1_score_lr}',
    sep='\n',
)

Logistic Regression - Training Accuracy: 0.9852335164835165
Logistic Regression - Test Accuracy: 0.9745192307692307
Logistic Regression - Precision: 0.9579857893110905
Logistic Regression - Recall: 0.9926376440460948
Logistic Regression - F1 Score: 0.9750039301996541


In [None]:
# Random Forest Model Training
# random_state=2 pins the forest's random choices so repeated runs give the
# same model; all other hyper-parameters are the library defaults.
rf_model = RandomForestClassifier(random_state=2)
rf_model.fit(X_train, Y_train)

In [None]:
# Predictions for Random Forest
# Despite the X_ prefix, these hold the model's predicted labels for the
# training split and the test split respectively.
X_train_pred_rf = rf_model.predict(X_train)
X_test_pred_rf = rf_model.predict(X_test)

In [None]:
# Random Forest scores: accuracy on the training split, then
# accuracy / precision / recall / F1 on the test split.
train_accuracy_rf = accuracy_score(Y_train, X_train_pred_rf)
test_accuracy_rf, precision_rf, recall_rf, f1_score_rf = (
    score_fn(Y_test, X_test_pred_rf)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the Random Forest scores, one metric per line.
print(
    f'Random Forest - Training Accuracy: {train_accuracy_rf}',
    f'Random Forest - Test Accuracy: {test_accuracy_rf}',
    f'Random Forest - Precision: {precision_rf}',
    f'Random Forest - Recall: {recall_rf}',
    f'Random Forest - F1 Score: {f1_score_rf}',
    sep='\n',
)

Random Forest - Training Accuracy: 1.0
Random Forest - Test Accuracy: 0.9926282051282052
Random Forest - Precision: 0.9898154042011458
Random Forest - Recall: 0.9955185659411011
Random Forest - F1 Score: 0.992658793488669


In [None]:
# Decision Tree Model Training
# random_state=2 pins the tree's random choices so repeated runs give the
# same model; all other hyper-parameters are the library defaults.
dt_model = DecisionTreeClassifier(random_state=2)
dt_model.fit(X_train, Y_train)

In [None]:
# Predictions for Decision Tree
# Despite the X_ prefix, these hold the model's predicted labels for the
# training split and the test split respectively.
X_train_pred_dt = dt_model.predict(X_train)
X_test_pred_dt = dt_model.predict(X_test)

In [None]:
# Decision Tree scores: accuracy on the training split, then
# accuracy / precision / recall / F1 on the test split.
train_accuracy_dt = accuracy_score(Y_train, X_train_pred_dt)
test_accuracy_dt, precision_dt, recall_dt, f1_score_dt = (
    score_fn(Y_test, X_test_pred_dt)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the Decision Tree scores, one metric per line.
print(
    f'Decision Tree - Training Accuracy: {train_accuracy_dt}',
    f'Decision Tree - Test Accuracy: {test_accuracy_dt}',
    f'Decision Tree - Precision: {precision_dt}',
    f'Decision Tree - Recall: {recall_dt}',
    f'Decision Tree - F1 Score: {f1_score_dt}',
    sep='\n',
)

Decision Tree - Training Accuracy: 1.0
Decision Tree - Test Accuracy: 0.9942307692307693
Decision Tree - Precision: 0.9945547725816785
Decision Tree - Recall: 0.9939180537772087
Decision Tree - F1 Score: 0.9942363112391931


In [None]:
# Making a Predictive System with Logistic Regression (as an example)
# Take one row of the test features (index 17, an arbitrary example) and
# classify it; predict() returns a one-element array of labels.
X_new = X_test[17]
prediction = logistic_model.predict(X_new)
print(prediction)

[1]


In [None]:
# Turn the predicted label into a verdict: 0 is reported as real, anything else as fake.
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

The news is Fake


In [None]:
# Ground-truth label of the same test row (index 17), for comparison with the
# predicted label printed earlier.
print(Y_test[17])

1
