In [None]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset CSVs are read from under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory on the mounted Drive that holds both dataset CSVs. Kept in one
# place so the notebook can be pointed at another location with a single edit.
DATA_DIR = "/content/drive/MyDrive/NLP PROJECT/DATASET"

# Load the two source files into separate frames.
fake = pd.read_csv(DATA_DIR + "/Fake.csv")
true = pd.read_csv(DATA_DIR + "/True.csv")

In [None]:
# Attach the class label to each source frame: rows from Fake.csv get 1, rows from True.csv get 0.
fake = fake.assign(category=1)
true = true.assign(category=0)

# Stack both frames into one dataset and renumber the rows from 0.
df = pd.concat([fake, true], ignore_index=True)


In [None]:
# Partition the rows by label: category 1 is handled as the majority class, category 0 as the minority.
df_majority = df.loc[df['category'].eq(1)]
df_minority = df.loc[df['category'].eq(0)]

# Draw a seeded random subset of 3000 rows from each class.
df_majority_sampled, df_minority_sampled = (
    part.sample(n=3000, random_state=42) for part in (df_majority, df_minority)
)

# Stack the two subsets, then shuffle every row (seeded) and renumber the index from 0.
df_balanced = pd.concat([df_majority_sampled, df_minority_sampled])
df = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Optional: persist the balanced (pre-shuffle) frame.
# df_balanced.to_csv('balanced_dataset.csv', index=False)

In [None]:
# Re-split the current frame by label (1 -> majority, 0 -> minority).
df_majority = df.loc[df['category'].eq(1)]
df_minority = df.loc[df['category'].eq(0)]

# Seeded random draw from the majority class, sized to the minority class's row count.
df_majority_undersampled = df_majority.sample(n=len(df_minority), random_state=42)

# Recombine both classes and keep only the text and label columns.
df = pd.concat([df_majority_undersampled, df_minority])[['text', 'category']]

In [None]:
# Punctuation characters (a single string) consulted by the token filter.
punctuations = string.punctuation

# English spaCy pipeline, plus the stop-word collection from its language defaults.
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words


In [None]:
# Load pre-trained Word2Vec embeddings through gensim's downloader API.
# `wv` is used further down for `token in wv` membership tests, `wv[token]`
# vector lookups and `wv.vector_size`.
# NOTE(review): presumably this fetches the model on first use and may take a while — confirm.
wv = api.load('word2vec-google-news-300')



In [None]:
# Function to tokenize and preprocess text
def spacy_tokenizer(sentence):
    """Lemmatize, lowercase and filter a piece of text into a list of tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Each
    token's lemma is lowercased and stripped of surrounding whitespace, then
    dropped when it is empty, appears in ``stop_words``, or consists solely of
    characters found in ``punctuations``.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize. Any non-string value (for example ``None`` or a
        NaN coming from a missing DataFrame cell) is treated as empty text.

    Returns
    -------
    list of str
        The surviving lemmas, in document order.
    """
    # Guard against missing values instead of handing them to nlp().
    if not isinstance(sentence, str):
        return []

    doc = nlp(sentence)

    mytokens = []
    for word in doc:
        lemma = word.lemma_.lower().strip()
        # Test character by character: `lemma in punctuations` would be a
        # substring test on the punctuation string, which lets runs such as
        # "..." or "--" through. all() over an empty lemma is True, so blank
        # tokens are discarded here as well.
        if all(ch in punctuations for ch in lemma):
            continue
        if lemma in stop_words:
            continue
        mytokens.append(lemma)

    return mytokens

In [None]:
# Tokenize every entry of the 'text' column and keep the token lists in a new 'data' column.
df['data'] = df['text'].map(spacy_tokenizer)

In [None]:
def _mean_embedding(tokens):
    """Average the `wv` vectors of the tokens that `wv` contains.

    When none of the tokens is found in `wv`, the mean is taken over a single
    all-zero vector of length `wv.vector_size` instead.
    """
    known = [wv[token] for token in tokens if token in wv]
    if not known:
        known = [np.zeros(wv.vector_size)]
    return np.mean(known, axis=0)


# One averaged embedding per document, stored in the 'vec' column.
df['vec'] = df['data'].apply(_mean_embedding)

In [None]:
# Stack the per-document vectors row-wise into a 2-D feature matrix; labels come from 'category'.
X = np.vstack(df['vec'].tolist())
y = df['category']

In [None]:
# Keep 80% of the rows for training and hold out the rest; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)

In [None]:
# Single-step pipeline: logistic regression with the iteration cap set to 1000.
model_pipeline_lr = Pipeline(steps=[('lr', LogisticRegression(max_iter=1000))])

# Fit on the training split, then predict labels for the held-out rows
# (fit() hands back the pipeline, so the two calls can be chained).
y_pred_lr = model_pipeline_lr.fit(X_train, y_train).predict(X_test)

In [None]:
# Accuracy, plus precision / recall / F1 computed with average='weighted', on the test split.
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr, recall_lr, f1_score_lr = (
    metric(y_test, y_pred_lr, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
classification_report_lr = classification_report(y_test, y_pred_lr)

# Print the scalar metrics first, then the per-class report under its header.
for caption, score in (
    ("Accuracy:", accuracy_lr),
    ("Precision:", precision_lr),
    ("Recall:", recall_lr),
    ("F1 Score:", f1_score_lr),
):
    print(caption, score)
print("Evaluation Metrics for Logistic Regression Model")
print("------------------------------------------------")
print(classification_report_lr)

Accuracy: 0.9266666666666666
Precision: 0.927783724043807
Recall: 0.9266666666666666
F1 Score: 0.9266452748661159
Evaluation Metrics for Logistic Regression Model
------------------------------------------------
              precision    recall  f1-score   support

           0       0.91      0.95      0.93       592
           1       0.95      0.90      0.93       608

    accuracy                           0.93      1200
   macro avg       0.93      0.93      0.93      1200
weighted avg       0.93      0.93      0.93      1200

