In [None]:
# Step 1: Install Required Libraries
# The leading "!" is the IPython/Jupyter shell escape, so this line only works
# inside a notebook kernel; it is not valid syntax in a plain .py script.
# NOTE(review): textblob is installed here but is not imported anywhere in the
# visible code — confirm whether it is still needed.
!pip install nltk textblob scikit-learn

# Step 2: Import Libraries
import csv
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Step 3: Define LIAR dataset column names
# Names are listed in file order, one per tab-separated field, grouped here by
# what they describe so the 14-column layout is easier to scan.
_statement_fields = ['id', 'label', 'statement', 'subject']
_speaker_fields = ['speaker', 'job_title', 'state', 'party']
_history_fields = [
    'barely_true',
    'false',
    'half_true',
    'mostly_true',
    'pants_on_fire',
]
columns = _statement_fields + _speaker_fields + _history_fields + ['context']

# Step 4: Load .tsv files with correct headers
def _read_liar_split(path):
    """Read one headerless, tab-separated LIAR split into a DataFrame.

    Args:
        path: Location of the .tsv file.

    Returns:
        pandas.DataFrame whose columns are named by the module-level `columns`.
    """
    # quoting=csv.QUOTE_NONE: statements contain literal double quotes. With
    # pandas' default quoting those are interpreted as field delimiters, and
    # an unbalanced quote makes the parser swallow tabs/newlines and merge
    # several records into one, silently losing rows.
    return pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=columns,
        quoting=csv.QUOTE_NONE,
    )


train_df = _read_liar_split('/content/train.tsv')
test_df = _read_liar_split('/content/test.tsv')
valid_df = _read_liar_split('/content/valid.tsv')

# Step 5: Combine all data
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat([train_df, test_df, valid_df], ignore_index=True)

# Step 6: Convert labels to binary (FAKE = 0, REAL = 1)
fake_labels = ['false', 'barely-true', 'pants-fire']
real_labels = ['true', 'mostly-true', 'half-true']


def _to_binary(label):
    """Return 0 for a fake-side rating, 1 for a real-side rating, else None."""
    if label in fake_labels:
        return 0
    if label in real_labels:
        return 1
    return None


df['binary_label'] = df['label'].apply(_to_binary)
# Rows whose rating is in neither list received None above; discard them.
df.dropna(subset=['binary_label'], inplace=True)

# Step 7: Combine statement and context for full text
# A missing value on either side would make the `+` concatenation produce NaN
# for the whole row, discarding the statement text. Treat missing pieces as
# empty strings, then trim the separator left dangling when one side is empty.
df['content'] = (
    df['statement'].fillna('') + ' ' + df['context'].fillna('')
).str.strip()

# Step 8: Preprocess text manually without using nltk.tokenize
# The stop-word corpus is not bundled with the nltk package; on a fresh
# runtime the lookup raises LookupError. Fetch it on demand and retry rather
# than failing, and skip the download entirely when it is already present.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalize raw text into a space-joined string of stemmed tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, the result is split on whitespace, stop words are
    discarded, and the remaining words are stemmed.

    Args:
        text: Raw text. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as empty text.

    Returns:
        str: The stemmed tokens joined by single spaces, or ``""`` when
        nothing remains.
    """
    # Missing values would otherwise be stringified into a spurious
    # "none"/"nan" token that ends up in the vocabulary.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ""
    text = str(text).lower()
    # After lower() there are no uppercase ASCII letters left, so a-z suffices.
    text = re.sub(r'[^a-z\s]', '', text)
    # Stop words are filtered before stemming, i.e. on the unstemmed form.
    return " ".join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )

# Run the cleaning function over every row's combined text, element by element.
df['clean_text'] = df['content'].map(preprocess_text)

# Step 9: Train-Test Split
# Split the cleaned text *before* vectorizing so that the TF-IDF vocabulary
# and IDF weights are learned from training rows only; fitting on the full
# corpus lets test-set statistics leak into the features.
y = df['binary_label'].astype(int).values
train_text, test_text, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.2, random_state=42
)

# Step 10: TF-IDF Vectorization
# The matrices are kept sparse (no .toarray()): LogisticRegression accepts
# sparse input, and a dense n_docs x 5000 float matrix wastes memory.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

# Full-corpus features under the train-fitted vocabulary, kept so that any
# later cell referencing `X` still finds it.
X = vectorizer.transform(df['clean_text'])

# Step 11: Train Logistic Regression
# Default hyperparameters; fit() hands back the fitted estimator, so the
# construction and training can be written as one expression.
model = LogisticRegression().fit(X_train, y_train)

# Step 12: Predict and Evaluate
y_pred = model.predict(X_test)

# Numeric class -> display name (0 -> 'fake', 1 -> 'real')
label_mapping = dict(enumerate(['fake', 'real']))

# Translate both the predictions and the ground truth to their display names
# so the report and matrix are computed on readable string labels.
y_pred_labels = list(map(label_mapping.__getitem__, y_pred))
y_test_labels = list(map(label_mapping.__getitem__, y_test))

report = classification_report(y_test_labels, y_pred_labels)
matrix = confusion_matrix(y_test_labels, y_pred_labels)

# Per-class precision / recall / f1
print("📊 Classification Report:\n")
print(report)

# Rows are true classes, columns are predicted classes
print("🧩 Confusion Matrix:\n")
print(matrix)


📊 Classification Report:

              precision    recall  f1-score   support

        fake       0.60      0.49      0.54      1121
        real       0.65      0.75      0.70      1438

    accuracy                           0.63      2559
   macro avg       0.63      0.62      0.62      2559
weighted avg       0.63      0.63      0.63      2559

🧩 Confusion Matrix:

[[ 544  577]
 [ 362 1076]]
