In [1]:
import pandas as pd
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the training and test splits into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

# Preprocessing function
def preprocess(row) -> str:
    """Build one normalized text string from a row's author, title and text.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by the keys
            'author', 'title' and 'text'.

    Returns:
        The three fields joined by single spaces, lower-cased, with ASCII
        punctuation deleted and English stop words dropped. Remaining tokens
        are separated by single spaces.

    Raises:
        KeyError: If any of the three keys is absent from ``row``.
    """
    # Anything that is not a str (for instance a missing value) counts as empty.
    fields = [
        row[column] if isinstance(row[column], str) else ''
        for column in ('author', 'title', 'text')
    ]
    lowered = ' '.join(fields).lower()

    # Punctuation is deleted, not replaced by a space, so "well-known"
    # collapses to "wellknown" and "it's" to "its".
    stripped = lowered.translate(str.maketrans('', '', string.punctuation))

    # Stop words are matched after punctuation removal, one token at a time.
    kept_tokens = (
        token for token in stripped.split() if token not in ENGLISH_STOP_WORDS
    )
    return ' '.join(kept_tokens)

# Add the cleaned `content` column to both splits, one row at a time
for frame in (train, test):
    frame['content'] = frame.apply(preprocess, axis=1)


In [2]:
# Turn the cleaned text into TF-IDF features
max_vocab = 5000  # cap the vocabulary at the top 5000 features for computational efficiency
y_train = train['label']

vectorizer = TfidfVectorizer(max_features=max_vocab)
# Fit on the training text only, then apply the same fitted vectorizer to the test text
X_train = vectorizer.fit_transform(train['content'])
X_test = vectorizer.transform(test['content'])


In [3]:
from sklearn.linear_model import LogisticRegression

# Configure the classifier, then fit it on the TF-IDF training matrix
logreg_options = {'max_iter': 1000}
model = LogisticRegression(**logreg_options)
model.fit(X_train, y_train)


In [4]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on test set
y_pred = model.predict(X_test)

# Load the reference labels from submit.csv
# NOTE(review): confirm that submit.csv really holds ground-truth labels for
# test.csv. If it is only a sample-submission template, the metrics below do
# not measure model quality.
submit = pd.read_csv("/content/submit.csv")

# The metrics compare y_true[i] with y_pred[i], so the labels must follow the
# same row order as `test`. When both frames carry an `id` column, line the
# labels up by id instead of trusting file order; otherwise fall back to the
# positional pairing.
if 'id' in submit.columns and 'id' in test.columns:
    labels_by_id = submit.set_index('id')['label']
    unmatched = ~test['id'].isin(labels_by_id.index)
    if unmatched.any():
        raise ValueError(
            f"{int(unmatched.sum())} test ids have no label in submit.csv"
        )
    y_true = labels_by_id.loc[test['id'].to_numpy()].reset_index(drop=True)
else:
    y_true = submit['label']

# Fail early with a readable message instead of scoring mismatched arrays
# (this also catches duplicated ids in submit.csv, which would add rows).
if len(y_true) != len(y_pred):
    raise ValueError(
        f"Label/prediction count mismatch: {len(y_true)} labels vs {len(y_pred)} predictions"
    )

# Calculate and print accuracy and other metrics
print(f"Accuracy: {accuracy_score(y_true, y_pred) * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_true, y_pred))


Accuracy: 63.52%

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.64      0.61      2339
           1       0.68      0.63      0.66      2861

    accuracy                           0.64      5200
   macro avg       0.63      0.64      0.63      5200
weighted avg       0.64      0.64      0.64      5200



In [5]:
from joblib import dump

# Persist the fitted classifier together with the TF-IDF vectorizer that
# produced its input features
model_path = '/content/logistic_regression_model.joblib'
vectorizer_path = '/content/tfidf_vectorizer.joblib'

dump(model, model_path)
dump(vectorizer, vectorizer_path)


['/content/tfidf_vectorizer.joblib']