In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import re

# Fetch the NLTK resources this script relies on: the VADER lexicon used by
# SentimentIntensityAnalyzer and the English stop-word list.
for _nltk_resource in ('vader_lexicon', 'stopwords'):
    nltk.download(_nltk_resource)

# Load dataset
def load_dataset(filepath):
    """Read a tab-separated file into a DataFrame.

    Rows that pandas cannot parse (e.g. wrong number of fields) are skipped
    rather than aborting the load.
    """
    read_options = {'sep': '\t', 'on_bad_lines': 'skip'}
    frame = pd.read_csv(filepath, **read_options)
    return frame

# Text cleaning and preprocessing
def clean_text(text):
    """Normalise a raw text cell for feature extraction.

    Removes every character that is not an ASCII letter or whitespace,
    lower-cases the result and strips leading/trailing whitespace.

    Missing values (``None`` or float NaN, which is how pandas represents
    empty cells) are returned as an empty string instead of raising
    ``TypeError`` inside ``re.sub``; other non-string values are converted
    with ``str`` first.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetical characters
    return text.lower().strip()

# Extract features
def extract_features(df):
    """Add hand-crafted feature columns to ``df`` and return it.

    ``df`` must have string columns 'title' and 'text'. The frame is modified
    in place (and returned for convenience) with these new columns:

    - 'title_length', 'text_length': whitespace-separated token counts.
    - 'keyword_density': fraction of text tokens that are one of the
      keywords 'shocking', 'breaking', 'exclusive' (0.0 for empty text).
    - 'title_sentiment', 'text_sentiment': VADER 'compound' polarity score.
    - 'cleaned_text': the text with English stop words removed.
    """
    sia = SentimentIntensityAnalyzer()
    stop_words = set(stopwords.words('english'))
    # Built once (not per row) and as a set for constant-time membership tests.
    keywords = {'shocking', 'breaking', 'exclusive'}

    # Title length and text length
    df['title_length'] = df['title'].apply(lambda x: len(x.split()))
    df['text_length'] = df['text'].apply(lambda x: len(x.split()))

    # Keyword density
    def keyword_density(text):
        text_tokens = text.split()
        # Text that is empty (e.g. only digits/punctuation before cleaning)
        # has no tokens; report zero density instead of dividing by zero.
        if not text_tokens:
            return 0.0
        return sum(1 for word in text_tokens if word in keywords) / len(text_tokens)

    df['keyword_density'] = df['text'].apply(keyword_density)

    # Sentiment analysis
    df['title_sentiment'] = df['title'].apply(lambda x: sia.polarity_scores(x)['compound'])
    df['text_sentiment'] = df['text'].apply(lambda x: sia.polarity_scores(x)['compound'])

    # Removing stopwords
    def remove_stopwords(text):
        return ' '.join(word for word in text.split() if word not in stop_words)

    df['cleaned_text'] = df['text'].apply(remove_stopwords)

    return df

# Prepare data for model training
def prepare_data(df, vectorizer=None):
    """Build the feature matrix (TF-IDF + numeric features) and the target.

    Args:
        df: DataFrame containing 'cleaned_text' and the numeric columns
            'title_length', 'text_length', 'keyword_density',
            'title_sentiment' and 'text_sentiment'.
        vectorizer: Optional TF-IDF vectorizer. With the default ``None`` a
            new ``TfidfVectorizer(max_features=5000)`` is fitted on ``df``
            (the original behaviour). An unfitted instance is fitted in
            place, so the caller keeps the fitted object; an already fitted
            instance is only applied with ``transform``, which keeps the
            feature columns identical to those used at training time.

    Returns:
        ``(X, y)`` where ``X`` is a dense array with the TF-IDF columns
        followed by the five numeric columns, and ``y`` is ``df['label']``,
        or ``None`` when ``df`` has no 'label' column (unlabeled data).
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=5000)

    # A fitted vectorizer exposes `vocabulary_`; re-fitting it here would
    # silently change the meaning of every TF-IDF column.
    if hasattr(vectorizer, 'vocabulary_'):
        text_matrix = vectorizer.transform(df['cleaned_text'])
    else:
        text_matrix = vectorizer.fit_transform(df['cleaned_text'])
    X_text_features = text_matrix.toarray()

    # Combine other features
    X_other_features = df[['title_length', 'text_length', 'keyword_density', 'title_sentiment', 'text_sentiment']].values
    X = np.hstack([X_text_features, X_other_features])

    y = df['label'] if 'label' in df.columns else None

    return X, y

# Train model
def train_model(X, y):
    """Fit a random forest on an 80/20 split of ``(X, y)`` and score it.

    The split and the forest both use ``random_state=42``. Accuracy,
    precision, recall, F1 and ROC AUC are computed on the held-out 20%,
    printed with two decimals, and returned alongside the fitted model.

    Returns:
        ``(model, metrics)`` where ``metrics`` maps metric name to score.
    """
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(features_fit, labels_fit)

    predicted_labels = model.predict(features_eval)
    # Column 1 of predict_proba is the score of the second class.
    positive_scores = model.predict_proba(features_eval)[:, 1]

    # Scorers that compare hard predictions with the true labels.
    label_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    metrics = {
        name: scorer(labels_eval, predicted_labels)
        for name, scorer in label_scorers
    }
    # ROC AUC is computed from the class scores rather than the hard labels.
    metrics['roc_auc'] = roc_auc_score(labels_eval, positive_scores)

    print("Evaluation Metrics:")
    for metric in metrics:
        value = metrics[metric]
        print(f"{metric.capitalize()}: {value:.2f}")

    return model, metrics

# Save results
def save_results(test_data, predictions, filepath):
    """Write each title next to its predicted label as a CSV file.

    The output has two columns, 'title' and 'predicted_label', and no index
    column.
    """
    columns = {
        'title': test_data['title'],
        'predicted_label': predictions,
    }
    pd.DataFrame(columns).to_csv(filepath, index=False)

if __name__ == "__main__":
    train_dataset_path = "/content/train.tsv"  # Path to training dataset
    test_dataset_path = "/content/test.tsv"    # Path to test dataset
    output_path = "results.csv"                # File to save test predictions

    # Load and preprocess training dataset
    train_data = load_dataset(train_dataset_path)
    train_data['title'] = train_data['title'].apply(clean_text)
    train_data['text'] = train_data['text'].apply(clean_text)
    train_data = extract_features(train_data)

    # Prepare and train the model
    X_train, y_train = prepare_data(train_data)
    model, metrics = train_model(X_train, y_train)

    # NOTE: train_model scores on the 20% hold-out split it makes internally,
    # so these are validation scores rather than scores on the training rows.
    print("Training Metrics:", metrics)

    # --- Test Data Handling ---

    # Load and preprocess test dataset
    test_data = load_dataset(test_dataset_path)
    test_data['title'] = test_data['title'].apply(clean_text)
    test_data['text'] = test_data['text'].apply(clean_text)
    test_data = extract_features(test_data)

    # Prepare test data for predictions.
    # The test text must be projected onto the vocabulary and IDF weights
    # learned from the TRAINING corpus; fitting a new vectorizer on the test
    # text would produce columns that mean different words (and possibly a
    # different number of columns) than the model was trained on.
    # prepare_data(train_data) above fits its vectorizer internally and does
    # not return it, so an identical one is re-fitted here on the same
    # training text (fitting is deterministic for identical input).
    tfidf = TfidfVectorizer(max_features=5000)
    tfidf.fit(train_data['cleaned_text'])
    X_test_text_features = tfidf.transform(test_data['cleaned_text']).toarray()
    X_test_other_features = test_data[['title_length', 'text_length', 'keyword_density',
                                       'title_sentiment', 'text_sentiment']].values
    X_test = np.hstack([X_test_text_features, X_test_other_features])

    # Predict on test data
    predictions = model.predict(X_test)

    # Save predictions to a CSV file
    save_results(test_data, predictions, output_path)

    print(f"Predictions saved to {output_path}")


def load_dataset(filepath):
    """Load a tab-separated dataset, normalising the 'label' column if present.

    Malformed rows are skipped while reading. When the file has a 'label'
    column, rows with a missing label are dropped and the labels are cast to
    ``int``. Files without a 'label' column (e.g. unlabeled test data) are
    returned as read instead of raising ``KeyError``.

    Returns:
        The loaded DataFrame, or ``None`` if pandas raises ``ParserError``
        (the error is printed). Callers must check for ``None``.
    """
    # Keep the try body to the one call that can raise ParserError.
    try:
        df = pd.read_csv(filepath, sep='\t', on_bad_lines='skip')
    except pd.errors.ParserError as e:
        print("ParserError encountered:", e)
        return None

    if 'label' in df.columns:
        df = df.dropna(subset=['label'])  # Drop rows with missing labels
        # assign() builds a new frame, so no chained-assignment warning is
        # raised on the dropna() result.
        df = df.assign(label=df['label'].astype(int))  # Ensure labels are integers
    return df

# Ensure no NaN values in the target column during preprocessing.
# Guarded because `train_dataset_path` is only assigned under the __main__
# guard; running this unguarded on import would raise NameError.
if __name__ == "__main__":
    train_data = load_dataset(train_dataset_path)
    # load_dataset returns None when the file could not be parsed, so check
    # before indexing into the result.
    if train_data is not None and train_data['label'].isna().any():
        print("Warning: Found missing values in 'label'. Dropping such rows.")
        train_data = train_data.dropna(subset=['label'])



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Evaluation Metrics:
Accuracy: 0.99
Precision: 0.99
Recall: 0.99
F1_score: 0.99
Roc_auc: 1.00
Training Metrics: {'accuracy': 0.9886666666666667, 'precision': 0.9901788846018941, 'recall': 0.986028641285365, 'f1_score': 0.9880994049702485, 'roc_auc': 0.9992637980250146}
Predictions saved to results.csv
