In [175]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def preprocess_text(text):
    """Cleans and preprocesses text by removing special characters and converting to lowercase."""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.split()
    return " ".join(words)

def load_and_preprocess_data(file_path):
    """Loads the dataset, preprocesses the text, and returns train-test splits."""
    df = pd.read_csv(file_path, encoding="ISO-8859-1")
    
    # Combine all headline columns into one
    df["Combined_Headlines"] = df.iloc[:, 2:].astype(str).agg(" ".join, axis=1)
    
    # Apply text preprocessing
    df["Processed_Text"] = df["Combined_Headlines"].apply(preprocess_text)
    
    # Split dataset into training and testing based on date
    df_train = df[df["Date"] < "20150101"]
    df_test = df[df["Date"] >= "20141231"]
    
    X_train, y_train = df_train["Processed_Text"], df_train["Label"]
    X_test, y_test = df_test["Processed_Text"], df_test["Label"]
    
    return X_train, X_test, y_train, y_test

def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Trains a RandomForest model and evaluates its performance."""
    vectorizer = CountVectorizer(ngram_range=(2,2))
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    
    clf = RandomForestClassifier(n_estimators=200,criterion='entropy')
    clf.fit(X_train_tfidf, y_train)
    
    y_pred = clf.predict(X_test_tfidf)
    
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    
if __name__ == "__main__":
    file_path = r'E:\stock_sentiment_analysis\Data.csv'  # Change this if your dataset has a different name
    X_train, X_test, y_train, y_test = load_and_preprocess_data(file_path)
    train_and_evaluate(X_train, X_test, y_train, y_test)


Accuracy: 0.8544973544973545
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.73      0.83       186
           1       0.79      0.97      0.87       192

    accuracy                           0.85       378
   macro avg       0.88      0.85      0.85       378
weighted avg       0.88      0.85      0.85       378

Confusion Matrix:
 [[136  50]
 [  5 187]]
