### Logistic Regression prediction steps
* Train a sentiment classification model using the labelled_df.
* Use the trained model to predict sentiment on the unlabelled_df.
* Validate the predictions against the known sentiments in the labelled_df.

## Logistic Regression

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



In [2]:
import pandas as pd
import re

file_path = r"C:\Users\sarth\Downloads\dataset-journal-entries.csv"
unlabelled_df = pd.read_csv(file_path)
unlabelled_df.head()

Unnamed: 0,Journal Notes
0,The asset is trading below its 200-day moving ...
1,The asset is trading below its 200-day moving ...
2,The asset is forming a symmetrical pattern (tr...
3,Bullish Channel: The price has been trading wi...
4,Sellers are dominating at key resistance level...


In [3]:
path = r"C:\Users\sarth\Downloads\sentiment-test.csv"
labelled_df = pd.read_csv(path)

labelled_df.head()

Unnamed: 0,Journal Notes,Sentiment
0,Wave C is supposed to end at a specific zone. ...,Positive
1,The asset is forming a symmetrical pattern (tr...,Positive
2,"From the Elliott Wave perspective, we have com...",Positive
3,The asset presents a potential buying opportun...,Positive
4,Bullish Channel: The price has been trading wi...,Positive


In [8]:

# Split the labelled data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(labelled_df['Journal Notes'], labelled_df['Sentiment'], test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)

# Predict sentiment on unlabelled data
unlabelled_X_vectorized = vectorizer.transform(unlabelled_df['Journal Notes'])
unlabelled_df['Predicted Sentiment'] = model.predict(unlabelled_X_vectorized)

# Validate predictions against labelled data
accuracy = accuracy_score(y_val, model.predict(X_val_vectorized))
print("Validation Accuracy:", accuracy)

print("Predicted Sentiments on Unlabelled Data:")
print(unlabelled_df[['Journal Notes', 'Predicted Sentiment']])

Validation Accuracy: 0.7333333333333333
Predicted Sentiments on Unlabelled Data:
                                           Journal Notes Predicted Sentiment
0      The asset is trading below its 200-day moving ...            Negative
1      The asset is trading below its 200-day moving ...            Negative
2      The asset is forming a symmetrical pattern (tr...            Positive
3      Bullish Channel: The price has been trading wi...            Positive
4      Sellers are dominating at key resistance level...            Negative
...                                                  ...                 ...
35240  The asset is range-bound, with support and res...             Neutral
35241  RSI is trending upwards, indicating increasing...            Positive
35242  The asset is trading near its moving averages,...             Neutral
35243  The asset has broken above a significant resis...            Positive
35244  Volume is increasing as price moves higher, co...            Posi

## NLTK VADER

In [10]:
import pandas as pd
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer



# Preprocess text data
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Removing non-word characters
    text = re.sub(r'\s+', ' ', text)  # Removing extra whitespaces
    return text.lower().strip()

labelled_df['Journal Notes'] = labelled_df['Journal Notes'].apply(preprocess_text)
unlabelled_df['Journal Notes'] = unlabelled_df['Journal Notes'].apply(preprocess_text)

# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Predict sentiment on unlabelled data
unlabelled_df['VADER Sentiment'] = unlabelled_df['Journal Notes'].apply(lambda x: 'Positive' if sid.polarity_scores(x)['compound'] > 0 else ('Negative' if sid.polarity_scores(x)['compound'] < 0 else 'Neutral'))

# Display predictions
print("Predicted Sentiments on Unlabelled Data:")
print(unlabelled_df[['Journal Notes', 'VADER Sentiment']])


Predicted Sentiments on Unlabelled Data:
                                           Journal Notes VADER Sentiment
0      the asset is trading below its 200 day moving ...        Positive
1      the asset is trading below its 200 day moving ...        Negative
2      the asset is forming a symmetrical pattern tri...        Positive
3      bullish channel the price has been trading wit...        Positive
4      sellers are dominating at key resistance level...        Negative
...                                                  ...             ...
35240  the asset is range bound with support and resi...        Positive
35241  rsi is trending upwards indicating increasing ...        Positive
35242  the asset is trading near its moving averages ...        Positive
35243  the asset has broken above a significant resis...        Positive
35244  volume is increasing as price moves higher con...        Positive

[35245 rows x 2 columns]


In [16]:

# Preprocess the text data
labelled_df['Journal Notes'] = labelled_df['Journal Notes'].apply(preprocess_text)

# Predicted sentiments from VADER
predicted_sentiments = unlabelled_df.iloc[:5]['VADER Sentiment']  # Take the first 5 predictions to match labelled data

# Actual sentiments from labelled data
actual_sentiments = labelled_df['Sentiment']

# Calculate accuracy
accuracy = accuracy_score(actual_sentiments, predicted_sentiments)
print("Validation Accuracy:", accuracy)


Validation Accuracy: 0.6
