In [2]:
import numpy as np
import pandas as pd
import joblib
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz, load_npz
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from scipy.sparse import hstack




In [None]:
# Build the model inputs: TF-IDF text features plus two hand-crafted columns
# (VADER compound sentiment and character length), then cache them on disk.


def _add_text_features(frame, sentiment_analyzer):
    """Return a copy of ``frame`` with the two hand-crafted feature columns added.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``text`` column; every entry is passed to ``len`` and to
        ``sentiment_analyzer.polarity_scores``.
    sentiment_analyzer : SentimentIntensityAnalyzer
        Analyzer whose ``polarity_scores(text)['compound']`` value is stored.

    Returns
    -------
    pandas.DataFrame
        ``frame`` plus ``vader_score`` and ``tweet_length`` columns, in that order.
    """
    text = frame['text']
    return frame.assign(
        vader_score=text.apply(lambda doc: sentiment_analyzer.polarity_scores(doc)['compound']),
        tweet_length=text.apply(len),
    )


# Load preprocessed data.
# SECURITY: allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code. Only point these paths at files produced by this project.
X_train = np.load('../data/processed/X_train.npy', allow_pickle=True)
X_test = np.load('../data/processed/X_test.npy', allow_pickle=True)

# Convert to pandas DataFrames with a single 'text' column
train_df = pd.DataFrame(X_train, columns=['text'])
test_df = pd.DataFrame(X_test, columns=['text'])

analyzer = SentimentIntensityAnalyzer()

# Add VADER sentiment scores and tweet length; one helper keeps the train and
# test splits on exactly the same feature definition.
train_df = _add_text_features(train_df, analyzer)
test_df = _add_text_features(test_df, analyzer)

# Vectorize text: the vectorizer is fit on the training split only and then
# reused unchanged for the test split. It is persisted for later reuse.
vectorizer = TfidfVectorizer(max_features=10000)
train_tfidf = vectorizer.fit_transform(train_df['text'])
test_tfidf = vectorizer.transform(test_df['text'])
joblib.dump(vectorizer, '../models/vectorizer.pkl')

# Extract VADER sentiment scores and tweet length as dense additional features
EXTRA_FEATURE_COLUMNS = ['vader_score', 'tweet_length']
train_additional_features = train_df[EXTRA_FEATURE_COLUMNS].values
test_additional_features = test_df[EXTRA_FEATURE_COLUMNS].values

# Combine TF-IDF features with additional features (extra columns go last).
# format='csr' avoids the default COO result, which cannot be row-sliced and
# would be converted again by every estimator that consumes it.
# NOTE(review): tweet_length is appended unscaled next to the TF-IDF columns;
# consider scaling it (and persisting the scaler) -- confirm impact on the model.
train_feat_vect = hstack([train_tfidf, train_additional_features], format='csr')
test_feat_vect = hstack([test_tfidf, test_additional_features], format='csr')

# Cache the combined matrices so the training cell can reload them from disk
save_npz('../data/processed/train_feat_vect.npz', train_feat_vect)
save_npz('../data/processed/test_feat_vect.npz', test_feat_vect)


In [None]:
# Logistic Regression on the combined feature matrix (TF-IDF columns plus the
# VADER sentiment score and tweet length columns).

# Training split: cached features and their labels.
# NOTE(review): despite the `_df` suffix, `train_df` / `test_df` are bound to
# SciPy sparse matrices here (the result of load_npz), not pandas DataFrames.
train_df = load_npz('../data/processed/train_feat_vect.npz')
y_train = np.load('../data/processed/y_train.npy')

# Test split: cached features and their labels.
test_df = load_npz('../data/processed/test_feat_vect.npz')
y_test = np.load('../data/processed/y_test.npy')

# Fit the classifier; `fit` returns the estimator itself, so the chained call
# leaves the fitted model bound to `log_reg`.
log_reg = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced',
).fit(train_df, y_train)

# Predict on the test set
y_pred_log_reg = log_reg.predict(test_df)

# Report accuracy first, then each multi-line metric under its own heading.
print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log_reg)}")
for heading, metric_fn in (
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric_fn(y_test, y_pred_log_reg))

In [41]:

# Serialize the trained classifier to disk. The dump call stays the cell's last
# expression so its return value is still echoed as the cell output.
model_path = '../models/logistic_regression_model.pkl'
joblib.dump(log_reg, model_path)

['../models/logistic_regression_model.pkl']