In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack


# Hand-crafted text statistics derived from the lemmatized text.
# Each document is split on whitespace once; the word-level features reuse
# those tokens instead of re-splitting the string for every feature.
tokens_per_doc = data['lemmatized_text'].map(lambda text: text.split())


def _mean_word_length(words):
    """Return the mean token length, or 0 when the document has no tokens."""
    if len(words) > 0:
        return np.mean([len(word) for word in words])
    return 0


# 1. Text Length (characters in the lemmatized string)
data['text_length'] = data['lemmatized_text'].map(len)

# 2. Word Count (whitespace-separated tokens)
data['word_count'] = tokens_per_doc.map(len)

# 3. Stopword Count (tokens that appear in `stop_words`)
data['stopword_count'] = tokens_per_doc.map(
    lambda words: sum(1 for word in words if word in stop_words)
)

# 4. Average Word Length (0 for documents without tokens)
data['avg_word_length'] = tokens_per_doc.map(_mean_word_length)

# Prepare features and target
X = data['lemmatized_text']
X_features = data[['text_length', 'word_count', 'stopword_count', 'avg_word_length']]
y = data['target']  # 'target' column has sentiment labels (1 for positive, 0 for negative)

# Split the dataset BEFORE fitting any preprocessing, so that no statistic
# computed from test rows can leak into the training features.
# test_size / random_state / stratify are unchanged, so the same rows land in
# the train and test partitions as before.
(
    X_train,
    X_test,
    X_features_train_raw,
    X_features_test_raw,
    y_train,
    y_test,
) = train_test_split(X, X_features, y, test_size=0.2, random_state=42, stratify=y)

# Standardize numerical features: learn mean/variance on the training rows
# only, then apply that same transformation to the test rows.
scaler = StandardScaler()
X_features_train = scaler.fit_transform(X_features_train_raw)
X_features_test = scaler.transform(X_features_test_raw)
# Full-dataset scaled matrix, kept only in case code outside this section
# still reads this name; it now uses the train-fitted scaler.
X_features_scaled = scaler.transform(X_features)

# Convert text data to numerical vectors using TF-IDF (unigrams + bigrams,
# vocabulary capped at 5000 terms); fitted on the training text only.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Combine TF-IDF features with engineered features.
# Request CSR explicitly: hstack otherwise returns COO, which does not support
# row slicing and gets converted again by downstream estimators.
X_train_combined = hstack([X_train_tfidf, X_features_train], format='csr')
X_test_combined = hstack([X_test_tfidf, X_features_test], format='csr')

# Baseline: fit a Logistic Regression model on the combined
# TF-IDF + engineered feature matrix.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_combined, y_train)

# Hard label predictions for the held-out test set
y_pred = log_reg.predict(X_test_combined)

# Report each evaluation metric in turn
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", baseline_accuracy)

baseline_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", baseline_confusion)

baseline_report = classification_report(y_test, y_pred)
print("Classification Report:\n", baseline_report)

# ROC AUC is computed from scores rather than labels: column 1 of
# predict_proba is the probability of the class listed second in
# `log_reg.classes_`.
baseline_scores = log_reg.predict_proba(X_test_combined)[:, 1]
print("AUC Score:", roc_auc_score(y_test, baseline_scores))

# Hyperparameter Tuning: 5-fold cross-validated grid search over C, the
# inverse regularization strength of LogisticRegression.
# NOTE(review): the TF-IDF vectorizer was fitted on the whole training split
# before this search, so the CV folds share its vocabulary/IDF statistics;
# wrapping preprocessing + model in a Pipeline would avoid that -- confirm
# whether it matters for this experiment.
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train_combined, y_train)

# Best model from GridSearchCV (refitted on the full training matrix by default)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out test set
y_pred_best = best_model.predict(X_test_combined)
best_accuracy = accuracy_score(y_test, y_pred_best)
print("Best Model Accuracy:", best_accuracy)
