# Sentiment Analysis with Multiple Models

This Jupyter notebook demonstrates sentiment analysis using various machine learning models, including Support Vector Machine (SVM), Logistic Regression, Random Forest, and XGBoost. The notebook also includes steps for hyperparameter tuning, model training, and evaluation.

In [1]:
!pip install nltk scikit-learn pandas
!pip install spacy
!python -m spacy download en_core_web_sm

In [2]:
# Step 1: Import necessary libraries
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [3]:
# Load data
# Example data load - replace with actual sentiment data
data = pd.read_csv('sentiment_data.csv')  # Replace with your dataset path

# Fail fast with a clear message if the columns read by the preprocessing and
# label-encoding steps are absent, instead of a KeyError after slow NLP work.
missing_columns = {'text', 'sentiment'} - set(data.columns)
if missing_columns:
    raise ValueError(
        f"sentiment_data.csv is missing required column(s): {sorted(missing_columns)}"
    )

print(data.head())

# Data preprocessing
# The preprocessing in this cell only reads lemmas plus the stop-word and
# punctuation flags, so the dependency parser and named-entity recognizer are
# disabled to speed up processing.
# NOTE(review): re-enable them if another cell needs parses or entities.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def preprocess_text(text):
    """Lemmatize a document and drop stop words and punctuation.

    Parameters
    ----------
    text : str or scalar
        Raw document. Missing values (None, NaN, pd.NA) are treated as an
        empty document; other non-string scalars are converted with ``str``.

    Returns
    -------
    str
        Space-joined lemmas of the tokens that are neither stop words nor
        punctuation. Empty string for missing or empty input.
    """
    if not isinstance(text, str):
        # pd.read_csv yields float NaN for empty cells; spaCy rejects non-strings.
        if pd.isna(text):
            return ''
        text = str(text)
    if not text:
        # Nothing to tokenize; skip the pipeline call entirely.
        return ''
    doc = nlp(text)
    return ' '.join(
        token.lemma_ for token in doc if not (token.is_stop or token.is_punct)
    )

# Lemmatize and filter every document; the result is stored in a new column so
# the raw 'text' column stays available.
data['processed_text'] = data['text'].apply(preprocess_text)

# Encode labels
# The fitted encoder is kept so integer predictions can be mapped back to the
# original sentiment names with label_encoder.inverse_transform.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['sentiment'])

# Split data into train and test sets
# stratify=y keeps the class proportions the same in both splits, which matters
# when labels are imbalanced. Note that stratification requires at least two
# samples per class.
X = data['processed_text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Convert text to TF-IDF features
# The vocabulary and IDF weights are fitted on the training split only and then
# applied to the test split, so no test-set information leaks into the features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [4]:
# Train and evaluate baseline models
#
# Each model is fitted on the TF-IDF training matrix with default
# hyperparameters, then scored on the held-out test matrix. Seeds are fixed for
# the estimators that use randomness so the reported numbers are reproducible
# after Restart & Run All.

logreg = LogisticRegression()                     # 1. Logistic Regression
rf = RandomForestClassifier(random_state=42)      # 2. Random Forest
xgb_model = xgb.XGBClassifier(random_state=42)    # 3. XGBoost

baseline_predictions = {}
for model_name, model in [
    ("Logistic Regression", logreg),
    ("Random Forest", rf),
    ("XGBoost", xgb_model),
]:
    model.fit(X_train_tfidf, y_train)
    predictions = model.predict(X_test_tfidf)
    baseline_predictions[model_name] = predictions
    print(f"{model_name} Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))

# Keep the original per-model prediction names available for later cells.
y_pred_logreg = baseline_predictions["Logistic Regression"]
y_pred_rf = baseline_predictions["Random Forest"]
y_pred_xgb = baseline_predictions["XGBoost"]

In [5]:
# Hyperparameter Tuning for Logistic Regression
# Grid: regularization strength C, two solvers, and two iteration budgets.
param_grid_logreg = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 1000]
}

# Both 'liblinear' and 'saga' shuffle the data internally, so the estimator is
# seeded to make the search (and the selected model) reproducible.
# GridSearchCV uses 3-fold cross-validation on the training split only and
# refits the best configuration on the full training split.
logreg_grid = GridSearchCV(
    LogisticRegression(random_state=42), param_grid_logreg, cv=3, n_jobs=-1
)
logreg_grid.fit(X_train_tfidf, y_train)
best_logreg_model = logreg_grid.best_estimator_

# Evaluate Logistic Regression model
# NOTE: this overwrites the baseline y_pred_logreg with the tuned model's predictions.
y_pred_logreg = best_logreg_model.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

# ------------------------------------------
# Random Forest Hyperparameter Tuning
# ------------------------------------------
# Grid: forest size, tree depth cap (None = grow until leaves are pure), and
# whether classes are reweighted inversely to their frequency.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'class_weight': ['balanced', None]
}

# Random forests bootstrap rows and subsample features, so the estimator is
# seeded; otherwise the selected hyperparameters can change between runs.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42), param_grid_rf, cv=3, n_jobs=-1
)
rf_grid.fit(X_train_tfidf, y_train)
best_rf_model = rf_grid.best_estimator_

# Evaluate Random Forest model
# NOTE: this overwrites the baseline y_pred_rf with the tuned model's predictions.
y_pred_rf = best_rf_model.predict(X_test_tfidf)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# ------------------------------------------
# XGBoost Hyperparameter Tuning
# ------------------------------------------
# NOTE: this grid has 3**6 = 729 combinations; with cv=3 that is 2187 fits plus
# one final refit, so expect this cell to be slow.
param_grid_xgb = {
    'max_depth': [3, 5, 7],                # Try different max_depth to prevent overfitting
    'learning_rate': [0.01, 0.05, 0.1],    # Lower learning rate to prevent overfitting
    'n_estimators': [100, 200, 300],       # Try different numbers of trees
    'subsample': [0.7, 0.8, 1.0],          # Fraction of training rows sampled for each tree
    'colsample_bytree': [0.7, 0.8, 1.0],   # Fraction of features sampled once per tree
    # Weight applied to the positive class to counter class imbalance.
    # NOTE(review): this is a binary-classification setting and the values are
    # hard-coded; confirm they match the negative/positive ratio of y_train.
    'scale_pos_weight': [20, 30, 40]
}

# random_state makes the row/feature subsampling reproducible. n_jobs=1 keeps
# each XGBoost fit single-threaded so it does not compete for cores with the
# process-level parallelism of GridSearchCV(n_jobs=-1).
xgb_grid = GridSearchCV(
    xgb.XGBClassifier(random_state=42, n_jobs=1), param_grid_xgb, cv=3, n_jobs=-1
)
xgb_grid.fit(X_train_tfidf, y_train)
best_xgb_model = xgb_grid.best_estimator_

# Evaluate XGBoost model
# NOTE: this overwrites the baseline y_pred_xgb with the tuned model's predictions.
y_pred_xgb = best_xgb_model.predict(X_test_tfidf)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))