In [2]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Make the sibling ``src`` directory importable so ``data_loader`` can be found.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same directory to ``sys.path`` again.
src_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "src"))
if src_dir not in sys.path:
    sys.path.append(src_dir)

from data_loader import load_data

# ``load_data`` is a project helper defined in src/data_loader.py (not shown
# here); how it resolves these file names is decided there.
train_data = load_data("train.csv")
test_data = load_data("test.csv")

In [None]:
# Inputs come from the 'review' column, targets from the 'sentiment' column.
X, y = train_data['review'], train_data['sentiment']

# Hold out 20% of the rows for validation. The split is stratified on the
# target and seeded so that it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Learn the TF-IDF vocabulary (capped at 5000 features) from the training
# split only, then apply the already-fitted vectorizer to the validation split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Quick sanity check of the learned vocabulary and the resulting matrix sizes.
print("TF-IDF Feature Names:", vectorizer.get_feature_names_out())
for label, matrix in (
    ("Train Data Shape:", X_train_tfidf),
    ("Validation Data Shape:", X_val_tfidf),
):
    print(label, matrix.shape)

TF-IDF Feature Names: ['00' '000' '10' ... 'zombie' 'zombies' 'zone']
Train Data Shape: (32000, 5000)
Validation Data Shape: (8000, 5000)


Now I will train and compare 3 different models:
* Logistic Regression
* Random Forest
* Support Vector Machine

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and score it on a held-out split.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``(X_fit, y_fit)``.
    X_fit, y_fit : training features and labels.
    X_eval, y_eval : evaluation features and labels.

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` where ``predictions`` is the output of
        ``model.predict(X_eval)`` and ``accuracy`` is
        ``accuracy_score(y_eval, predictions)``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return predictions, accuracy_score(y_eval, predictions)


# Logistic Regression Model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg_preds, log_reg_acc = _fit_and_score(
    log_reg, X_train_tfidf, y_train, X_val_tfidf, y_val
)
print(f"Logistic Regression Accuracy: {log_reg_acc:.4f}")

# Random Forest Model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_preds, rf_acc = _fit_and_score(
    rf, X_train_tfidf, y_train, X_val_tfidf, y_val
)
print(f"Random Forest Accuracy: {rf_acc:.4f}")

# Support Vector Machine Model
svm = SVC(kernel='linear', random_state=42)
svm_preds, svm_acc = _fit_and_score(
    svm, X_train_tfidf, y_train, X_val_tfidf, y_val
)
print(f"SVM Accuracy: {svm_acc:.4f}")


Logistic Regression Accuracy: 0.8888
Random Forest Accuracy: 0.8427
SVM Accuracy: 0.8900


Random Forest did not reach the minimum required accuracy, so I am going to use Grid Search to look for better hyperparameters.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Search space: 3 * 3 * 2 * 3 = 54 hyper-parameter combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10]
}

# Base estimator for the search; only the seed is fixed here, everything else
# comes from ``param_grid``.
rf = RandomForestClassifier(random_state=42)

# 3-fold cross-validated exhaustive search over the grid, using all cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_tfidf, y_train)

# Score the best estimator found by the search on the validation split.
best_rf = grid_search.best_estimator_
rf_preds = best_rf.predict(X_val_tfidf)
rf_best_acc = accuracy_score(y_val, rf_preds)

print(f"Optimized Random Forest Accuracy: {rf_best_acc:.4f}")
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Optimized Random Forest Accuracy: 0.8601
Best Parameters: {'max_depth': None, 'max_features': 'log2', 'min_samples_split': 10, 'n_estimators': 300}


Now we can apply the optimized parameters.

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Rebuild the Random Forest with the hyper-parameters reported by the grid
# search. They are hard-coded so this cell does not need the search to be
# re-run. ``fit`` returns the estimator itself, so the call can be chained.
rf_optimized = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    min_samples_split=10,
    max_features='log2',
    max_depth=None,
).fit(X_train_tfidf, y_train)

# Evaluate on the validation split.
rf_optimized_preds = rf_optimized.predict(X_val_tfidf)
rf_optimized_acc = accuracy_score(y_val, rf_optimized_preds)

print(f"Optimized Random Forest Accuracy: {rf_optimized_acc:.4f}")

Optimized Random Forest Accuracy: 0.8601


According to the validation results, the best model is the SVM (0.8900 accuracy), so we choose the SVM as the final model.<br>
Let's save the final model and use it to make predictions on the test data:

In [None]:
import joblib

# Create the target directory first: ``joblib.dump`` does not create missing
# parent directories and would fail on a fresh checkout.
models_dir = os.path.join(os.getcwd(), "..", "outputs", "models")
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, "final_svm_model.pkl")
joblib.dump(svm, model_path)
print(f"Final SVM model saved at: {model_path}")

# The SVM was trained on the output of ``vectorizer``, so the fitted
# vectorizer must be saved alongside it; without it the saved model cannot be
# applied to raw review text in a new session.
vectorizer_path = os.path.join(models_dir, "tfidf_vectorizer.pkl")
joblib.dump(vectorizer, vectorizer_path)
print(f"TF-IDF vectorizer saved at: {vectorizer_path}")

Final SVM model saved at: c:\Users\Serhet\Desktop\data-science-task\notebooks\..\outputs\models\final_svm_model.pkl


In [None]:
# Vectorize the test reviews with the vectorizer fitted on the training split
# (transform only, no refit), then predict with the final SVM.
X_test = test_data['review']
X_test_tfidf = vectorizer.transform(X_test)
test_preds = svm.predict(X_test_tfidf)

# One row per test review, paired with its predicted label.
submission = pd.DataFrame({
    'review': X_test,
    'predicted_sentiment': test_preds
})

# Make sure the output directory exists; ``to_csv`` does not create missing
# parent directories.
predictions_dir = os.path.join(os.getcwd(), "..", "outputs", "predictions")
os.makedirs(predictions_dir, exist_ok=True)

submission_path = os.path.join(predictions_dir, "svm_test_predictions.csv")
submission.to_csv(submission_path, index=False)
print(f"Predictions saved at: {submission_path}")

Predictions saved at: c:\Users\Serhet\Desktop\data-science-task\notebooks\..\outputs\predictions\svm_test_predictions.csv
