In [4]:
import re
import pandas as pd
import string
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Download the NLTK resources this notebook relies on: the Punkt tokenizer
# tables (used by nltk.word_tokenize), the English stop-word list, and WordNet
# (used by WordNetLemmatizer).
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset and drop exact duplicate rows.
# The cells below read the `text` and `label` columns from this frame.
# A forward-slash relative path is used so the notebook runs on Linux/macOS
# as well as Windows (the original backslash path was Windows-only).
df = pd.read_csv('dataset/sentiment_data.csv').drop_duplicates()

# Define text preprocessing helpers
# NOTE(review): `stop_words` is not referenced by any visible cell; stop-word
# removal is currently done by TfidfVectorizer(stop_words='english') instead.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one raw document into a space-separated string of lemmas.

    Steps, in order: lower-case the text, strip URLs (``http`` followed by any
    run of non-whitespace), delete every character that is not an ASCII letter
    or whitespace, tokenize with ``nltk.word_tokenize``, drop tokens of one or
    two characters, and lemmatize the rest with the module-level
    ``lemmatizer``.

    Args:
        text: The raw document. Non-string values are converted with
            ``str()``; missing scalars (``None``, NaN, ``pd.NA``) are treated
            as an empty document.

    Returns:
        str: The surviving lemmas joined by single spaces, or ``""`` when the
        input is missing or nothing survives the filters.
    """
    # Without this guard str(NaN) becomes the literal token "nan", which is
    # three letters long and would pass every filter below, ending up in the
    # vocabulary as if it were a real word.
    if not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)      # remove URLs before punctuation is stripped
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # keep only letters and whitespace
    tokens = nltk.word_tokenize(text)
    # Very short tokens (<= 2 characters) are discarded before lemmatization.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if len(word) > 2]
    return " ".join(tokens)
# Apply preprocessing (overwrites the raw `text` column with the cleaned text)
df['text'] = df['text'].apply(preprocess_text)

y = df['label']  # 0 = negative, 1 = positive

# Train/test split on the cleaned text.
# The split happens BEFORE vectorization so that the held-out documents do not
# contribute to the TF-IDF vocabulary or IDF weights (fitting the vectorizer on
# the full corpus leaks test-set information into the features).
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, stratify=y, random_state=42
)

# TF-IDF vectorization (fit on the training documents only)
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=3000,   # cap the vocabulary size
    min_df=2,            # ignore terms that occur in fewer than 2 documents
    max_df=0.95,         # ignore terms that occur in more than 95% of documents
    ngram_range=(1, 2)   # unigrams and bigrams
)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus feature matrix, kept so that any later cell referencing `X`
# continues to work. It is not used for training or evaluation here.
X = vectorizer.transform(df['text'])

# Define models
# random_state is fixed on every estimator so repeated runs give the same
# fitted models (liblinear-based solvers shuffle the data internally).
models = {
    "LogisticRegression": LogisticRegression(C=1.0, max_iter=1000, solver='liblinear', random_state=42),
    "SVM": LinearSVC(C=1.0, max_iter=1000, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Train, evaluate and save the best model
best_model = None
# Start below any achievable accuracy so a model is always selected, even when
# every candidate scores 0.0 (starting at 0 would leave best_model as None and
# pickle None to disk).
best_score = -1.0
best_name = ""

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"{name} Accuracy: {acc:.4f}")

    # Strict '>' keeps the earlier-listed model when accuracies tie.
    if acc > best_score:
        best_model = model
        best_score = acc
        best_name = name

# Save the best model and vectorizer
# NOTE: the vectorizer was fit on text already passed through
# preprocess_text, so the same preprocessing must be applied before using
# these artifacts for inference.
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print(f"\nBest model: {best_name} saved with accuracy: {best_score:.4f}")


LogisticRegression Accuracy: 0.2500
SVM Accuracy: 0.2500
RandomForest Accuracy: 0.2500

Best model: LogisticRegression saved with accuracy: 0.2500


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Summary statistics of the number of whitespace-separated tokens per
# (preprocessed) document.
def _token_count(doc):
    """Return how many whitespace-separated tokens ``doc`` contains."""
    return len(doc.split())

tokens_per_doc = df['text'].map(_token_count)
print(tokens_per_doc.describe())

count    20.000000
mean      2.600000
std       1.046297
min       1.000000
25%       2.000000
50%       2.500000
75%       3.250000
max       4.000000
Name: text, dtype: float64
