In [4]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix)
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from joblib import Parallel, delayed
import spacy
from contractions import fix
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Enhanced preprocessing function
def clean_text(text):
    text = fix(text)  # Expand contractions
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove non-alphabetic characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    text = text.lower()

    # Tokenization and lemmatization with SpaCy
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]

    return " ".join(tokens)

# Load and preprocess the dataset
def load_and_preprocess_data(filepath):
    df = pd.read_csv(filepath, encoding='ISO-8859-1')
    df['Text'] = df['Text'].apply(clean_text)
    print(df.head())
    print(df['Sentiment'].value_counts())  # Check class distribution
    return df

# Data splitting
def split_data(df):
    X = df['Text']
    y = df['Sentiment']

    # Vectorize text
    vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 3), min_df=5, max_df=0.8)
    X_vec = vectorizer.fit_transform(X)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, stratify=y, random_state=42)

    return X_train, X_test, y_train, y_test, vectorizer

# Cross-validation with hyperparameter tuning and parallel processing
def cross_validate_model(model, X, y, param_grid=None):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    best_model = None
    best_score = -1

    def train_fold(train_idx, val_idx):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        if param_grid:
            grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_weighted', cv=3, n_jobs=-1, verbose=1)
            grid_search.fit(X_train, y_train)
            return grid_search.best_estimator_, grid_search.best_score_
        else:
            model.fit(X_train, y_train)
            score = model.score(X_val, y_val)
            return model, score

    results = Parallel(n_jobs=-1)(delayed(train_fold)(train_idx, val_idx) for train_idx, val_idx in skf.split(X, y))

    for trained_model, score in results:
        if score > best_score:
            best_model = trained_model
            best_score = score

    return best_model

# Train and evaluate models
def train_and_evaluate(models, X_train, X_test, y_train, y_test):
    results = {}

    for name, model_info in models.items():
        model, param_grid = model_info
        try:
            model = cross_validate_model(model, X_train, y_train, param_grid)
            predictions = model.predict(X_test)

            acc = accuracy_score(y_test, predictions)
            report = classification_report(y_test, predictions, zero_division=1, output_dict=True)
            cm = confusion_matrix(y_test, predictions)

            results[name] = {
                'accuracy': acc,
                'classification_report': report,
                'confusion_matrix': cm,
                'model': model
            }
        except Exception as e:
            results[name] = {'error': str(e)}
            print(f"Error training {name}: {e}")

    return results

# Compare models
def compare_models(results):
    metrics = []

    for model_name, result in results.items():
        if 'classification_report' not in result:
            print(f"Skipping {model_name} due to missing classification report.")
            continue

        report = result['classification_report']
        metrics.append({
            'Model': model_name,
            'Accuracy': result['accuracy'],
            'Precision': report['weighted avg']['precision'],
            'Recall': report['weighted avg']['recall'],
            'F1-Score': report['weighted avg']['f1-score']
        })

    if not metrics:
        raise ValueError("No valid model metrics available for comparison.")

    metrics_df = pd.DataFrame(metrics).round(4).sort_values('F1-Score', ascending=False)
    best_model_name = metrics_df.iloc[0]['Model']
    best_model_metrics = metrics_df.iloc[0].to_dict()

    return metrics_df, best_model_name, best_model_metrics

# Print confusion matrices
def print_confusion_matrices(results):
    for model_name, result in results.items():
        if 'confusion_matrix' not in result:
            continue

        cm = result['confusion_matrix']
        print(f"\nConfusion Matrix for {model_name}:")
        print("True\\Pred\t", end="")
        for i in range(cm.shape[1]):
            print(f"{i}\t", end="")
        print("\n" + "-" * 40)

        for i in range(cm.shape[0]):
            print(f"{i}\t", end="")
            for j in range(cm.shape[1]):
                print(f"{cm[i,j]}\t", end="")
            print()

# Main script
if __name__ == "__main__":
    filepath = 'reduced_dataset.csv'
    df = load_and_preprocess_data(filepath)

    X_train, X_test, y_train, y_test, vectorizer = split_data(df)

    models = {
        'SVM': (SVC(kernel='linear', probability=True), {'C': [0.1, 1, 10]}),
        'Random Forest': (RandomForestClassifier(n_jobs=-1), {'n_estimators': [100, 200]}),
        'ANN': (MLPClassifier(max_iter=1000), {'hidden_layer_sizes': [(128,), (128, 64)], 'alpha': [0.0001, 0.001], 'learning_rate': ['constant', 'adaptive']}),
        'Naive Bayes': (MultinomialNB(), {'alpha': [0.1, 0.5, 1]}),
        'KNN': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
        'Logistic Regression': (LogisticRegression(max_iter=1000, n_jobs=-1), {'C': [0.01, 0.1, 1]})
    }

    results = train_and_evaluate(models, X_train, X_test, y_train, y_test)

    metrics_df, best_model_name, best_model_metrics = compare_models(results)

    print("\nModel Comparison:")
    print(metrics_df.to_string(index=False))

    print(f"\nBest Model: {best_model_name}")
    print("Best Model Metrics:")
    for metric, value in best_model_metrics.items():
        if metric != 'Model':
            print(f"{metric}: {value:.4f}")

    print("\nConfusion Matrices:")
    print_confusion_matrices(results)


  Sentiment                                               Text
0  positive  like summerslam look arena curtain look overal...
1  positive  television show appeal different kind fan like...
2  negative  film quickly get major chase scene increase de...
3  positive  jane austen definitely approve onebr br gwynet...
4  negative  expectation somewhat high go movie think steve...
Sentiment
positive    20004
negative    19996
Name: count, dtype: int64

Model Comparison:
              Model  Accuracy  Precision  Recall  F1-Score
                SVM    0.8904     0.8907  0.8904    0.8904
Logistic Regression    0.8894     0.8899  0.8894    0.8893
        Naive Bayes    0.8696     0.8702  0.8696    0.8696
                ANN    0.8686     0.8687  0.8686    0.8686
      Random Forest    0.8585     0.8587  0.8585    0.8585
                KNN    0.7596     0.7648  0.7596    0.7584

Best Model: SVM
Best Model Metrics:
Accuracy: 0.8904
Precision: 0.8907
Recall: 0.8904
F1-Score: 0.8904

Confusion Mat

In [5]:
# Save metrics, confusion matrices, hyperparameters, and best model details to a file
output_file = 'model_results_40000.txt'

with open(output_file, 'w') as f:
    # Save model comparison metrics
    f.write("Model Comparison Metrics:\n")
    f.write(metrics_df.to_string(index=False))
    
    # Save best model details
    f.write("\n\nBest Model:\n")
    f.write(f"{best_model_name}\n")
    f.write("Best Model Metrics:\n")
    for metric, value in best_model_metrics.items():
        if metric != 'Model':
            f.write(f"{metric}: {value:.4f}\n")
    
    # Save hyperparameters for all models
    f.write("\n\nHyperparameters for All Models:\n")
    for model_name, result in results.items():
        model = result['model']
        f.write(f"\n{model_name} Hyperparameters:\n")
        if hasattr(model, 'get_params'):  # Check if hyperparameters are available
            params = model.get_params()
            for param, value in params.items():
                f.write(f"{param}: {value}\n")
        else:
            f.write("Hyperparameters not available for this model.\n")
    
    # Save confusion matrices
    f.write("\n\nConfusion Matrices:\n")
    for model_name, result in results.items():
        f.write(f"\nConfusion Matrix for {model_name}:\n")
        cm = result['confusion_matrix']
        f.write("True\\Pred\t" + "\t".join(map(str, range(cm.shape[1]))) + "\n")
        f.write("-" * 40 + "\n")
        for i in range(cm.shape[0]):
            f.write(f"{i}\t" + "\t".join(map(str, cm[i])) + "\n")

print(f"Results saved to '{output_file}'")



Results saved to 'model_results_40000.txt'


In [6]:
import joblib
import numpy as np

In [7]:
# Save the best model and vectorizer
best_model_name = metrics_df.iloc[0]['Model']
best_model = results[best_model_name]['model']  # Retrieve the best model
joblib.dump(best_model, 'best_model_1.joblib')  # Save the best model
joblib.dump(vectorizer, 'vectorizer_1.joblib')  # Save the vectorizer
print(f"Best model and vectorizer saved as 'best_model.joblib' and 'vectorizer.joblib'.")


Best model and vectorizer saved as 'best_model.joblib' and 'vectorizer.joblib'.


In [17]:

# Load the best model and vectorizer
best_model = joblib.load('best_model_1.joblib')  
vectorizer = joblib.load('vectorizer_1.joblib')  


def predict_sentiment(user_input):
    # Preprocess the input text 
    clean_input = clean_text(user_input)
    
    # Transform the input using the saved vectorizer
    input_vec = vectorizer.transform([clean_input])
    
    # Predict sentiment using the loaded model
    prediction = best_model.predict(input_vec)
    
    # Return the predicted sentiment
    return prediction[0]

# User Input
user_input = input("Enter text to predict sentiment: ")
predicted_sentiment = predict_sentiment(user_input)

print(f"Predicted Sentiment: {predicted_sentiment}")


Predicted Sentiment: positive
