In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")
import joblib

# Data Loading and Analysis
data = pd.read_csv('data.csv')
def clean_text(text):
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()
    return text


nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = clean_text(text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

data['clean_text'] = data['Review text'].apply(preprocess_text)

# Model Training
X = data['clean_text']
y = data['Ratings'] > 3  # Considering ratings 4 and 5 as positive, rest as negative
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Define hyperparameters for grid search for Random Forest
parameters_rf = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'clf__n_estimators': [50, 100, 200],  # Number of trees in the forest
    'clf__max_depth': [None, 10, 20]  # Maximum depth of the tree
}

# Add Random Forest classifier to the classifiers dictionary
classifiers['Random Forest'] = (RandomForestClassifier(), parameters_rf)

# Perform grid search for each classifier
for classifier_name, (classifier, params) in classifiers.items():
    pipeline = create_pipeline(classifier)
    grid_search = GridSearchCV(pipeline, params, cv=5, n_jobs=-1, scoring='f1_macro')
    grid_search.fit(X_train, y_train)
    
    # Print best parameters and F1 score
    print(f"Best parameters for {classifier_name}: {grid_search.best_params_}")
    
    # Evaluate on training data
    train_pred = grid_search.predict(X_train)
    train_f1 = f1_score(y_train, train_pred, average='macro')
    
    # Evaluate on testing data
    test_pred = grid_search.predict(X_test)
    test_f1 = f1_score(y_test, test_pred, average='macro')
    
    print(f"Training F1 Score for {classifier_name}: {train_f1}")
    print(f"Testing F1 Score for {classifier_name}: {test_f1}")
    print("\n")


Best parameters for Logistic Regression: {'clf__C': 10, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Training F1 Score for Logistic Regression: 0.9111056221932412
Testing F1 Score for Logistic Regression: 0.7777467844413452


Best parameters for Multinomial Naive Bayes: {'clf__alpha': 0.1, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Training F1 Score for Multinomial Naive Bayes: 0.893777440289826
Testing F1 Score for Multinomial Naive Bayes: 0.7639693675005219


Best parameters for Support Vector Machine: {'clf__C': 10, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Training F1 Score for Support Vector Machine: 0.9254202907010682
Testing F1 Score for Support Vector Machine: 0.7846522587694043


Best parameters for Random Forest: {'clf__max_depth': None, 'clf__n_estimators': 100, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
Training F1 Score for Random Forest: 0.925694064317482
Testing F1 Score for Random Forest: 0.76354004260981




In [7]:
model = grid_search.best_estimator_
joblib.dump(model, f'{classifier_name.lower().replace(" ", "_")}_model.pkl')

['random_forest_model.pkl']

In [8]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Define a function to create the pipeline
def create_pipeline(classifier):
    return Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier)
    ])

# Define hyperparameters for grid search for each classifier
parameters_lr = {
    'vect__ngram_range': [(1, 1), (1, 2)],  
    'tfidf__use_idf': (True, False),
    'clf__C': [0.1, 1, 10, 100],  
    'clf__penalty': ['l1', 'l2']
}

parameters_nb = {
    'vect__ngram_range': [(1, 1), (1, 2)],  
    'tfidf__use_idf': (True, False),
    'clf__alpha': [0.1, 1, 10, 100]
}

parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],  
    'tfidf__use_idf': (True, False),
    'clf__C': [0.1, 1, 10, 100],  
    'clf__kernel': ['linear', 'rbf']
}

parameters_rf = {
    'vect__ngram_range': [(1, 1), (1, 2)],  
    'tfidf__use_idf': (True, False),
    'clf__n_estimators': [100, 200, 300, 500],  
    'clf__max_depth': [None, 10, 20, 50]
}

# Define the classifiers with respective parameter grids
classifiers = {
    'Logistic Regression': (LogisticRegression(max_iter=1000), parameters_lr),
    'Multinomial Naive Bayes': (MultinomialNB(), parameters_nb),
    'Support Vector Machine': (SVC(), parameters_svm),
    'Random Forest': (RandomForestClassifier(), parameters_rf)
}

# Perform grid search for each classifier
for classifier_name, (classifier, params) in classifiers.items():
    pipeline = create_pipeline(classifier)
    grid_search = GridSearchCV(pipeline, params, cv=5, n_jobs=-1, scoring='f1_macro')
    grid_search.fit(X_train, y_train)
    
    # Print best parameters and F1 score
    print(f"Best parameters for {classifier_name}: {grid_search.best_params_}")
    
    # Save the model
    model = grid_search.best_estimator_
    joblib.dump(model, f'{classifier_name.lower().replace(" ", "_")}_model.pkl')
    
    # Evaluate on training data
    train_pred = grid_search.predict(X_train)
    train_f1 = f1_score(y_train, train_pred, average='macro')
    
    # Evaluate on testing data
    test_pred = grid_search.predict(X_test)
    test_f1 = f1_score(y_test, test_pred, average='macro')
    
    print(f"Training F1 Score for {classifier_name}: {train_f1}")
    print(f"Testing F1 Score for {classifier_name}: {test_f1}")
    print("\n")


Best parameters for Logistic Regression: {'clf__C': 10, 'clf__penalty': 'l2', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Training F1 Score for Logistic Regression: 0.9111056221932412
Testing F1 Score for Logistic Regression: 0.7777467844413452


Best parameters for Multinomial Naive Bayes: {'clf__alpha': 0.1, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Training F1 Score for Multinomial Naive Bayes: 0.893777440289826
Testing F1 Score for Multinomial Naive Bayes: 0.7639693675005219


Best parameters for Support Vector Machine: {'clf__C': 10, 'clf__kernel': 'linear', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Training F1 Score for Support Vector Machine: 0.9239911181880196
Testing F1 Score for Support Vector Machine: 0.781779118373315


Best parameters for Random Forest: {'clf__max_depth': None, 'clf__n_estimators': 200, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Training F1 Score for Random Forest: 0.9256031149951818
Testing F1 Score for Random F

In [9]:
model

Pipeline(steps=[('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer(use_idf=False)),
                ('clf', RandomForestClassifier(n_estimators=200))])