In [4]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Ensure necessary NLTK data is downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Step 1: Read the dataset
file_path = r"C:\Users\parathasarathi s\Downloads\amazon_alexa_data (2).csv"  # Ensure the path is correct
df = pd.read_csv(file_path)

# Step 2: Remove null values
df = df.dropna()

# Step 3: Preprocess the Amazon Alexa reviews
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    words = [word for word in words if word not in stop_words]
    words = [ps.stem(word) for word in words]
    return ' '.join(words)

df['clean_reviews'] = df['verified_reviews'].apply(preprocess_text)

# Step 4: Transform the words into vectors
# Using Count Vectorizer
cv = CountVectorizer()
X_cv = cv.fit_transform(df['clean_reviews'])

# Using TF-IDF Vectorizer
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['clean_reviews'])

# Step 5: Split data into training and test data
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_cv, df['feedback'], test_size=0.2, random_state=42)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, df['feedback'], test_size=0.2, random_state=42)

# Step 6: Apply models and generate predictions
# a) Multinomial Naïve Bayes Classification
nb_model = MultinomialNB()
nb_model.fit(X_train_cv, y_train_cv)
nb_pred = nb_model.predict(X_test_cv)

# b) Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train_tfidf)
lr_pred = lr_model.predict(X_test_tfidf)

# c) KNN Classification
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_tfidf, y_train_tfidf)
knn_pred = knn_model.predict(X_test_tfidf)

# Step 7: Predict the feedback for test data
# Predictions are stored in nb_pred, lr_pred, knn_pred

# Step 8: Compute Confusion matrix and classification report
# for each of these models
conf_matrix_nb = confusion_matrix(y_test_cv, nb_pred)
class_report_nb = classification_report(y_test_cv, nb_pred)

conf_matrix_lr = confusion_matrix(y_test_tfidf, lr_pred)
class_report_lr = classification_report(y_test_tfidf, lr_pred)

conf_matrix_knn = confusion_matrix(y_test_tfidf, knn_pred)
class_report_knn = classification_report(y_test_tfidf, knn_pred)

# Step 9: Report the model with the best accuracy
# This can be done by comparing the accuracy scores
accuracy_nb = nb_model.score(X_test_cv, y_test_cv)
accuracy_lr = lr_model.score(X_test_tfidf, y_test_tfidf)
accuracy_knn = knn_model.score(X_test_tfidf, y_test_tfidf)

best_model = max([(accuracy_nb, 'Multinomial Naïve Bayes'), (accuracy_lr, 'Logistic Regression'), (accuracy_knn, 'KNN')],
                 key=lambda x: x[0])

print(f"The model with the best accuracy is {best_model[1]} with an accuracy of {best_model[0]*100:.2f}%.")

# Print classification reports and confusion matrices
print("\nClassification Report for Multinomial Naive Bayes:")
print(class_report_nb)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(conf_matrix_nb)

print("\nClassification Report for Logistic Regression:")
print(class_report_lr)
print("Confusion Matrix for Logistic Regression:")
print(conf_matrix_lr)

print("\nClassification Report for KNN:")
print(class_report_knn)
print("Confusion Matrix for KNN:")
print(conf_matrix_knn)


[nltk_data] Downloading package stopwords to C:\Users\parathasarathi
[nltk_data]     s\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\parathasarathi
[nltk_data]     s\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model with the best accuracy is Multinomial Naïve Bayes with an accuracy of 91.90%.

Classification Report for Multinomial Naive Bayes:
              precision    recall  f1-score   support

           0       0.64      0.28      0.39        58
           1       0.93      0.98      0.96       572

    accuracy                           0.92       630
   macro avg       0.79      0.63      0.67       630
weighted avg       0.90      0.92      0.90       630

Confusion Matrix for Multinomial Naive Bayes:
[[ 16  42]
 [  9 563]]

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       1.00      0.03      0.07        58
           1       0.91      1.00      0.95       572

    accuracy                           0.91       630
   macro avg       0.96      0.52      0.51       630
weighted avg       0.92      0.91      0.87       630

Confusion Matrix for Logistic Regression:
[[  2  56]
 [  0 572]]

Classification Report f