In [1]:
# Importing essential libraries
import numpy as np
import pandas as pd

In [2]:
# Loading the dataset
messages = pd.read_csv('feedback_dataset.csv')

In [3]:
messages.shape

(5200, 2)

In [4]:
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       5200 non-null   object
 1   sentiment  5200 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 81.4+ KB


In [5]:
messages.columns

Index(['text', 'sentiment'], dtype='object')

In [6]:
messages.head()

Unnamed: 0,text,sentiment
0,Display is excellent and camera is as good as ...,1
1,Battery life is also great!,1
2,Protects the phone on all sides.,1
3,"Clear Skype Calls, Long Battery Life, Long Range.",1
4,Great Hands Free Device.,1


# 

# Data Cleaning & Preprocessing

In [7]:
messages.sentiment.unique()

array([1, 0], dtype=int64)

In [8]:
def to_sentiment(sentiment):
  sentiment = sentiment
  if sentiment == 0:
    return 'negative'
  else: 
    return 'positive'
messages['target'] = messages.sentiment.apply(to_sentiment)

In [9]:
messages['target']

0       positive
1       positive
2       positive
3       positive
4       positive
          ...   
5195    negative
5196    negative
5197    negative
5198    negative
5199    negative
Name: target, Length: 5200, dtype: object

In [10]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer #to find the root word

In [12]:
ps = PorterStemmer()
corpus = []

In [13]:
for i in range (0,len(messages)):
    # Cleaning special character from the reviews
    feedback = re.sub('[^a-zA-Z]',' ',str(messages['text'][i]))
    
    # Converting the entire review into lower case
    feedback = feedback.lower()
    
    # Tokenizing the review by words
    feedback_words = feedback.split()
    
    # Stemming the words and removing the stopwords
    feedback = [ps.stem(word) for word in feedback_words if not word in set(stopwords.words('english')) ]
    
    # Joining the stemmed words
    feedback = ' '.join(feedback)

    # Creating a corpus
    corpus.append(feedback)

In [14]:
corpus[0:10]

['display excel camera good year',
 'batteri life also great',
 'protect phone side',
 'clear skype call long batteri life long rang',
 'great hand free devic',
 'even take self portrait outsid exterior display cool',
 'tri mani mani handsfre gadget one final work well',
 'magic help',
 'best phone market',
 'work well']

In [15]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = messages['target'].values

In [16]:
X.shape

(5200, 1500)

In [17]:
y.shape

(5200,)

# Training Model

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Naive Bayes (MulitinomialNB)

In [19]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB()

In [20]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.84      0.79      0.81       529
    positive       0.79      0.84      0.82       511

    accuracy                           0.81      1040
   macro avg       0.82      0.81      0.81      1040
weighted avg       0.82      0.81      0.81      1040



In [22]:
# Accuracy
from sklearn.metrics import accuracy_score
score1 = accuracy_score(y_test,y_pred)
print("---- Scores ----")
print("Accuracy score is: {}%".format(round(score1*100,2)))

---- Scores ----
Accuracy score is: 81.44%


In [23]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[417, 112],
       [ 81, 430]], dtype=int64)

In [24]:
# Hyperparameter tuning the Naive Bayes Classifier
best_accuracy = 0.0
alpha_val = 0.0
for i in np.arange(0.1,1.1,0.1):
  temp_classifier = MultinomialNB(alpha=i)
  temp_classifier.fit(X_train, y_train)
  temp_y_pred = temp_classifier.predict(X_test)
  score = accuracy_score(y_test, temp_y_pred)
  print("Accuracy score for alpha={} is: {}%".format(round(i,1), round(score*100,2)))
  if score>best_accuracy:
    best_accuracy = score
    alpha_val = i
print('--------------------------------------------')
print('The best accuracy is {}% with alpha value as {}'.format(round(best_accuracy*100, 2), round(alpha_val,1)))

Accuracy score for alpha=0.1 is: 80.58%
Accuracy score for alpha=0.2 is: 80.58%
Accuracy score for alpha=0.3 is: 80.87%
Accuracy score for alpha=0.4 is: 81.06%
Accuracy score for alpha=0.5 is: 81.25%
Accuracy score for alpha=0.6 is: 80.96%
Accuracy score for alpha=0.7 is: 81.06%
Accuracy score for alpha=0.8 is: 81.25%
Accuracy score for alpha=0.9 is: 81.35%
Accuracy score for alpha=1.0 is: 81.44%
--------------------------------------------
The best accuracy is 81.44% with alpha value as 1.0


# Support Vector Machine Classifier

In [25]:
from sklearn.svm import SVC
model = SVC()
model.fit(X_train, y_train)

SVC()

In [26]:
# Predicting the Test set results
y_pred = model.predict(X_test)

In [27]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.80      0.91      0.85       529
    positive       0.89      0.76      0.82       511

    accuracy                           0.84      1040
   macro avg       0.84      0.83      0.83      1040
weighted avg       0.84      0.84      0.83      1040



In [28]:
# Accuracy
from sklearn.metrics import accuracy_score
score1 = accuracy_score(y_test,y_pred)
print("---- Scores ----")
print("Accuracy score is: {}%".format(round(score1*100,2)))

---- Scores ----
Accuracy score is: 83.56%


In [29]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[480,  49],
       [122, 389]], dtype=int64)

In [30]:
from sklearn.model_selection import GridSearchCV

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
			'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
			'kernel': ['rbf']}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_train, y_train)


Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  12.6s
[CV 2/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  11.9s
[CV 3/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  12.2s
[CV 4/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  11.8s
[CV 5/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  12.4s
[CV 1/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  12.4s
[CV 2/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  11.5s
[CV 3/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  11.6s
[CV 4/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  11.7s
[CV 5/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  11.6s
[CV 1/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=  13.8s
[CV 2/5] END ..................C=0.1, gamma=0.0

[CV 2/5] END ....................C=1000, gamma=1, kernel=rbf; total time=  10.1s
[CV 3/5] END ....................C=1000, gamma=1, kernel=rbf; total time=  10.0s
[CV 4/5] END ....................C=1000, gamma=1, kernel=rbf; total time=  10.0s
[CV 5/5] END ....................C=1000, gamma=1, kernel=rbf; total time=  10.1s
[CV 1/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   9.0s
[CV 2/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   8.9s
[CV 3/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   8.7s
[CV 4/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   9.0s
[CV 5/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   8.8s
[CV 1/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   6.9s
[CV 2/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   6.3s
[CV 3/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   6.3s
[CV 4/5] END ...............

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [31]:
# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=10, gamma=0.1)


In [32]:
grid_predictions = grid.predict(X_test)
# print classification report
print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

    negative       0.84      0.88      0.86       529
    positive       0.87      0.82      0.85       511

    accuracy                           0.85      1040
   macro avg       0.85      0.85      0.85      1040
weighted avg       0.85      0.85      0.85      1040



# Logistic Regression

In [47]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

LogisticRegression()

In [34]:
# Predicting the Test set results
y_pred = log_reg.predict(X_test)

In [35]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.82      0.82      0.82       529
    positive       0.81      0.81      0.81       511

    accuracy                           0.82      1040
   macro avg       0.82      0.82      0.82      1040
weighted avg       0.82      0.82      0.82      1040



In [36]:
# Accuracy
from sklearn.metrics import accuracy_score
score_lg = accuracy_score(y_test,y_pred)
print("---- Scores ----")
print("Accuracy score is: {}%".format(round(score_lg*100,2)))

---- Scores ----
Accuracy score is: 81.63%


In [37]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[433,  96],
       [ 95, 416]], dtype=int64)

In [57]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# defining parameter range
param_grid = {'solver': ['lbfgs'],
			'penalty': ['l1', 'l2'],
			'C': [0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(log_reg, param_grid, scoring='accuracy', n_jobs=-1, cv=cv)

# fitting the model for grid search
grid.fit(X_train, y_train)

        nan 0.82011218        nan 0.80777244]


GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2'],
                         'solver': ['lbfgs']},
             scoring='accuracy')

In [49]:
# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}
LogisticRegression(C=1, solver='newton-cg')


In [50]:
grid_predictions_lg = grid.predict(X_test)
# print classification report
print(classification_report(y_test, grid_predictions_lg))

              precision    recall  f1-score   support

    negative       0.82      0.82      0.82       529
    positive       0.81      0.81      0.81       511

    accuracy                           0.82      1040
   macro avg       0.82      0.82      0.82      1040
weighted avg       0.82      0.82      0.82      1040



# Random Forest

In [51]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [52]:
# Predicting the Test set results
y_pred = model.predict(X_test)

In [53]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.84      0.84      0.84       529
    positive       0.83      0.84      0.83       511

    accuracy                           0.84      1040
   macro avg       0.84      0.84      0.84      1040
weighted avg       0.84      0.84      0.84      1040



In [54]:
# Accuracy
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test,y_pred)
print("---- Scores ----")
print("Accuracy score is: {}%".format(round(score*100,2)))

---- Scores ----
Accuracy score is: 83.75%


In [55]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[444,  85],
       [ 84, 427]], dtype=int64)

In [56]:
from sklearn.model_selection import GridSearchCV
# defining parameter range
param_grid = {'n_estimators': [10, 100, 1000],
			'max_features': ['sqrt', 'log2']}

grid = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END .............max_features=sqrt, n_estimators=10; total time=   0.7s
[CV 2/5] END .............max_features=sqrt, n_estimators=10; total time=   0.7s
[CV 3/5] END .............max_features=sqrt, n_estimators=10; total time=   0.7s
[CV 4/5] END .............max_features=sqrt, n_estimators=10; total time=   0.7s
[CV 5/5] END .............max_features=sqrt, n_estimators=10; total time=   0.6s
[CV 1/5] END ............max_features=sqrt, n_estimators=100; total time=   6.5s
[CV 2/5] END ............max_features=sqrt, n_estimators=100; total time=   6.5s
[CV 3/5] END ............max_features=sqrt, n_estimators=100; total time=   6.6s
[CV 4/5] END ............max_features=sqrt, n_estimators=100; total time=   6.9s
[CV 5/5] END ............max_features=sqrt, n_estimators=100; total time=   6.6s
[CV 1/5] END ...........max_features=sqrt, n_estimators=1000; total time= 1.0min
[CV 2/5] END ...........max_features=sqrt, n_esti

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_features': ['sqrt', 'log2'],
                         'n_estimators': [10, 100, 1000]},
             verbose=3)

In [60]:
grid_predictions_rf = grid.predict(X_test)
# print classification report
print(classification_report(y_test, grid_predictions_rf))

              precision    recall  f1-score   support

    negative       0.82      0.82      0.82       529
    positive       0.81      0.81      0.81       511

    accuracy                           0.82      1040
   macro avg       0.82      0.82      0.82      1040
weighted avg       0.82      0.82      0.82      1040

