In [1]:
# Importing essential libraries for data processing
import numpy as np
import pandas as pd
# For removing stopwords
import re
import nltk
from nltk.corpus import stopwords
#to find the root word
from nltk.stem.porter import PorterStemmer 
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Importing essential models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,GridSearchCV,RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# For calculating accuracy,recall,precision and confusion matrix
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [2]:
# Read the feedback CSV (expected in the working directory) into a DataFrame
messages = pd.read_csv(filepath_or_buffer='feedback_dataset.csv')

In [3]:
# Dataset dimensions as (n_rows, n_columns)
messages.shape

(5200, 2)

In [4]:
# Summary of column names, non-null counts and dtypes
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       5200 non-null   object
 1   sentiment  5200 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 81.4+ KB


In [5]:
# Column labels of the loaded dataset
messages.columns

Index(['text', 'sentiment'], dtype='object')

In [6]:
# Preview the first five rows
messages.head()

Unnamed: 0,text,sentiment
0,Display is excellent and camera is as good as ...,1
1,Battery life is also great!,1
2,Protects the phone on all sides.,1
3,"Clear Skype Calls, Long Battery Life, Long Range.",1
4,Great Hands Free Device.,1


# 

# Data Cleaning & Preprocessing

In [7]:
# Distinct label values present in the `sentiment` column
messages.sentiment.unique()

array([1, 0], dtype=int64)

In [8]:
def to_sentiment(sentiment):
  """Translate a numeric sentiment label into its text name.

  Args:
    sentiment: Numeric label; the value 0 denotes negative feedback.

  Returns:
    'negative' when ``sentiment`` equals 0, otherwise 'positive'.
  """
  return 'negative' if sentiment == 0 else 'positive'
# Add a text label column derived from the numeric `sentiment` column
messages['target'] = messages['sentiment'].apply(to_sentiment)

In [9]:
# Inspect the derived text labels
messages['target']

0       positive
1       positive
2       positive
3       positive
4       positive
          ...   
5195    negative
5196    negative
5197    negative
5198    negative
5199    negative
Name: target, Length: 5200, dtype: object

In [10]:
# Make sure the NLTK stopword lists are available locally
nltk.download('stopwords')

# Container for the cleaned feedback strings, and the stemmer used to build them
corpus = list()
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Start from an empty corpus so that re-running this cell never appends
# a second copy of every cleaned feedback string.
corpus = []

# The stopword set does not change between words, so build it once up front
# instead of reconstructing it for every token.
english_stopwords = set(stopwords.words('english'))

for raw_text in messages['text']:
    # Cleaning special character from the reviews (anything that is not a letter becomes a space)
    feedback = re.sub('[^a-zA-Z]', ' ', str(raw_text))

    # Converting the entire feedback into lower case
    feedback = feedback.lower()

    # Tokenizing the feedback by words
    feedback_words = feedback.split()

    # Stemming the words and removing the stopwords
    feedback = [ps.stem(word) for word in feedback_words if word not in english_stopwords]

    # Joining the stemmed words
    feedback = ' '.join(feedback)

    # Creating a corpus
    corpus.append(feedback)

In [26]:
# Create the frame of cleaned feedback explicitly: no earlier cell defines `df`,
# so assigning a column to it would fail on a freshly started kernel.
df = pd.DataFrame(corpus)
# Keep the default column 0 and expose the same text under the named column
# that the XGBoost section reads (`df.feedback`).
df['feedback'] = df[0]

In [27]:
# Display the frame of cleaned feedback text
df

Unnamed: 0,0,feedback
0,display excel camera good year,display excel camera good year
1,batteri life also great,batteri life also great
2,protect phone side,protect phone side
3,clear skype call long batteri life long rang,clear skype call long batteri life long rang
4,great hand free devic,great hand free devic
...,...,...
5195,rough handl,rough handl
5196,yeah quit well book avail,yeah quit well book avail
5197,uncertain,uncertain
5198,averag,averag


In [12]:
# First ten cleaned feedback strings
corpus[:10]

['display excel camera good year',
 'batteri life also great',
 'protect phone side',
 'clear skype call long batteri life long rang',
 'great hand free devic',
 'even take self portrait outsid exterior display cool',
 'tri mani mani handsfre gadget one final work well',
 'magic help',
 'best phone market',
 'work well']

In [13]:
# Bag-of-words model whose vocabulary is capped at 1500 terms
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)

# Dense document-term count matrix used as model input
X = term_counts.toarray()
# Text labels ('negative' / 'positive') used as the prediction target
y = messages['target'].values

In [14]:
# (n_documents, n_vocabulary_terms) of the feature matrix
X.shape

(5200, 1500)

In [15]:
# One label per document
y.shape

(5200,)

# Training Model

In [16]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Naive Bayes (MultinomialNB)

In [17]:
# Train a multinomial Naive Bayes classifier on the training split
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Mean accuracy on the training data and on the held-out test data
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

Training Accuracy : 0.8754807692307692
Testing Accuracy : 0.8144230769230769


In [18]:
# Predict labels for the held-out test set with the Naive Bayes classifier
y_pred = classifier.predict(X_test)

In [19]:
# Per-class precision, recall and F1 for the current predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.84      0.79      0.81       529
    positive       0.79      0.84      0.82       511

    accuracy                           0.81      1040
   macro avg       0.82      0.81      0.81      1040
weighted avg       0.82      0.81      0.81      1040



In [20]:
# Overall accuracy of the current predictions, reported as a percentage
score1 = accuracy_score(y_test, y_pred)
accuracy_pct = round(score1 * 100, 2)
print("---- Scores ----")
print("Accuracy score is: {}%".format(accuracy_pct))

---- Scores ----
Accuracy score is: 81.44%


In [21]:
# Making the Confusion Matrix
# (scikit-learn convention: rows are true labels, columns are predicted labels)
cm = confusion_matrix(y_test, y_pred)
cm

array([[417, 112],
       [ 81, 430]], dtype=int64)

In [22]:
# Sweep the Naive Bayes smoothing parameter alpha over 0.1, 0.2, ..., 1.0 and
# remember the value with the highest accuracy (the first one wins on ties).
# NOTE(review): the candidates are scored on the test split, so the reported
# "best" accuracy is an optimistic estimate — consider a validation split or CV.
best_accuracy, alpha_val = 0.0, 0.0
for alpha in np.arange(0.1, 1.1, 0.1):
  temp_classifier = MultinomialNB(alpha=alpha)
  temp_classifier.fit(X_train, y_train)
  score = accuracy_score(y_test, temp_classifier.predict(X_test))
  print("Accuracy score for alpha={} is: {}%".format(round(alpha,1), round(score*100,2)))
  if score > best_accuracy:
    best_accuracy, alpha_val = score, alpha
print('--------------------------------------------')
print('The best accuracy is {}% with alpha value as {}'.format(round(best_accuracy*100, 2), round(alpha_val,1)))

Accuracy score for alpha=0.1 is: 80.58%
Accuracy score for alpha=0.2 is: 80.58%
Accuracy score for alpha=0.3 is: 80.87%
Accuracy score for alpha=0.4 is: 81.06%
Accuracy score for alpha=0.5 is: 81.25%
Accuracy score for alpha=0.6 is: 80.96%
Accuracy score for alpha=0.7 is: 81.06%
Accuracy score for alpha=0.8 is: 81.25%
Accuracy score for alpha=0.9 is: 81.35%
Accuracy score for alpha=1.0 is: 81.44%
--------------------------------------------
The best accuracy is 81.44% with alpha value as 1.0


# Support Vector Machine Classifier

In [23]:
# Train a support vector classifier with scikit-learn's default settings
model = SVC()
model.fit(X_train, y_train)

# Mean accuracy on the training data and on the held-out test data
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

Training Accuracy : 0.9495192307692307
Testing Accuracy : 0.8355769230769231


In [24]:
# Predict labels for the held-out test set with the fitted SVC
y_pred = model.predict(X_test)

In [25]:
# Per-class precision, recall and F1 for the current predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.80      0.91      0.85       529
    positive       0.89      0.76      0.82       511

    accuracy                           0.84      1040
   macro avg       0.84      0.83      0.83      1040
weighted avg       0.84      0.84      0.83      1040



In [26]:
# Overall accuracy of the current predictions, reported as a percentage
score1 = accuracy_score(y_test, y_pred)
accuracy_pct = round(score1 * 100, 2)
print("---- Scores ----")
print("Accuracy score is: {}%".format(accuracy_pct))

---- Scores ----
Accuracy score is: 83.56%


In [27]:
# Making the Confusion Matrix
# (scikit-learn convention: rows are true labels, columns are predicted labels)
cm = confusion_matrix(y_test, y_pred)
cm

array([[480,  49],
       [122, 389]], dtype=int64)

In [None]:

# Candidate hyper-parameters for an RBF-kernel SVM
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# Exhaustive search over the grid; refit=True retrains the best candidate on
# the whole training split, verbose=3 logs every individual fit.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)

# Run the search on the training split
grid.fit(X_train, y_train)


Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  15.2s
[CV 2/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  14.9s
[CV 3/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  15.7s
[CV 4/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  15.5s
[CV 5/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  15.9s
[CV 1/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  18.9s
[CV 2/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  18.1s
[CV 3/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  17.4s
[CV 4/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  16.4s
[CV 5/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  14.6s
[CV 1/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=  16.9s
[CV 2/5] END ..................C=0.1, gamma=0.0

In [None]:
# print best parameter after tuning (the winning combination from param_grid)
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning (the refit best estimator)
print(grid.best_estimator_)

In [None]:
# Predict the test set through the search object, which delegates to its refit best estimator
grid_predictions = grid.predict(X_test)
# print classification report
print(classification_report(y_test, grid_predictions))

# Logistic Regression

In [None]:
# Fitting Logistic Regression to the Training set
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# calculating the classification accuracies
# Score `log_reg` itself: `model` still refers to the SVC fitted in the previous
# section, so scoring it here would report the wrong classifier's accuracy.
print("Training Accuracy :", log_reg.score(X_train, y_train))
print("Testing Accuracy :", log_reg.score(X_test, y_test))

In [None]:
# Predict labels for the held-out test set with the logistic-regression model
y_pred = log_reg.predict(X_test)

In [None]:
# Per-class precision, recall and F1 for the current predictions
print(classification_report(y_test, y_pred))

In [None]:
# Overall accuracy of the current predictions, reported as a percentage
score_lg = accuracy_score(y_test, y_pred)
accuracy_pct = round(score_lg * 100, 2)
print("---- Scores ----")
print("Accuracy score is: {}%".format(accuracy_pct))

In [None]:
# Making the Confusion Matrix
# (scikit-learn convention: rows are true labels, columns are predicted labels)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Repeated stratified 10-fold cross-validation (3 repeats, fixed seed).
# Stored under its own name so the fitted CountVectorizer kept in `cv`
# is not silently replaced by an object of a different kind.
cv_splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# defining parameter range
param_grid = {'solver': ['lbfgs'],
              'penalty': ['l2'],
              'C': [0.01, 0.1, 1, 10, 100]}

# Accuracy-scored grid search over the regularisation strength, using all CPU cores
grid = GridSearchCV(log_reg, param_grid, scoring='accuracy', n_jobs=-1, cv=cv_splitter)

# fitting the model for grid search
grid.fit(X_train, y_train)

In [None]:
# print best parameter after tuning (the winning combination from param_grid)
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning (the best estimator found)
print(grid.best_estimator_)

In [None]:
# Predict the test set with the tuned logistic-regression search object
grid_predictions_lg = grid.predict(X_test)
# print classification report
print(classification_report(y_test, grid_predictions_lg))

# Random Forest

In [None]:
# Random forests are randomised (bootstrap samples, feature sub-sampling), so
# fix the seed to make the reported accuracies repeatable across runs.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

# calculating the classification accuracies
print("Training Accuracy :", model.score(X_train, y_train))
print("Testing Accuracy :", model.score(X_test, y_test))

In [None]:
# Predict labels for the held-out test set with the fitted random forest
y_pred = model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 for the current predictions
print(classification_report(y_test, y_pred))

In [None]:
# Overall accuracy of the current predictions, reported as a percentage
score = accuracy_score(y_test, y_pred)
accuracy_pct = round(score * 100, 2)
print("---- Scores ----")
print("Accuracy score is: {}%".format(accuracy_pct))

In [None]:
# Making the Confusion Matrix
# (scikit-learn convention: rows are true labels, columns are predicted labels)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# defining parameter range: forest size and the per-split feature sub-sampling rule
param_grid = {'n_estimators': [10, 100, 1000],
              'max_features': ['sqrt', 'log2']}

# Seed the forest so that differences between candidates come from the
# hyper-parameters rather than from run-to-run randomness.
grid = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_train, y_train)

In [None]:
# Predict the test set with the tuned random-forest search object
grid_predictions_rf = grid.predict(X_test)
# print classification report
print(classification_report(y_test, grid_predictions_rf))

In [None]:
# print best parameter after tuning (the winning combination from param_grid)
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning (the refit best estimator)
print(grid.best_estimator_)

## XGBoost Classifier

In [30]:
# Rebuild the bag-of-words features from the cleaned feedback column
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(df['feedback']).toarray()

# Use the numeric 0/1 labels here (the earlier models were trained on the text labels)
y = messages['sentiment'].values

# Same 80/20 split and seed as in the earlier sections
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [32]:
# Train a gradient-boosted tree classifier with XGBoost's default settings
model = XGBClassifier()
# Last expression of the cell, so the fitted estimator's repr is displayed
model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [34]:
# Name the count columns after the vectorizer's vocabulary terms.
# Newer scikit-learn releases dropped `get_feature_names` in favour of
# `get_feature_names_out`, so prefer the new accessor and fall back to the
# old one on installations that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
count_df = pd.DataFrame(X_train, columns=feature_names)
# The 'etiket' column carries the training label of each row
count_df['etiket'] = y_train

In [35]:
# Fit a default XGBoost classifier on the training data
model = XGBClassifier()
model.fit(X_train, y_train)

# Predict the test set and round every prediction to the nearest integer label
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

# Share of test samples whose rounded prediction matches the true label
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % (accuracy_pct,))

Accuracy: 80.38%


In [46]:
# Imported in this cell because only this section uses XGBoost's native API.
import xgboost as xgb

# Wrap the train/test splits in XGBoost's native data container
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Every (max_depth, min_child_weight) pair to evaluate: depths 9-11 x weights 5-7
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]
# Baseline parameters; the grid search below derives each candidate from a copy
# of this dict and leaves the baseline itself untouched.
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    # 'reg:squarederror' is the current name of the deprecated 'reg:linear' alias.
    # NOTE(review): the labels are binary 0/1, yet this trains a regressor scored
    # by MAE — confirm this is intended rather than 'binary:logistic'.
    'objective':'reg:squarederror',
    'eval_metric': "mae",
}
num_boost_round = 999

# Baseline booster trained with early stopping.
# NOTE(review): early stopping watches the test split, so the test data
# influences the number of boosting rounds — consider a separate validation set.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

# NOTE(review): the saved output of this cell ends in
# "OverflowError: Python int too large to convert to C long"; the cause cannot be
# determined from this cell alone — TODO reproduce and confirm.

# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Candidate parameters: the baseline with the two tuned values overridden
    candidate_params = dict(
        params,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
    )
    # Run 5-fold CV with a fixed seed and early stopping on the mean test MAE
    cv_results = xgb.cv(
        candidate_params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    # Position (0-based) of the boosting round with the lowest mean test MAE
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

OverflowError: Python int too large to convert to C long