# Natural Language Processing (NLP)

# NLP model with ML on Trip_advisor_review dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import math
import warnings
warnings.filterwarnings('ignore')

In [2]:
trip = pd.read_csv(r"D:\Python\Python data set lec\NLP\Trip_advisor_review.csv" )     

In [3]:
trip.shape

(20491, 2)

In [4]:
trip.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


# To check null values

In [5]:
trip.isnull().sum()

Review    0
Rating    0
dtype: int64

# Target variable 

In [6]:
trip.Rating.value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: Rating, dtype: int64

# Process the Data

# Convert all upper string case into lower string case 

In [7]:
trip.Review = trip.Review.str.lower()

In [8]:
trip

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


# Remove the stopwords & punctuation marks by using user-defined function

In [9]:
from nltk.corpus import stopwords

In [10]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [11]:
len(stopwords.words('english'))

179

In [12]:
import string

In [13]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
def text_process(mess):            ### creating a function
    """                                                        ## A docstring
    1. remove the punctuation
    2. remove the stopwords
    3. return the list of clean textwords
    
    """
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = "".join(nopunc)
    
    return [ word for word in nopunc.split() if word not in stopwords.words("english")]


In [15]:
trip.Review.apply(text_process)

0        [nice, hotel, expensive, parking, got, good, d...
1        [ok, nothing, special, charge, diamond, member...
2        [nice, rooms, 4, experience, hotel, monaco, se...
3        [unique, great, stay, wonderful, time, hotel, ...
4        [great, stay, great, stay, went, seahawk, game...
                               ...                        
20486    [best, kept, secret, 3rd, time, staying, charm...
20487    [great, location, price, view, hotel, great, q...
20488    [ok, looks, nice, modern, outside, desk, staff...
20489    [hotel, theft, ruined, vacation, hotel, opened...
20490    [people, talking, ca, nt, believe, excellent, ...
Name: Review, Length: 20491, dtype: object

# CountVectorizer ... Count of each & every words

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
bow_transformer = CountVectorizer(analyzer = text_process).fit(trip['Review'])

In [18]:
bow_transformer.vocabulary_  

{'nice': 49206,
 'hotel': 36909,
 'expensive': 28378,
 'parking': 53208,
 'got': 33582,
 'good': 33347,
 'deal': 21915,
 'stay': 68556,
 'anniversary': 6999,
 'arrived': 8200,
 'late': 42141,
 'evening': 27588,
 'took': 73466,
 'advice': 5264,
 'previous': 56545,
 'reviews': 61027,
 'valet': 76980,
 'check': 16606,
 'quick': 57945,
 'easy': 25627,
 'little': 43220,
 'disappointed': 23592,
 'nonexistent': 49873,
 'view': 77537,
 'room': 61740,
 'clean': 17592,
 'size': 65815,
 'bed': 11090,
 'comfortable': 18553,
 'woke': 80136,
 'stiff': 68860,
 'neck': 48790,
 'high': 36036,
 'pillows': 54643,
 'soundproof': 67119,
 'like': 42920,
 'heard': 35439,
 'music': 48313,
 'night': 49383,
 'morning': 47723,
 'loud': 43892,
 'bangs': 9939,
 'doors': 24555,
 'opening': 51490,
 'closing': 17921,
 'hear': 35436,
 'people': 53896,
 'talking': 70917,
 'hallway': 34815,
 'maybe': 45647,
 'noisy': 49763,
 'neighbors': 48969,
 'aveda': 9092,
 'bath': 10462,
 'products': 57041,
 'goldfish': 33298,
 'to

In [19]:
len(bow_transformer.vocabulary_)

81408

# Term Document Frequency(TDM)

In [20]:
trip_bow = bow_transformer.transform(trip.Review)

In [21]:
trip_bow.shape

(20491, 81408)

In [22]:
type(trip_bow)

scipy.sparse.csr.csr_matrix

# Train-test split

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
x_train , x_test , y_train , y_test = train_test_split(trip_bow , trip.Rating , test_size=.2)

# 1) Logistic_Regression

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix , classification_report
logreg= LogisticRegression()

In [30]:
logreg.fit(x_train , y_train ) 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [31]:
pred_log = logreg.predict(x_test) 
pred_log

array([3, 5, 1, ..., 4, 5, 3], dtype=int64)

In [32]:
tab_log = confusion_matrix( pred_log , y_test)
tab_log

array([[ 197,   66,   20,    6,    5],
       [  76,  137,   80,   31,   10],
       [  16,   88,  126,  135,   29],
       [   6,   37,  155,  554,  399],
       [   7,   19,   41,  468, 1391]], dtype=int64)

In [33]:
logreg.coef_ 

array([[ 6.04730726e-02,  4.96311079e-02, -2.65804765e-02, ...,
        -3.41328072e-04, -5.69659697e-04, -2.88474065e-04],
       [-4.24449363e-02, -1.36387322e-01,  7.63829109e-03, ...,
        -4.37593034e-04, -9.42584136e-04, -6.80410258e-04],
       [ 3.83844846e-01,  4.29790619e-02,  4.29231078e-02, ...,
        -8.25792726e-04, -2.32118321e-03, -9.90809222e-03],
       [ 1.62286005e-01, -7.99727144e-02, -5.20730489e-02, ...,
        -1.64573191e-02, -1.01180889e-01, -1.53559861e-01],
       [-5.64158987e-01,  1.23749866e-01,  2.80921264e-02, ...,
         1.80620329e-02,  1.05014317e-01,  1.64436837e-01]])

In [34]:
logreg.intercept_ 

array([-0.07458033, -0.05736765, -0.32988261,  0.22974277,  0.23208783])

In [35]:
print(classification_report(y_test , pred_log))

              precision    recall  f1-score   support

           1       0.67      0.65      0.66       302
           2       0.41      0.39      0.40       347
           3       0.32      0.30      0.31       422
           4       0.48      0.46      0.47      1194
           5       0.72      0.76      0.74      1834

    accuracy                           0.59      4099
   macro avg       0.52      0.51      0.52      4099
weighted avg       0.58      0.59      0.58      4099



# 2) Decision Tree

# Grid search

In [39]:
from sklearn.model_selection import GridSearchCV

In [40]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

In [41]:
search_dict = {"criterion" : ('entropy' , 'gini'),
              "max_depth" : ( 3,4,5,6,7,8,9 ,10, 12),
              "min_samples_split": (25, 50 , 75, 100 ,150,200)}

In [42]:
grid_dt = GridSearchCV(dtc , param_grid= search_dict)

In [43]:
grid_dt.fit(x_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ('entropy', 'gini'),
                         'max_depth': (3, 4, 5, 6, 7, 8, 9, 10, 12),
                         'min_samples_split': (25, 50, 75, 100, 150, 200)})

In [44]:
grid_dt.best_params_

{'criterion': 'gini', 'max_depth': 12, 'min_samples_split': 200}

# DT by using Grid search

In [45]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion="gini" , max_depth = 12 , min_samples_split=200 , class_weight="balanced" )

In [46]:
dtc.fit(x_train , y_train)

DecisionTreeClassifier(class_weight='balanced', max_depth=12,
                       min_samples_split=200)

In [47]:
pred_dtc = dtc.predict(x_test)
pred_dtc

array([1, 5, 3, ..., 5, 4, 2], dtype=int64)

In [48]:
tab_dtc = confusion_matrix(pred_dtc , y_test)
tab_dtc

array([[189, 101,  59,  89, 136],
       [ 67, 103, 101, 179, 173],
       [ 16,  70, 128, 229, 173],
       [ 25,  50,  96, 423, 570],
       [  5,  23,  38, 274, 782]], dtype=int64)

In [49]:
print(classification_report( y_test , pred_dtc))

              precision    recall  f1-score   support

           1       0.33      0.63      0.43       302
           2       0.17      0.30      0.21       347
           3       0.21      0.30      0.25       422
           4       0.36      0.35      0.36      1194
           5       0.70      0.43      0.53      1834

    accuracy                           0.40      4099
   macro avg       0.35      0.40      0.36      4099
weighted avg       0.48      0.40      0.42      4099



# 3) DT with ADA-BOOST

In [50]:
from sklearn.ensemble import AdaBoostClassifier
abc_dtc = AdaBoostClassifier(dtc , n_estimators=25)

In [51]:
abc_dtc.fit(x_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced',
                                                         max_depth=12,
                                                         min_samples_split=200),
                   n_estimators=25)

In [52]:
pred_dtc_boost = abc_dtc.predict(x_test)
pred_dtc_boost

array([2, 5, 4, ..., 4, 4, 4], dtype=int64)

In [53]:
tab_dtc_boost = confusion_matrix(pred_dtc_boost , y_test)
tab_dtc_boost 

array([[ 129,   36,    8,    8,    3],
       [ 108,  143,   91,   97,   77],
       [  41,   83,  140,  224,  158],
       [  14,   57,  121,  478,  500],
       [  10,   28,   62,  387, 1096]], dtype=int64)

In [54]:
print(classification_report( y_test , pred_dtc_boost)) 

              precision    recall  f1-score   support

           1       0.70      0.43      0.53       302
           2       0.28      0.41      0.33       347
           3       0.22      0.33      0.26       422
           4       0.41      0.40      0.40      1194
           5       0.69      0.60      0.64      1834

    accuracy                           0.48      4099
   macro avg       0.46      0.43      0.43      4099
weighted avg       0.53      0.48      0.50      4099



# 4) Random Tree

In [55]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [56]:
search_rf = {"criterion" : ('entropy' , 'gini'),
              "max_depth" : (3,4,5,6,7),
              "n_estimators": (25,50, 100 , 150, 200)}

In [57]:
grid_rf = GridSearchCV(rf , param_grid= search_rf)

In [58]:
grid_rf.fit(x_train , y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ('entropy', 'gini'),
                         'max_depth': (3, 4, 5, 6, 7),
                         'n_estimators': (25, 50, 100, 150, 200)})

In [59]:
grid_rf.best_params_

{'criterion': 'gini', 'max_depth': 7, 'n_estimators': 25}

# RF by using Grid search

In [60]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(criterion= 'gini', max_depth= 7 , n_estimators= 25 , class_weight="balanced")

In [61]:
rf.fit(x_train , y_train)

RandomForestClassifier(class_weight='balanced', max_depth=7, n_estimators=25)

In [62]:
pred_rfc = rf.predict(x_test)
pred_rfc

array([2, 5, 4, ..., 5, 5, 1], dtype=int64)

In [63]:
tab_rfc = confusion_matrix(pred_rfc , y_test)
tab_rfc

array([[ 238,  173,   94,   86,  114],
       [  25,   50,   41,   70,   47],
       [   7,   40,   74,  127,   56],
       [   8,   38,  104,  330,  321],
       [  24,   46,  109,  581, 1296]], dtype=int64)

In [64]:
print(classification_report( y_test , pred_rfc))  

              precision    recall  f1-score   support

           1       0.34      0.79      0.47       302
           2       0.21      0.14      0.17       347
           3       0.24      0.18      0.20       422
           4       0.41      0.28      0.33      1194
           5       0.63      0.71      0.67      1834

    accuracy                           0.48      4099
   macro avg       0.37      0.42      0.37      4099
weighted avg       0.47      0.48      0.46      4099



# 5) RF with ADA-BOOST

In [65]:
from sklearn.ensemble import AdaBoostClassifier
abc_rfc = AdaBoostClassifier(rf)

In [66]:
abc_rfc.fit(x_train , y_train)

AdaBoostClassifier(base_estimator=RandomForestClassifier(class_weight='balanced',
                                                         max_depth=7,
                                                         n_estimators=25))

In [67]:
pred_rfc_boost = abc_rfc.predict(x_test)
pred_rfc_boost

array([3, 5, 3, ..., 5, 4, 2], dtype=int64)

In [68]:
tab_rfc_boost = confusion_matrix(pred_rfc_boost , y_test)
tab_rfc_boost

array([[ 135,   15,    4,    0,    2],
       [ 142,  176,   70,   35,   24],
       [  17,  106,  191,  186,   55],
       [   4,   41,  140,  697,  625],
       [   4,    9,   17,  276, 1128]], dtype=int64)

In [69]:
print(classification_report( y_test , pred_rfc_boost))

              precision    recall  f1-score   support

           1       0.87      0.45      0.59       302
           2       0.39      0.51      0.44       347
           3       0.34      0.45      0.39       422
           4       0.46      0.58      0.52      1194
           5       0.79      0.62      0.69      1834

    accuracy                           0.57      4099
   macro avg       0.57      0.52      0.53      4099
weighted avg       0.62      0.57      0.58      4099



# 6) Naive Bayes

In [72]:
from sklearn.naive_bayes import MultinomialNB
naive_bays = MultinomialNB()

In [73]:
nb = naive_bays.fit(x_train , y_train)

In [74]:
pred_nb = nb.predict(x_test)
pred_nb 

array([5, 5, 4, ..., 4, 5, 4], dtype=int64)

In [75]:
tab_nb = confusion_matrix(pred_nb , y_test)
tab_nb

array([[  84,    7,    3,    1,    2],
       [ 136,   83,   20,    7,    4],
       [   4,   10,   10,    6,    8],
       [  58,  192,  307,  590,  284],
       [  20,   55,   82,  590, 1536]], dtype=int64)

In [76]:
print(classification_report(y_test , pred_nb))

              precision    recall  f1-score   support

           1       0.87      0.28      0.42       302
           2       0.33      0.24      0.28       347
           3       0.26      0.02      0.04       422
           4       0.41      0.49      0.45      1194
           5       0.67      0.84      0.75      1834

    accuracy                           0.56      4099
   macro avg       0.51      0.37      0.39      4099
weighted avg       0.54      0.56      0.52      4099



# 7) KNN 

In [79]:
trip_bow.shape

(20491, 81408)

In [87]:
k=np.sqrt(trip_bow.shape[0])
k

143.1467778191322

In [88]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors= 143)

In [89]:
knn.fit(x_train , y_train)

KNeighborsClassifier(n_neighbors=143)

In [90]:
pred_knn = knn.predict(x_test)
pred_knn

array([5, 5, 5, ..., 5, 5, 5], dtype=int64)

In [91]:
tab_knn = confusion_matrix(pred_knn , y_test)
tab_knn

array([[   3,    0,    0,    0,    0],
       [   3,    2,    0,    0,    0],
       [   0,    0,    0,    0,    0],
       [  20,   55,   68,  200,   81],
       [ 276,  290,  354,  994, 1753]], dtype=int64)

In [92]:
print(classification_report(y_test , pred_knn))

              precision    recall  f1-score   support

           1       1.00      0.01      0.02       302
           2       0.40      0.01      0.01       347
           3       0.00      0.00      0.00       422
           4       0.47      0.17      0.25      1194
           5       0.48      0.96      0.64      1834

    accuracy                           0.48      4099
   macro avg       0.47      0.23      0.18      4099
weighted avg       0.46      0.48      0.36      4099



# 8) SVM 

In [99]:
from sklearn.model_selection import GridSearchCV

In [100]:
search_dict = {"kernel" : ('linear', 'poly' , 'rbf', 'sigmoid')}

In [101]:
from sklearn.svm import SVC
svc_model = SVC()

In [102]:
grid_svc = GridSearchCV(svc_model , param_grid= search_dict)

In [110]:
grid_svc.fit(x_train , y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'kernel': ('linear', 'poly', 'rbf', 'sigmoid')})

In [111]:
grid_svc.best_params_    

{'kernel': 'rbf'}

# SVM by using grid search

In [93]:
from sklearn.svm import SVC

In [94]:
svm_best = SVC(kernel ='rbf')

In [95]:
svm_best.fit(x_train , y_train)

SVC()

In [96]:
pred_svm = svm_best.predict(x_test)
pred_svm

array([3, 5, 4, ..., 5, 5, 2], dtype=int64)

In [97]:
tab_svm = confusion_matrix(pred_svm , y_test)
tab_svm 

array([[ 169,   44,   10,    2,    1],
       [  78,  136,   55,   11,    5],
       [   6,   37,   69,   28,    2],
       [  18,   77,  233,  611,  277],
       [  31,   53,   55,  542, 1549]], dtype=int64)

In [98]:
print(classification_report(y_test , pred_svm))

              precision    recall  f1-score   support

           1       0.75      0.56      0.64       302
           2       0.48      0.39      0.43       347
           3       0.49      0.16      0.24       422
           4       0.50      0.51      0.51      1194
           5       0.69      0.84      0.76      1834

    accuracy                           0.62      4099
   macro avg       0.58      0.49      0.52      4099
weighted avg       0.60      0.62      0.60      4099



# Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import math

In [2]:
trip = pd.read_csv(r"D:\Python\Python data set lec\NLP\Trip_advisor_review.csv" ) 

In [3]:
trip.shape

(20491, 2)

In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sent = SentimentIntensityAnalyzer()

In [5]:
score_com = []
score_pos = []
score_neg = []

for i in range (0,trip.shape[0]):
    score = sent.polarity_scores(trip.iloc[i][0]) #this [1] is for the 2nd column(review colomun)
    score1 = score["compound"]
    score_com.append(score1)
    # below 2 lines to extract the positive sentimnet
    score2 = score['pos']
    score_pos.append(score2)

    # below 2 lines to extarct the negative sentimnet
    
    score2 = score["neg"]
    score_neg.append(score2)

In [6]:
np.mean(score_com)

0.8034298862915427

# now we create dataframe

In [7]:
trip['com_score'] = score_com # adding the new column to the datafram
trip['pos_score'] = score_pos
trip['neg_score'] = score_neg

In [8]:
trip.head()

Unnamed: 0,Review,Rating,com_score,pos_score,neg_score
0,nice hotel expensive parking got good deal sta...,4,0.9747,0.285,0.072
1,ok nothing special charge diamond member hilto...,2,0.9787,0.189,0.11
2,nice rooms not 4* experience hotel monaco seat...,3,0.9889,0.219,0.081
3,"unique, great stay, wonderful time hotel monac...",5,0.9912,0.385,0.06
4,"great stay great stay, went seahawk game aweso...",5,0.9797,0.221,0.135


In [9]:
trip['com_score'].mean()

0.8034298862915489

In [10]:
trip['pos_score'].mean()

0.3103757747303704

In [11]:
trip['neg_score'].mean()

0.07190839880923351

# TextBlob

In [12]:
trip = pd.read_csv(r"D:\Python\Python data set lec\NLP\Trip_advisor_review.csv" ) 

In [13]:
from textblob import TextBlob

In [14]:
polarity_score=[]

for i in range(0,trip.shape[0]):
    score=TextBlob(trip.iloc[i][0])
    score1=score.sentiment[0]
    polarity_score.append(score1)

In [15]:
trip =pd.concat([trip,pd.Series(polarity_score)], axis=1)
trip

Unnamed: 0,Review,Rating,0
0,nice hotel expensive parking got good deal sta...,4,0.208744
1,ok nothing special charge diamond member hilto...,2,0.214923
2,nice rooms not 4* experience hotel monaco seat...,3,0.294420
3,"unique, great stay, wonderful time hotel monac...",5,0.504825
4,"great stay great stay, went seahawk game aweso...",5,0.384615
...,...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5,0.221729
20487,great location price view hotel great quick pl...,4,0.503704
20488,"ok just looks nice modern outside, desk staff ...",2,0.171220
20489,hotel theft ruined vacation hotel opened sept ...,1,0.122573


In [16]:
trip.rename(columns={trip.columns[2]: "sentiment_score"}, inplace=True)

In [17]:
trip.head()

Unnamed: 0,Review,Rating,sentiment_score
0,nice hotel expensive parking got good deal sta...,4,0.208744
1,ok nothing special charge diamond member hilto...,2,0.214923
2,nice rooms not 4* experience hotel monaco seat...,3,0.29442
3,"unique, great stay, wonderful time hotel monac...",5,0.504825
4,"great stay great stay, went seahawk game aweso...",5,0.384615
