In [1]:

!python --version

Python 3.10.6


In [1]:

import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
cos_client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='ii38G_dRe9BcY0As7o5vTfGzu-HY5Im0KGl3Y2f3AQKR',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.private.us-south.cloud-object-storage.appdomain.cloud')

bucket = 'hotel-donotdelete-pr-bl5bnbcbo4dnmu'
object_key = 'hotel_reviews.csv'

body = cos_client.get_object(Bucket=bucket,Key=object_key)['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

trip = pd.read_csv(body)
trip.head()


Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [2]:
# Let's create a new data frame
 
trip = trip[(trip['Rating']==5)|(trip['Rating']==2)|(trip['Rating']==1)][['Review','Rating']]

# Lets modify the Rating column
trip['Rating'] = trip['Rating'].apply(lambda rating: 'Pos' if rating==5 else 'Neg')

In [3]:
# reseting the index because after removing some rows, the index gets crowded
trip.reset_index(inplace=True)
trip.head()

Unnamed: 0,index,Review,Rating
0,1,ok nothing special charge diamond member hilto...,Neg
1,3,"unique, great stay, wonderful time hotel monac...",Pos
2,4,"great stay great stay, went seahawk game aweso...",Pos
3,5,love monaco staff husband stayed hotel crazy w...,Pos
4,6,"cozy stay rainy city, husband spent 7 nights m...",Pos


In [4]:
trip['Rating'].value_counts()

Pos    9054
Neg    3214
Name: Rating, dtype: int64

In [5]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m770.5/770.5 kB[0m [31m77.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2022.10.31
Note: you may need to restart the kernel to use updated packages.


In [6]:
#Data cleaning and preprocessing

import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [7]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/wsuser/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/wsuser/nltk_data...


True

In [8]:
# Lemmatization object
ps = WordNetLemmatizer()
corpus = []

In [9]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/wsuser/nltk_data...


True

In [10]:
# Text preprocessing
# keep only text based
# lower all the letters
# split the words
for i in range(0,len(trip)):
    review = re.sub('[^a-zA-Z]'," ",trip['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [12]:
X = cv.fit_transform(corpus).toarray()

In [13]:
X.shape

(12268, 34569)

In [14]:
y = pd.get_dummies(trip['Rating'])
y = y.iloc[:,1].values
y

array([0, 1, 1, ..., 0, 0, 0], dtype=uint8)

In [15]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=3)

Naive Bayes

In [16]:
#Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
model1 = MultinomialNB().fit(X_train,y_train)

# for the accuracy
model1.score(X_test,y_test)

y_pred = model1.predict(X_test)

In [17]:
#compare y test and y_pred
#confusion matrix is a 2x2 matrix and it tells,
#how many number of elements are correctly predicted.

from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test,y_pred)

confusion_m

array([[ 574,   69],
       [  44, 1767]])

In [18]:
#checking accuracy score

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,y_pred)

accuracy

0.9539527302363489

In [19]:
#checking precision score

from sklearn.metrics import precision_score
precision_score(y_test,y_pred)

0.9624183006535948

In [20]:
#checking recall score

from sklearn.metrics import recall_score
recall_score(y_test,y_pred)

0.9757040309221424

In [21]:
#checking f-beta score

from sklearn.metrics import fbeta_score
fbeta_score(y_test,y_pred,beta=1)

0.9690156292843433

In [22]:
# Checking the training model with custom input data
message = "the hotel was nice and comfy"
data = [message]
vect = cv.transform(data).toarray()
my_prediction = model1.predict(vect)
my_prediction_prob = model1.predict_proba(vect)

if my_prediction==1:
    print("Positive")
    print(my_prediction_prob[0][1])
else:
    print("Negative")
    print(my_prediction_prob[0][0])

Positive
0.9689465878406804


In [23]:
message = "the hotel was bad and the staff was rude"
data = [message]
vect = cv.transform(data).toarray()
my_prediction = model1.predict(vect)
my_prediction_prob = model1.predict_proba(vect)

if my_prediction==1:
    print("Positive")
    print(my_prediction_prob[0][1])
else:
    print("Negative")
    print(my_prediction_prob[0][0])

Negative
0.8445454685022524


Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear')

In [25]:
model2 = logreg.fit(X_train, y_train)

In [26]:
y_pred_class = logreg.predict(X_test)

In [27]:
from sklearn import metrics

In [28]:
metrics.accuracy_score(y_test, y_pred)

0.9539527302363489

In [29]:
metrics.precision_score(y_test, y_pred)

0.9624183006535948

In [30]:
metrics.precision_score(y_test, y_pred)

0.9624183006535948

In [31]:
metrics.recall_score(y_test, y_pred)

0.9757040309221424

In [32]:
metrics.fbeta_score(y_test, y_pred, beta=1)

0.9690156292843433

In [33]:
metrics.roc_auc_score(y_test, y_pred)

0.9341972720707136

In [34]:
pip install metrics

Collecting metrics
  Downloading metrics-0.3.3.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting Pygments==2.2.0
  Downloading Pygments-2.2.0-py2.py3-none-any.whl (841 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.7/841.7 kB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pathspec==0.5.5
  Downloading pathspec-0.5.5.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pathlib2>=2.3.0
  Downloading pathlib2-2.3.7.post1-py2.py3-none-any.whl (18 kB)
Building wheels for collected packages: metrics, pathspec
  Building wheel for metrics (setup.py) ... [?25ldone
[?25h  Created wheel for metrics: filename=metrics-0.3.3-py2.py3-none-any.whl size=17808 sha256=37ea96a8bc031064b8a5a8c3b412b57011eaa61f364ca99f528d48b48c553efc
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/29/7a/e7/1175d9ff10607b8f02aa37c32392cb28cdda0aea8fcc2b514b
  Building wheel for pathspec (setup.py) ... [?25ldone
[?25h  Creat

In [35]:
metrics.confusion_matrix(y_test, y_pred)

array([[ 574,   69],
       [  44, 1767]])

In [36]:
message = "cozy stay rainy city"
data = [message]
vect = cv.transform(data).toarray()
my_prediction = model2.predict(vect)
my_prediction_prob = model2.predict_proba(vect)

if my_prediction==1:
    print("Positive")
    print(my_prediction_prob[0][1])
else:
    print("Negative")
    print(my_prediction_prob[0][0])

Positive
0.6713142877649974


In [37]:
message = "the hotel was bad and the staff was rude"
data = [message]
vect = cv.transform(data).toarray()
my_prediction = model2.predict(vect)
my_prediction_prob = model2.predict_proba(vect)

if my_prediction==1:
    print("Positive")
    print(my_prediction_prob[0][1])
else:
    print("Negative")
    print(my_prediction_prob[0][0])

Negative
0.8996652277103434


In [38]:
# Dump the machine learning model outsite so you can use outsite and not retrain again and again
import pickle

#pickle file for logistic regression

filename = 'ibm_logistic_regression_model.pkl'
pickle.dump(model2, open(filename, 'wb'))
pickle.dump(cv, open('ibm_tranform_logistic.pkl', 'wb'))

In [39]:
#pickle file for naive bayes

filename = 'ibm_naive_bayes_model.pkl'
pickle.dump(model1, open(filename, 'wb'))
pickle.dump(cv, open('ibm_transform_naive.pkl', 'wb'))

In [40]:
!sudo apt install python3-pip

/usr/bin/sh: sudo: command not found


In [41]:
!pip install -U ibm-watson-machine-learning



Collecting ibm-watson-machine-learning
  Downloading ibm_watson_machine_learning-1.0.283-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ibm-watson-machine-learning
  Attempting uninstall: ibm-watson-machine-learning
    Found existing installation: ibm-watson-machine-learning 1.0.280
    Uninstalling ibm-watson-machine-learning-1.0.280:
      Successfully uninstalled ibm-watson-machine-learning-1.0.280
Successfully installed ibm-watson-machine-learning-1.0.283


In [69]:
!tar -zcvf hotel_reviews_lr_model.tgz ibm_logistic_regression_model.pkl

ibm_logistic_regression_model.pkl


In [70]:
!tar -zcvf hotel_reviews_nb_model.tgz ibm_naive_bayes_model.pkl

ibm_naive_bayes_model.pkl


In [71]:
!tar -zcvf hotel_reviews_tl_model.tgz ibm_tranform_logistic.pkl

ibm_tranform_logistic.pkl


In [93]:
!tar -zcvf hotel_reviews_tn_model.tgz ibm_transform_naive.pkl

ibm_transform_naive.pkl


In [94]:
ls

hotel_reviews_lr_model.gz   hotel_reviews_tl_model.tgz
hotel_reviews_lr_model.tgz  hotel_reviews_tn_model.tgz
hotel_reviews_model.tgz     hotel.tar.gz
hotel_reviews_model.tn_tgz  ibm_logistic_regression_model.pkl
hotel_reviews_nb_model.gz   ibm_naive_bayes_model.pkl
hotel_reviews_nb_model.tgz  ibm_tranform_logistic.pkl
hotel_reviews_tl_model.gz   ibm_transform_naive.pkl


In [74]:
from ibm_watson_machine_learning import APIClient
wml_credentials={
    "url":"https://us-south.ml.cloud.ibm.com",
    "apikey":"9vcF0gJroT3555oirhShPIKfqOI2EVrEVci-7iBKUvM6"
}

client=APIClient(wml_credentials)

In [75]:
def guid_space_name(client,hotel_deploy):
    space=client.spaces.get_details()
    return(next(item for item in space['resources'] if item['entity']['name']==hotel_deploy)['metadata']['id'])

In [76]:
space_uid=guid_space_name(client,'hotel_deploy')
print("Space UID "+space_uid)

Space UID 76dfba38-5b8a-42cd-9922-c96dacae02ee


In [77]:
client.set.default_space(space_uid)


'SUCCESS'

In [78]:
client.software_specifications.list()


-----------------------------  ------------------------------------  ----  ------------------  -------------------------------
NAME                           ID                                    TYPE  STATE               REPLACEMENT
default_py3.6                  0062b8c9-8b7d-44a0-a9b9-46c416adcbd9  base  unsupported         runtime-22.1-py3.9
kernel-spark3.2-scala2.12      020d69ce-7ac1-5e68-ac1a-31189867356a  base  not_provided
pytorch-onnx_1.3-py3.7-edt     069ea134-3346-5748-b513-49120e15d288  base  not_provided
scikit-learn_0.20-py3.6        09c5a1d0-9c1e-4473-a344-eb7b665ff687  base  unsupported         runtime-22.1-py3.9
spark-mllib_3.0-scala_2.12     09f4cff0-90a7-5899-b9ed-1ef348aebdee  base  unsupported
pytorch-onnx_rt22.1-py3.9      0b848dd4-e681-5599-be41-b5f6fccc6471  base  supported
ai-function_0.1-py3.6          0cdb0f1e-5376-4f4d-92dd-da3b69aa9bda  base  unsupported         runtime-22.1-py3.9
shiny-r3.6                     0e6e79df-875e-4f24-8ae9-62dcc2148306  base  n

In [79]:
software_space_uid=client.software_specifications.get_uid_by_name('tensorflow_rt22.1-py3.9')

In [80]:
software_space_uid

'acd9c798-6974-5d2f-a657-ce06e986df4d'

In [81]:
ls

hotel_reviews_lr_model.tgz  hotel.tar.gz
hotel_reviews_model.tgz     ibm_logistic_regression_model.pkl
hotel_reviews_model.tn_tgz  ibm_naive_bayes_model.pkl
hotel_reviews_nb_model.tgz  ibm_tranform_logistic.pkl
hotel_reviews_tl_model.tgz  ibm_transform_naive.pkl


In [82]:
model_details1=client.repository.store_model(model='hotel_reviews_lr_model.tgz',
                                             meta_props={client.repository.ModelMetaNames.NAME:"lr_model",
                                                         client.repository.ModelMetaNames.TYPE:"scikit-learn_1.0",
                                                         client.repository.ModelMetaNames.SOFTWARE_SPEC_UID:software_spec_uid})
model_id1=client.repository.get_model_id(model_details1)

In [83]:
model_id1

'fc198241-fefc-488e-b08c-fe549a640493'

In [102]:
client.repository.download("fc198241-fefc-488e-b08c-fe549a640493","hotel_reviews_lr_model.tar.gz")


Successfully saved model content to file: 'hotel_reviews_lr_model.tar.gz'


'/home/wsuser/work/hotel_reviews_lr_model.tar.gz'

In [86]:
model_details2=client.repository.store_model(model='hotel_reviews_tl_model.tgz',
                                             meta_props={client.repository.ModelMetaNames.NAME:"tlr_model",
                                                         client.repository.ModelMetaNames.TYPE:"scikit-learn_1.0",
                                                         client.repository.ModelMetaNames.SOFTWARE_SPEC_UID:software_spec_uid})
model_id2=client.repository.get_model_id(model_details2)

In [87]:
model_id2

'5d683f85-fd33-4c0b-b473-c06569923cd5'

In [103]:
client.repository.download("5d683f85-fd33-4c0b-b473-c06569923cd5","hotel_reviews_tl_model.tar.gz")


Successfully saved model content to file: 'hotel_reviews_tl_model.tar.gz'


'/home/wsuser/work/hotel_reviews_tl_model.tar.gz'

In [89]:
model_details3=client.repository.store_model(model='hotel_reviews_nb_model.tgz',
                                             meta_props={client.repository.ModelMetaNames.NAME:"nb_model",
                                                         client.repository.ModelMetaNames.TYPE:"scikit-learn_1.0",
                                                         client.repository.ModelMetaNames.SOFTWARE_SPEC_UID:software_spec_uid})
model_id3=client.repository.get_model_id(model_details3)

In [90]:
model_id3

'0fcbb13f-8561-4c24-97b9-2c5b77fb4689'

In [104]:
client.repository.download("0fcbb13f-8561-4c24-97b9-2c5b77fb4689","hotel_reviews_nb_model1.tar.gz")


Successfully saved model content to file: 'hotel_reviews_nb_model1.tar.gz'


'/home/wsuser/work/hotel_reviews_nb_model1.tar.gz'

In [95]:
ls

hotel_reviews_lr_model.gz   hotel_reviews_tl_model.tgz
hotel_reviews_lr_model.tgz  hotel_reviews_tn_model.tgz
hotel_reviews_model.tgz     hotel.tar.gz
hotel_reviews_model.tn_tgz  ibm_logistic_regression_model.pkl
hotel_reviews_nb_model.gz   ibm_naive_bayes_model.pkl
hotel_reviews_nb_model.tgz  ibm_tranform_logistic.pkl
hotel_reviews_tl_model.gz   ibm_transform_naive.pkl


In [96]:
model_details4=client.repository.store_model(model='hotel_reviews_tn_model.tgz',
                                             meta_props={client.repository.ModelMetaNames.NAME:"tnb_model",
                                                         client.repository.ModelMetaNames.TYPE:"scikit-learn_1.0",
                                                         client.repository.ModelMetaNames.SOFTWARE_SPEC_UID:software_spec_uid})
model_id4=client.repository.get_model_id(model_details4)

In [98]:
model_id4

'f1353c29-24c1-481e-aee1-b9be5a470cea'

In [105]:
client.repository.download("f1353c29-24c1-481e-aee1-b9be5a470cea","hotel_reviews_tn_model.tar.gz")


Successfully saved model content to file: 'hotel_reviews_tn_model.tar.gz'


'/home/wsuser/work/hotel_reviews_tn_model.tar.gz'