In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('alexa.csv',sep="\t")
df.head(5)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


# PREPROCESS THE DATA

In [3]:
df.isnull().sum()

rating              0
date                0
variation           0
verified_reviews    1
feedback            0
dtype: int64

In [4]:
df.shape

(3150, 5)

In [5]:
df.describe()

Unnamed: 0,rating,feedback
count,3150.0,3150.0
mean,4.463175,0.918413
std,1.068506,0.273778
min,1.0,0.0
25%,4.0,1.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,1.0


In [6]:
df = df.dropna()

In [7]:
df.isnull().sum()

rating              0
date                0
variation           0
verified_reviews    0
feedback            0
dtype: int64

In [8]:
df['verified_reviews'].head(10)

0                                        Love my Echo!
1                                            Loved it!
2    Sometimes while playing a game, you can answer...
3    I have had a lot of fun with this thing. My 4 ...
4                                                Music
5    I received the echo as a gift. I needed anothe...
6    Without having a cellphone, I cannot use many ...
7    I think this is the 5th one I've purchased. I'...
8                                          looks great
9    Love it! I’ve listened to songs I haven’t hear...
Name: verified_reviews, dtype: object

In [9]:
data = df[["rating","verified_reviews"]]

In [10]:
data.head()

Unnamed: 0,rating,verified_reviews
0,5,Love my Echo!
1,5,Loved it!
2,4,"Sometimes while playing a game, you can answer..."
3,5,I have had a lot of fun with this thing. My 4 ...
4,5,Music


In [11]:
def compute_sentiments(labels):
    sentiment = []
    for label in labels:
        if label > 3.0:
            sentiments = 1
        elif label <=3.0:
            sentiments = 0
        sentiment.append(sentiments)
    return sentiment

In [12]:
data['sentiments'] = compute_sentiments(data.rating)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiments'] = compute_sentiments(data.rating)


Unnamed: 0,rating,verified_reviews,sentiments
0,5,Love my Echo!,1
1,5,Loved it!,1
2,4,"Sometimes while playing a game, you can answer...",1
3,5,I have had a lot of fun with this thing. My 4 ...,1
4,5,Music,1


In [13]:
data1 = data[['verified_reviews','sentiments']]
data1.head()

Unnamed: 0,verified_reviews,sentiments
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [14]:
data1['sentiments'].unique()

array([1, 0], dtype=int64)

# data tranformation

In [15]:
x = data1['verified_reviews']
y = data1['sentiments']

In [16]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def preprocess_data(sentence):
    def remove_tag(text):
        """Remove all HTML tags"""
        soup = BeautifulSoup(text, "html.parser")
        cleaned = soup.get_text(separator=" ")
        return cleaned

    # Convert to lowercase
    tokens = sentence.lower()

    # Remove HTML tags
    tokens = remove_tag(tokens)

    # Remove punctuations and numbers
    tokens = re.sub('[^a-zA-Z]', ' ', tokens)

    # Remove single characters
    tokens = re.sub(r'\b\w\b', '', tokens)

    # Remove multiple spaces
    tokens = re.sub(r'\s+', ' ', tokens)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens.split() if word not in stop_words]

    return tokens

In [17]:
sentence = "This is an example sentence for preprocessing."
processed_sentence = preprocess_data(sentence)
print(processed_sentence)

['example', 'sentence', 'preprocessing']


# train the data

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
tfidf = TfidfVectorizer(tokenizer=preprocess_data)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=data1.sentiments,random_state=0)

In [22]:
x_train.shape

(2519,)

In [23]:
x_test.shape

(630,)

# model training

In [24]:
from sklearn.svm import LinearSVC

In [25]:
classifier = LinearSVC()

In [26]:
from sklearn.pipeline import Pipeline

In [27]:
pipeline = Pipeline([('tfidf',tfidf), ('clf',classifier)])

In [28]:
pipeline.fit(x_train,y_train)

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


# checking model performance

In [29]:
y_pred = pipeline.predict(x_test)

  soup = BeautifulSoup(text, "html.parser")


In [30]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [31]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.76      0.57      0.65        82
           1       0.94      0.97      0.96       548

    accuracy                           0.92       630
   macro avg       0.85      0.77      0.80       630
weighted avg       0.91      0.92      0.92       630



In [32]:
confusion_matrix(y_test,y_pred)

array([[ 47,  35],
       [ 15, 533]], dtype=int64)

In [33]:
print("accuracy in % =" ,round(accuracy_score(y_test,y_pred)*100,2))

accuracy in % = 92.06


In [34]:
import joblib

model = pipeline
filename = 'pipeline_model.pkl'

joblib.dump(model, filename)

['pipeline_model.pkl']

In [35]:
import pickle

model = pipeline  # Your trained pipeline model
filename = 'pip_model.pkl'  # Choose a filename for your saved model

# Save the model as a pickle file
with open(filename, 'wb') as file:
    pickle.dump(model, file)

# predict sentiments using model

In [36]:
prediction = pipeline.predict(["this is good"])
if prediction ==1:
    print("result: positive ")
else:
    print("result: negative")

result: positive 


In [37]:
prediction = pipeline.predict(["this is bad"])
if prediction ==1:
    print("result: positive ")
else:
    print("result: negative")

result: negative


In [42]:
new_review = []
pred_sentiment = []

while True:
    review = input("Please type an Alexa review (Type 'skip' to exit) - ")
    if review == 'skip':
        print("See you soon!")
        break
    else:
        prediction = pipeline.predict([review])
        if prediction == 1:
            result = 'Positive'
            print("Result: This review is positive\n")
            new_review.append(review)
            pred_sentiment.append(result)
        else:
            result = 'Negative'
            print("Result: This review is negative\n")
            new_review.append(review)
            pred_sentiment.append(result)

Please type an Alexa review (Type 'skip' to exit) - alexa is good
Result: This review is positive

Please type an Alexa review (Type 'skip' to exit) - alexa is bad
Result: This review is negative

Please type an Alexa review (Type 'skip' to exit) - alexa is positive
Result: This review is positive

Please type an Alexa review (Type 'skip' to exit) - alexa is negative
Result: This review is positive

Please type an Alexa review (Type 'skip' to exit) - skip
See you soon!


In [46]:
summary = pd.DataFrame({'review' : new_review ,'result' : pred_sentiment})

In [47]:
summary.head()

Unnamed: 0,review,result
0,alexa is good,Positive
1,alexa is bad,Negative
2,alexa is positive,Positive
3,alexa is negative,Positive


In [48]:
summary.to_csv("predicted_sentiments" , sep = "\t"  ,encoding = "UTF-8" ,index=False)

In [49]:
summary

Unnamed: 0,review,result
0,alexa is good,Positive
1,alexa is bad,Negative
2,alexa is positive,Positive
3,alexa is negative,Positive
