In [1]:
# Show the kernel's working directory: relative paths used below
# (such as the data file) are resolved against it.
import os

cwd = os.getcwd()
cwd

'/Users/saimouzhang/Documents/GitHub'

In [2]:
# Dataset source: https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt
import numpy as np
import pandas as pd

# The file is tab-separated, so the delimiter has to be given explicitly.
df = pd.read_csv('amazonreviews.tsv', delimiter='\t')

# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [3]:
# head() truncates long strings, so print the full text of the first review
# (row label 0, column 'review') to see what a complete sample looks like.
print(df.loc[0, 'review'])

Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^


In [4]:
# One row per review, so the number of rows is the total number of reviews.
len(df.index)

10000

In [5]:
# Count the missing (NaN/None) entries in each column.
# Any non-zero count means those rows must be dropped before modelling.
df.isna().sum()

label     0
review    0
dtype: int64

In [6]:
# Luckily, there are no missing values for either label or review.

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Features: the raw review text.
X = df.loc[:, 'review']

In [9]:
# Target: the sentiment label attached to each review.
y = df.loc[:, 'label']

In [10]:
# Hold out 30% of the reviews for evaluation and train on the remaining 70%.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [12]:
# Chain the two stages into a single estimator that accepts raw text:
#   'tfidf' - vectorize each review with TfidfVectorizer
#   'clf'   - classify the resulting vectors with LinearSVC
# Both stages use their default settings.
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [13]:
# Fit the whole pipeline on the training split only; the held-out
# reviews are not passed to fit at any point.
text_clf.fit(X=X_train, y=y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [14]:
# Predict a label for every held-out review. These predictions are
# compared against the true labels (y_test) in the evaluation cells below.
predictions = text_clf.predict(X_test)

In [15]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
        

In [16]:
# Rows are the true labels, columns are the predicted labels;
# with no explicit `labels` argument the classes appear in sorted order.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[1353  165]
 [ 210 1272]]


In [17]:
# Per-class precision, recall and F1, plus the overall averages.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         neg       0.87      0.89      0.88      1518
         pos       0.89      0.86      0.87      1482

    accuracy                           0.88      3000
   macro avg       0.88      0.87      0.87      3000
weighted avg       0.88      0.88      0.87      3000



In [18]:
# Overall fraction of held-out reviews whose label was predicted correctly.
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.875
