### Import libraries

In [21]:
import pandas as pd
import numpy as np

#### Import YELP Data into a dataframe

In [22]:
data_yelp = pd.read_table('./sentiment labelled sentences//yelp_labelled.txt', header = None)

print(data_yelp.shape)

data_yelp[:3]

(1000, 2)


Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0


#### Import Amazon Data into a DataFrame

In [23]:
data_amazon = pd.read_table('./sentiment labelled sentences/amazon_cells_labelled.txt', header = None)

print(data_amazon.shape)

data_amazon[:3]

(1000, 2)


Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1


#### Combine the datasets

In [24]:
# Stack the 3 dataframes on top of another
reviews = pd.concat([data_amazon, data_yelp], axis = 0, keys = ["Amazon", "yelp"])

# Set column names
reviews.columns = ['review', 'label']

reviews

Unnamed: 0,Unnamed: 1,review,label
Amazon,0,So there is no way for me to plug it in here i...,0
Amazon,1,"Good case, Excellent value.",1
Amazon,2,Great for the jawbone.,1
Amazon,3,Tied to charger for conversations lasting more...,0
Amazon,4,The mic is great.,1
...,...,...,...
yelp,995,I think food should have flavor and texture an...,0
yelp,996,Appetite instantly gone.,0
yelp,997,Overall I was not impressed and would not go b...,0
yelp,998,"The whole experience was underwhelming, and I ...",0


### Perform text preprocessing

### Text Preprocessing

We will implement the following steps in our preprocessing pipeline.

- Tokenisation
- Lemmatization
- Stop Words Removal
- Punctuations Removal
- Vectorisation

#### Stopwords

In [25]:
import nltk

stopwords = nltk.corpus.stopwords.words('english')

#### string punctuation

In [26]:
import string

punctuations = string.punctuation

print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [44]:
# Define a function to remove all stopwords
def remove_stopwords(tokenized_text):    
    text = [word for word in tokenized_text if word not in stopwords]
    return text


In [53]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    #lememtization
    #stemming
    return text

In [43]:
# Define a function to split our sentences into a list of words
import re

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

#### Vectorize the words

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a vectorizer object and pass the preprocess function we created to the tokeniser argument
tfvectorizer = TfidfVectorizer(tokenizer = clean_text)

In [55]:
from sklearn.svm import LinearSVC

# Initialise Model Object
classifier = LinearSVC()


In [56]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( reviews['review'], reviews['label'], 
                                                    test_size = 0.2, random_state = 42)


In [40]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Arunabh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [57]:
from sklearn.pipeline import Pipeline

# Create the  pipeline to clean, tokenize, vectorize, and classify using"Count Vectorizor"
# Multiple models can be added to the Pipeline object to be executed in sequence.
model_pipe = Pipeline( [ ('vectorizer', tfvectorizer), 
                         ('classifier', classifier) ] )

In [58]:
model_pipe.fit(X_train,y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function clean_text at 0x0000022060A78948>)),
                ('classifier', LinearSVC())])

In [59]:
preds = model_pipe.predict(X_test)
preds[:10]

array([1, 0, 1, 0, 0, 1, 1, 1, 0, 0], dtype=int64)

In [60]:

X_test[:10]

yelp    860    This place is pretty good, nice little vibe in...
Amazon  353    Their network coverage in Los Angeles is horri...
yelp    333                                Everything was gross.
Amazon  905    Not nearly as good looking as the AMAZON pictu...
yelp    289    I hate to disagree with my fellow Yelpers, but...
        273    Stopped by this place while in Madison for the...
Amazon  938                 Logitech Bluetooth Headset is a 10!.
yelp    731    Ryan's Bar is definitely one Edinburgh establi...
Amazon  65     The one big drawback of the MP3 player is that...
yelp    323    A couple of months later, I returned and had a...
Name: review, dtype: object

#### compute accuracy

In [52]:
from sklearn.metrics import accuracy_score 

# Accuracy
print("Train Accuracy: ", model_pipe.score(X_train, y_train))

# Accuracy
print("Test Accuracy: ", model_pipe.score(X_test, y_test))

Train Accuracy:  0.998125
Test Accuracy:  0.8425
