In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.0.5-cp37-cp37m-win_amd64.whl (8.7 MB)
Installing collected packages: pandas
Successfully installed pandas-1.0.5


In [3]:
import pandas as pd

In [4]:
df_yelp = pd.read_table('yelp_labelled.txt')
df_imdb = pd.read_table('imdb_labelled.txt')

In [5]:
df_amz = pd.read_table("amazon_cells_labelled.txt")

In [6]:
frames = [df_yelp, df_imdb, df_amz]

In [7]:
df_yelp.columns

Index(['Wow... Loved this place.', '1'], dtype='object')

In [8]:
for colname in frames:
    colname.columns = ['Message', 'Target']

In [9]:
for colname in frames:
    print(colname.columns)

Index(['Message', 'Target'], dtype='object')
Index(['Message', 'Target'], dtype='object')
Index(['Message', 'Target'], dtype='object')


In [10]:
keys = ['Yelp', 'Imdb', 'Amazon']

In [11]:
df = pd.concat(frames, keys=keys)

In [12]:
df.head()

Unnamed: 0,Unnamed: 1,Message,Target
Yelp,0,Crust is not good.,0
Yelp,1,Not tasty and the texture was just nasty.,0
Yelp,2,Stopped by during the late May bank holiday of...,1
Yelp,3,The selection on the menu was great and so wer...,1
Yelp,4,Now I am getting angry and I want my damn pho.,0


In [13]:
df.shape

(2745, 2)

In [14]:
df.to_csv('sentiment.csv')

In [15]:
df.columns

Index(['Message', 'Target'], dtype='object')

In [16]:
df.isnull().sum()

Message    0
Target     0
dtype: int64

In [17]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS 
nlp = spacy.load('en')

In [18]:
stopwords = list(STOP_WORDS)

In [19]:
stopwords

['very',
 'around',
 'someone',
 'below',
 'everyone',
 'then',
 'unless',
 'often',
 'does',
 'front',
 'first',
 'once',
 'from',
 'but',
 'while',
 'may',
 'for',
 'whereas',
 'against',
 'anyway',
 'fifty',
 'give',
 'everything',
 'elsewhere',
 "'ll",
 'of',
 'yet',
 'or',
 'as',
 'before',
 'indeed',
 'which',
 'under',
 'might',
 'himself',
 'ten',
 'whither',
 'are',
 'former',
 'if',
 'the',
 '’m',
 'in',
 'than',
 'noone',
 'your',
 'n‘t',
 'amount',
 'besides',
 'anyone',
 'none',
 'regarding',
 'when',
 "'ve",
 'on',
 'this',
 'latter',
 'always',
 'get',
 'else',
 'whereafter',
 'meanwhile',
 'less',
 'herself',
 '’re',
 'whence',
 'made',
 'amongst',
 'up',
 'both',
 'our',
 'within',
 'had',
 'yourselves',
 'quite',
 'enough',
 'formerly',
 'how',
 'those',
 'he',
 'two',
 '’ve',
 'thence',
 'alone',
 'eight',
 'at',
 'although',
 '’s',
 'back',
 'whether',
 'never',
 'across',
 'thru',
 'more',
 'almost',
 'doing',
 'ourselves',
 'to',
 'being',
 'hence',
 'off',
 'othe

In [20]:
docx = nlp("this is how john walker was walking. He was also running beside the pool")

In [21]:
for word in docx:
    print(word.text, 'lemma =>',word.lemma_)

this lemma => this
is lemma => be
how lemma => how
john lemma => john
walker lemma => walker
was lemma => be
walking lemma => walk
. lemma => .
He lemma => -PRON-
was lemma => be
also lemma => also
running lemma => run
beside lemma => beside
the lemma => the
pool lemma => pool


In [23]:
for word in docx:
    if word.lemma_ != '-PRON-':
        print(word.lemma_.lower().strip())

this
be
how
john
walker
be
walk
.
be
also
run
beside
the
pool


In [25]:
[word.lemma_.lower().strip() if word.lemma_ != '-PRON-' else word.lower_ for word in docx ]

['this',
 'be',
 'how',
 'john',
 'walker',
 'be',
 'walk',
 '.',
 'he',
 'be',
 'also',
 'run',
 'beside',
 'the',
 'pool']

In [26]:
[ word for word in docx if word.is_stop == False and not word.is_punct]

[john, walker, walking, running, pool]

In [27]:
import string
punctuations = string.punctuation

In [28]:
from spacy.lang.en import English
parser = English()

In [47]:
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != '-PRON-' else word.lower_ for word in mytokens]
    mytokens = [  word for word in mytokens if word not in stopwords and word not in punctuations]
    return mytokens

In [48]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline 
from sklearn.svm import LinearSVC

In [49]:
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [50]:
vectorizer = CountVectorizer(tokenizer =spacy_tokenizer, ngram_range=(1,1))
classifier = LinearSVC()

In [51]:
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [52]:
X = df['Message']
ylabels = df['Target']

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size =0.2, random_state=42)

In [55]:
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

In [56]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x000001FAE37BB9E8>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [57]:
sample_pred = pipe.predict(X_test)

In [58]:
for (sample,pred) in zip(X_test,sample_pred):
    print(sample,'Pred',pred)

Great pork sandwich. Pred 1
It is a true classic.   Pred 1
It's close to my house, it's low-key, non-fancy, affordable prices, good food. Pred 0
Audio Quality is poor, very poor. Pred 0
We loved the biscuits!!! Pred 1
I don't have very many words to say about this place, but it does everything pretty well. Pred 0
Was not happy. Pred 1
The headsets are easy to use and everyone loves them. Pred 1
I miss it and wish they had one in Philadelphia! Pred 0
Still it's quite interesting and entertaining to follow.   Pred 1
All three broke within two months of use. Pred 0
Oh yeah, and the storyline was pathetic too.   Pred 0
IT'S REALLY EASY. Pred 1
Every element of this story was so over the top, excessively phony and contrived that it was painful to sit through.   Pred 0
The food was outstanding and the prices were very reasonable. Pred 1
I am so tired of clichés that is just lazy writing, and here they come in thick and fast.   Pred 1
Ordered an appetizer and took 40 minutes and then the pizz

In [59]:
print("Accuracy: ", pipe.score(X_test, y_test))
print("Accuracy: ", pipe.score(X_test, sample_pred))

Accuracy:  0.8069216757741348
Accuracy:  1.0


In [62]:
pipe.predict(['this is am  movie'])

array([1], dtype=int64)