# Load datasets

In [None]:
!wget "https://raw.githubusercontent.com/vfp1/bts-dsf-2020/main/data/imdb_labelled.txt"
!wget "https://raw.githubusercontent.com/vfp1/bts-dsf-2020/main/data/amazon_cells_labelled.txt"
!wget "https://raw.githubusercontent.com/vfp1/bts-dsf-2020/main/data/yelp_labelled.txt"

In [1]:
import pandas as pd
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
df_yelp = pd.read_table('Data/yelp_labelled.txt')
df_imdb = pd.read_table('Data/imdb_labelled.txt')
df_amz = pd.read_table('Data/amazon_cells_labelled.txt')
# Concatenate our Datasets
frames = [df_yelp,df_imdb,df_amz]

In [3]:
print(type(frames))
print(type(frames[0]))

<class 'list'>
<class 'pandas.core.frame.DataFrame'>


In [4]:
for column in frames: 
    column.columns = ["Message","Target"]

In [None]:
df_yelp.Message

In [None]:
frames[0]

In [5]:
# Assign a Key to Make it Easier
keys = ['Yelp','IMDB','Amazon']
# Merge or Concat our Datasets
df = pd.concat(frames,keys=keys)
df.head()

Unnamed: 0,Unnamed: 1,Message,Target
Yelp,0,Crust is not good.,0
Yelp,1,Not tasty and the texture was just nasty.,0
Yelp,2,Stopped by during the late May bank holiday of...,1
Yelp,3,The selection on the menu was great and so wer...,1
Yelp,4,Now I am getting angry and I want my damn pho.,0


# Cleaning dataset

In [None]:
#!python -m spacy download en_core_web_sm

In [6]:
#Loading English tokenizer
parser = English()
nlp = spacy.load("en_core_web_sm")

In [39]:
#df[:1]['Message'].values[0]
#sent = df.loc['Yelp','Message'][0]
sent = 'My name is Roberto and I love school.'

In [50]:
#mytokens = parser(sent)
mytokens = nlp(sent)
#mytokens = parser(df.Message)

In [41]:
type(mytokens)
mytokens

My name is Roberto and I love school.

In [32]:
#Remove stop words
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [9]:
len(stop_words)
stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [42]:
for word in mytokens:
    print(word.lemma_.lower().strip())

my
name
be
roberto
and
i
love
school
.


In [51]:
#We convert to lower cases since stop words are so.
mytokens = [word.lower_ for word in mytokens]
mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]
mytokens

['roberto', 'love', 'school']

In [52]:
def lemmatize(doc):
    return [token.lemma_ for token in doc if not 
           (token.is_punct or token.is_space or token.lower_ in STOP_WORDS)]

In [54]:
sent = 'My name is Roberto and I love school.'
mytokens = nlp(sent)
lemmatize(mytokens)

['Roberto', 'love', 'school']

In [62]:
for token in mytokens:
    print(token.is_punct, token.is_space,token.lower_ in STOP_WORDS, token.lower_ ,token.lemma_)

False False True my my
False False True name name
False False True is be
False False False roberto Roberto
False False True and and
False False True i I
False False False love love
False False False school school
True False False . .


In [48]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)
    print(mytokens)

    # Lemmatizing each token and converting each token into lowercase
    #mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    #print(mytokens)
    
    # Removing stop words and punctuations
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [None]:
punctuations

In [None]:
spacy_tokenizer(sent)

In [None]:
from sklearn.base import TransformerMixin 

# This function will clean the text
def clean_text(text):     
    return text.strip().lower()
    
#Custom transformer using Python standard library (you could use spacy as well)
class predictors(TransformerMixin):

    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# We create our bag of words (bow) using our tokenizer and defining an ngram range
bow = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,2)) 
# Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features and Labels
X = df['Message']
ylabels = df['Target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3, random_state=42)