## Objective 01 - extract text features and use them in classification pipelines



In [1]:
# import 
import pandas as pd

# read te locally saved file from the link above
df_yelp = pd.read_csv('yelp_labelled.txt', names=['sentence', 'label'], sep='\t')
df_yelp.head()

Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [2]:
# import train-test split
from sklearn.model_selection import train_test_split

# create the function and target varilable
sentences = df_yelp['sentence']
y = df_yelp['label']

# train test split
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=42
)

In [3]:
sentences_train.shape

(750,)

In [5]:
# import the  tf-idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# instantiate and fit the tf-idf vectorizer
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(2,2))
vectorizer.fit(sentences_train)

# vectorize the training and testing data
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

# display the properties of the vectorized text
X_train

<750x2864 sparse matrix of type '<class 'numpy.float64'>'
	with 3051 stored elements in Compressed Sparse Row format>

In [8]:
# import the classifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# instantiate and fit a model
classifier = LogisticRegression(solver='lbfgs')

classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print('Accuracy: ', score)

Accuracy:  0.588


In [9]:
from sklearn.pipeline import Pipeline

# define the pipeline
pipe = Pipeline([('vect', vectorizer), # Vectorizer
                 ('clf', classifier)   # classifier
                 ])

# define the parameter space for the grid search
parameters = {'clf__C': [1, 10, 1000000]} # C: regularization strength

# implement a grid search with cross-validation
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(sentences, y);

# print out the best score
grid_search.best_score_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:    2.4s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    2.4s finished


0.611

## Objective 02 - apply latent semantic indexing (LSA) to a document classification problem

In [15]:
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression

# read teh locally saved file

df_yelp = pd.read_csv('yelp_labelled.txt', names=['sentence', 'label'], sep='\t')
df_yelp.head()

# create the features and target
sentences = df_yelp['sentence']
y = df_yelp['label']

# instantiate the tf-idf vectorizer
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(2,2))

# instantiate the classifier (defaults)
classifier = LogisticRegression(solver='lbfgs')

# Instantiate the LSA (SVD) algorithm (defaults)
svd = TruncatedSVD()


In [20]:
# create the pipeline
from sklearn.pipeline import Pipeline

# LSA Part
lsa = Pipeline([('vect', vectorizer),
                ('svd', svd)])

# combine into one pipeline
pipe = Pipeline([('lsa', lsa),
                 ('clf', classifier)])

# define the parameter space for the grid search
parameters = {
    'lsa__vect__max_df': (0.9,1.0), # max document frequency
    'lsa__svd__n_components': (100,250),
}

# implement a grid search with cross-validation
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(sentences, y)

# Display the best score from the grid-search
grid_search.best_score_

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  20 | elapsed:    2.4s remaining:   21.7s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    3.7s finished


0.5959999999999999

## Objective 03 - benchmark and compare various vectorization methods in document classification tasks

In [1]:
# import spaCy and the large pretrained model(includes word embedding)
import spacy
nlp = spacy.load('en_core_web_md')

In [2]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split

# read the file locally saved
df_yelp = pd.read_csv('yelp_labelled.txt', names=['sentence', 'label'], sep='\n')
df_yelp.head()

Unnamed: 0,sentence,label
0,Wow... Loved this place.\t1,
1,Crust is not good.\t0,
2,Not tasty and the texture was just nasty.\t0,
3,Stopped by during the late May bank holiday of...,
4,The selection on the menu was great and so wer...,


In [4]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split

# Read in the locally saved file from UCI website
df_yelp = pd.read_csv('yelp_labelled.txt', names=['sentence', 'label'], sep='\t')
df_yelp.head()

# Create the features and target
sentences = df_yelp['sentence']
y = df_yelp['label']

# Train-test split
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=42)

# Function to return the vector for each sentence in a document
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

# Get the vectors for each sentence (mean of all the word vectors)
X_train = get_word_vectors(sentences_train)
X_test = get_word_vectors(sentences_test)

from sklearn.linear_model import LogisticRegression

# Instantiate the classifier (defaults)
classifier = LogisticRegression(solver='lbfgs')

# Fit the model
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

# Print out the accuracy score
print("Accuracy including word embeddings: ", score)

Accuracy including word embeddings:  0.852
