In [2]:
# Imports
import pandas as pd

# Load the locally saved, tab-separated file from the link above; the two
# columns are named here rather than read from a header row
df_yelp = pd.read_csv(
    'yelp_labelled.txt',
    sep='\t',
    names=['sentence', 'label'],
)
df_yelp.head()

Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
# Import train-test split
from sklearn.model_selection import train_test_split

# Feature: the raw review sentences. Target: the 0/1 label column.
sentences, y = df_yelp['sentence'], df_yelp['label']

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    random_state=42,
    test_size=0.25,
)

In [4]:
# Import the tf-idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a tf-idf vectorizer over bigrams with English stop words removed,
# and fit it on the training sentences only
vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')
vectorizer.fit(sentences_train)

# Map both splits into the fitted feature space
X_train, X_test = (
    vectorizer.transform(part)
    for part in (sentences_train, sentences_test)
)

# Show the shape / sparsity summary of the training matrix
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3051 stored elements and shape (750, 2864)>

In [5]:
# Import the classifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Create the logistic-regression model and fit it on the tf-idf features
# (fit returns the estimator itself, so the two steps can be chained)
classifier = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Accuracy on the held-out test split
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.588


In [6]:
from sklearn.pipeline import Pipeline

# Chain the vectorizer and the classifier into a single estimator
pipe = Pipeline(steps=[
    ('vect', vectorizer),   # vectorizer
    ('clf', classifier),    # classifier
])

# Parameter space for the grid search: candidate values for the classifier's C
# (C is the inverse of regularization strength, so larger C = weaker regularization)
parameters = {'clf__C': [1, 10, 1000000]}


# Implement a grid search with cross-validation
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(sentences, y);

# Display the best mean cross-validated score
grid_search.best_score_

Fitting 5 folds for each of 3 candidates, totalling 15 fits


np.float64(0.611)

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression

# Read in the locally saved file from the link above
# (tab-separated; column names are supplied here rather than read from the file)
df_yelp = pd.read_csv('yelp_labelled.txt', names=['sentence', 'label'], sep='\t')

# Create the features and target
sentences = df_yelp['sentence']
y = df_yelp['label']

# Instantiate the tf-idf vectorizer (bigrams only, English stop words removed)
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(2, 2))

# Instantiate the classifier (defaults)
classifier = LogisticRegression(solver='lbfgs')

# Instantiate the LSA (SVD) algorithm (defaults, apart from the seed).
# The seed is fixed so repeated runs of the notebook give the same
# decomposition and therefore the same scores; 42 matches the seed used for
# the train-test split.
svd = TruncatedSVD(random_state=42)

In [8]:
# Create the pipelines
from sklearn.pipeline import Pipeline

# LSA part: tf-idf features followed by the SVD step
lsa = Pipeline(steps=[('vect', vectorizer), ('svd', svd)])

# Full model: the LSA sub-pipeline feeding the classifier
pipe = Pipeline(steps=[('lsa', lsa), ('clf', classifier)])

# Parameter space for the grid search; parameters of nested steps are
# addressed as <outer step>__<inner step>__<parameter>
parameters = dict(
    lsa__svd__n_components=(100, 250),
    lsa__vect__max_df=(0.9, 1.0),  # max document frequency
)

# Implement a grid search with cross-validation
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(sentences, y);

# Display the best score from the grid-search
grid_search.best_score_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


np.float64(0.593)

In [10]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 400.7/400.7 MB 1.6 MB/s eta 0:00:00
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [11]:
# Bring in spaCy, then load its large pretrained English model
# (the one that includes word embeddings)
import spacy

nlp = spacy.load(name="en_core_web_lg")

In [12]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split

# Read in the locally saved file from UCI website
# (tab-separated; column names are supplied here rather than read from the file)
df_yelp = pd.read_csv('yelp_labelled.txt', names=['sentence', 'label'], sep='\t')

# Create the features and target
sentences = df_yelp['sentence']
y = df_yelp['label']

# Train-test split: 25% of the rows held out, fixed seed for a repeatable split
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=42)

# Function to return the vector for each sentence in a document
def get_word_vectors(docs):
    """Return one spaCy document vector per text in ``docs``.

    Parameters
    ----------
    docs : iterable of str
        The texts to embed (the notebook passes a pandas Series of sentences).

    Returns
    -------
    list
        The ``.vector`` attribute of the spaCy ``Doc`` built from each text,
        in the same order as the input.
    """
    # nlp.pipe processes the texts as a stream in batches, rather than
    # invoking the whole pipeline separately for every string; it yields the
    # resulting Doc objects in input order.
    return [doc.vector for doc in nlp.pipe(docs)]

# Embed every sentence of both splits as a single vector
# (mean of all the word vectors)
X_train, X_test = (
    get_word_vectors(part)
    for part in (sentences_train, sentences_test)
)

from sklearn.linear_model import LogisticRegression

# Create the classifier (defaults) and fit it on the sentence vectors;
# fit returns the estimator itself, so the two steps can be chained
classifier = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Accuracy on the held-out test sentences
score = classifier.score(X_test, y_test)

# Report the accuracy score
print("Accuracy including word embeddings: ", score)

Accuracy including word embeddings:  0.856
