# Cleaning the dataset even further

In [None]:
import functools
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords  # Import the stopword list
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

%matplotlib inline

In [None]:
# Import train_test_split.
from sklearn.model_selection import train_test_split

# Split `train` into training and evaluation portions. A quarter of the
# rows are held out (test_size = 0.25) and the fixed random_state makes
# the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    train[['id', 'review']],   # features: review id and raw review text
    train['sentiment'],        # target column
    test_size=0.25,
    random_state=42,
)

In [None]:
from bs4 import BeautifulSoup

# Pick the example review by position rather than by label. X_train is a
# shuffled subset of `train`, so its row labels are no longer contiguous
# and a label lookup such as X_train['review'][2] raises KeyError whenever
# the row labelled 2 ended up in the test split.
# (assumes X_train is a pandas DataFrame with at least three rows)
raw_example = X_train['review'].iloc[2]

# Parse the review with an explicitly named parser so the result does not
# depend on which optional parsers happen to be installed, and so that
# BeautifulSoup does not warn about having to guess one.
example1 = BeautifulSoup(raw_example, "html.parser")

# Print the raw review and then the output of get_text(), for
# comparison.
print(raw_example)
print()
print(example1.get_text())

In [None]:
# Regex find-and-replace on the tag-free review text: every character
# that is not an ASCII letter becomes a single space. Inside the square
# brackets a leading "^" negates the character class ("anything but").
letters_only = re.sub(
    pattern="[^a-zA-Z]",
    repl=" ",
    string=example1.get_text(),
)

In [None]:
# Fold everything to lower case so that differently-cased spellings of
# the same word end up as the same token.
lower_case = letters_only.lower()

# Tokenize: with sep=None, str.split() breaks the text on runs of
# whitespace and never yields empty strings.
words = lower_case.split(sep=None)

In [None]:
# Matches any single character that is not an ASCII letter. Compiled once
# at definition time because review_to_words() runs once per review.
_NON_LETTERS = re.compile("[^a-zA-Z]")


@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loaded only once."""
    # Set membership tests are much faster than scanning a list, and the
    # cache keeps us from re-reading the corpus for every single review.
    return frozenset(stopwords.words('english'))


def review_to_words(raw_review):
    """Convert a raw movie review into a cleaned, space-separated string.

    The review is stripped of HTML markup, every non-letter character is
    replaced by a space, the text is lower-cased and tokenized on
    whitespace, and English stopwords are dropped.

    Args:
        raw_review: A single raw review as a string (may contain HTML).

    Returns:
        A single string of the remaining words joined by single spaces
        (empty if no words survive the cleaning).
    """
    # 1. Remove HTML. The parser is named explicitly so the output does not
    #    vary with whichever optional parsers are installed and so that
    #    BeautifulSoup does not warn about guessing one.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace non-letters with spaces.
    letters_only = _NON_LETTERS.sub(" ", review_text)

    # 3. Convert to lower case and split into individual words.
    words = letters_only.lower().split()

    # 4. Drop stopwords (the stopword set is built once and cached).
    stops = _english_stopwords()
    meaningful_words = [w for w in words if w not in stops]

    # 5. Join the words back into one string separated by spaces.
    return " ".join(meaningful_words)

In [None]:
# Initialize empty lists to hold the cleaned reviews.
clean_train_reviews = []
clean_test_reviews = []

# Total number of reviews to process across both splits; used only for
# the progress messages below.
total_reviews = len(X_train) + len(X_test)

# Running count of reviews processed so far. It deliberately keeps
# counting through the testing set so progress is reported out of
# total_reviews.
j = 0

print("Cleaning and parsing the training set movie reviews...")

# For every review in our training set...
for train_review in X_train['review']:

    # Convert review to words, then append to clean_train_reviews.
    clean_train_reviews.append(review_to_words(train_review))
    j += 1

    # Report progress after every 1000th review.
    if j % 1000 == 0:
        print(f'Review {j} of {total_reviews}.')

# Let's do the same for our testing set.
print("Cleaning and parsing the testing set movie reviews...")

# For every review in our testing set...
for test_review in X_test['review']:

    # Convert review to words, then append to clean_test_reviews.
    clean_test_reviews.append(review_to_words(test_review))
    j += 1

    # Report progress after every 1000th review.
    if j % 1000 == 0:
        print(f'Review {j} of {total_reviews}.')

In [None]:
# Import CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# Set up sklearn's bag-of-words tool. The reviews are split into word
# tokens using the vectorizer's built-in preprocessing and tokenization
# (no custom callables are supplied), no stop-word list is applied here,
# and the vocabulary is capped at 5000 terms.
vectorizer = CountVectorizer(
    max_features=5000,
    analyzer="word",
    stop_words=None,
    tokenizer=None,
    preprocessor=None,
)

In [None]:
# Learn the vocabulary from the cleaned training reviews and, in the same
# call, turn those reviews into feature vectors. fit_transform() expects
# a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# The testing reviews are only transformed, so they are encoded with the
# vocabulary learned from the training reviews above.
test_data_features = vectorizer.transform(clean_test_reviews)


In [None]:
# Create a logistic regression model with the liblinear solver (the
# original note weighs lbfgs vs. liblinear) and train it on the
# bag-of-words training features; fit() returns the fitted estimator.
lr = LogisticRegression(solver='liblinear').fit(train_data_features, y_train)

# Score the model on the same data it was trained on.
lr.score(train_data_features, y_train)