In [24]:
# Import pandas and numpy
import pandas as pd
import numpy as np
# Import the required dependencies from sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics

# Show up to 200 characters per column so more of each review's text is visible.
# The fully qualified option name is used: pandas resolves abbreviated names by
# pattern matching, which can become ambiguous if similarly named options exist.
pd.set_option('display.max_colwidth', 200)

In [25]:
# Read the movie reviews from the CSV file into a DataFrame.
reviews_csv = 'Resources/imdb_reviews.csv'
imdb_reviews_df = pd.read_csv(reviews_csv)
# Preview the top five rows of the dataset.
imdb_reviews_df.head(5)

Unnamed: 0,label,review
0,neg,"A very, very, very slow-moving, aimless movie about a distressed, drifting young man."
1,neg,"Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out."
2,neg,"Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent."
3,neg,Very little music or anything to speak of.
4,pos,The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.


In [26]:
# Check for missing values: `info()` prints each column's non-null count and
# dtype alongside the total number of entries.
imdb_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   748 non-null    object
 1   review  748 non-null    object
dtypes: object(2)
memory usage: 11.8+ KB


In [27]:
# Display one complete review (row label 2) to see what the raw text looks like.
imdb_reviews_df.loc[2, 'review']

'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  '

In [28]:
# Count how many reviews carry each value ("pos" / "neg") in the "label" column.
sentiment_labels = imdb_reviews_df['label']
sentiment_labels.value_counts()

label
pos    386
neg    362
Name: count, dtype: int64

## Split the data into training & testing data sets.

In [29]:
# Features are the "review" column; the target is the "label" column.
X, y = imdb_reviews_df['review'], imdb_reviews_df['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [31]:
# Build a pipeline using `TfidfVectorizer()` without `stop_words='english'`, and `LinearSVC()`.
# `random_state` is pinned (matching the seed used for the train/test split) so that
# refitting yields the same classifier; the solver may otherwise shuffle the data
# differently from run to run.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])

# Fit the pipeline on the raw training reviews; the 'tfidf' step vectorizes the
# text before it reaches the classifier.
text_clf.fit(X_train, y_train)



In [32]:
# Compare accuracy on the training split with accuracy on the held-out test split.
train_accuracy = text_clf.score(X_train, y_train)
test_accuracy = text_clf.score(X_test, y_test)
print('Train Accuracy: %.3f' % train_accuracy)
print('Test Accuracy: %.3f' % test_accuracy)

Train Accuracy: 0.998
Test Accuracy: 0.742


## Run predictions and analyze the results.

In [33]:
# Predict a label for every test review, then show only the first 30 predictions.
test_predictions = text_clf.predict(X_test)
first_30_predictions = test_predictions[:30]
print(first_30_predictions)

['pos' 'pos' 'pos' 'pos' 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'pos' 'pos'
 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'neg'
 'neg' 'neg' 'pos' 'neg' 'neg' 'neg']


In [34]:
# Confusion matrix of the true test labels against the model's predictions.
baseline_confusion = metrics.confusion_matrix(y_test, test_predictions)
print(baseline_confusion)

# Classification report for the same labels and predictions.
baseline_report = metrics.classification_report(y_test, test_predictions)
print(baseline_report)

# Overall accuracy on the test split.
baseline_accuracy = metrics.accuracy_score(y_test, test_predictions)
print(baseline_accuracy)

[[72 39]
 [19 95]]
              precision    recall  f1-score   support

         neg       0.79      0.65      0.71       111
         pos       0.71      0.83      0.77       114

    accuracy                           0.74       225
   macro avg       0.75      0.74      0.74       225
weighted avg       0.75      0.74      0.74       225

0.7422222222222222


### Feed a review into the model's `predict()` method

In [35]:
# A sample movie review stored as a multi-line string; it is passed to each
# fitted pipeline's `predict()` to see how a single new review is classified.
barbie_review = """I was curious to see how they would evolve the "stereotypical Barbie" into something more. 
But the messaging in this movie was so heavy handed that it completely lost the plot. 
I consider myself a proponent of gender equality, and this ain't the way to get it."""

In [36]:
# Classify the sample review with the first pipeline; the single string is
# wrapped in a list before being handed to `predict()`.
barbie_prediction = text_clf.predict([barbie_review])
print(barbie_prediction)

['pos']


## Repeat the analysis with the `english` stopwords. 

Now let's repeat the process above and see if the removal of stopwords improves or impairs our score.

In [37]:
# Build a pipeline using `TfidfVectorizer()` with `stop_words='english'`, and `LinearSVC()`.
# `random_state` is pinned (matching the seed used for the train/test split) so that
# refitting yields the same classifier; the solver may otherwise shuffle the data
# differently from run to run.
text_clf_2 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LinearSVC(random_state=42)),
])

# Fit the pipeline on the raw training reviews.
text_clf_2.fit(X_train, y_train)



In [38]:
# Compare training and test accuracy for the pipeline that filters English stopwords.
train_accuracy_2 = text_clf_2.score(X_train, y_train)
test_accuracy_2 = text_clf_2.score(X_test, y_test)
print('Train Accuracy: %.3f' % train_accuracy_2)
print('Test Accuracy: %.3f' % test_accuracy_2)

Train Accuracy: 0.990
Test Accuracy: 0.756


In [39]:
# Predict a label for every test review, then show only the first 30 predictions.
test_predictions_2 = text_clf_2.predict(X_test)
first_30_predictions_2 = test_predictions_2[:30]
print(first_30_predictions_2)

['pos' 'neg' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'pos' 'pos'
 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'neg'
 'pos' 'neg' 'pos' 'neg' 'neg' 'neg']


In [40]:
# Confusion matrix of the true test labels against the second model's predictions.
stopword_confusion = metrics.confusion_matrix(y_test, test_predictions_2)
print(stopword_confusion)

# Classification report for the same labels and predictions.
stopword_report = metrics.classification_report(y_test, test_predictions_2)
print(stopword_report)

# Overall accuracy on the test split.
stopword_accuracy = metrics.accuracy_score(y_test, test_predictions_2)
print(stopword_accuracy)


# For comparison, the confusion matrix printed for the first model
# (no stopword filtering):
# [[72 39]
#  [19 95]]

[[75 36]
 [19 95]]
              precision    recall  f1-score   support

         neg       0.80      0.68      0.73       111
         pos       0.73      0.83      0.78       114

    accuracy                           0.76       225
   macro avg       0.76      0.75      0.75       225
weighted avg       0.76      0.76      0.75       225

0.7555555555555555


Our score didn't change much: test accuracy went from 74.2% without filtering stopwords to 75.6% after adding a stopword filter to our pipeline. Keep in mind that 748 movie reviews is a relatively small dataset. The real gain from stripping stopwords is improved processing speed; depending on the size of the corpus, it might save hours.

### Feed the previous review into the model's `predict()` method.

In [41]:
# Classify the same sample review with the English-stopword pipeline.
barbie_prediction_2 = text_clf_2.predict([barbie_review])
print(barbie_prediction_2)

['pos']


**Question:** Did the predicted classification of the review change?

**Answer:** No.

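**Question:** If so, why do you think it changed?

**Answer:** Not applicable — the review was classified as `pos` both with and without the `english` stopword filter.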

## Repeat the analysis using the following custom stopwords. 

In [42]:
# Custom stopword list (alphabetical, one row per initial letter) to hand to
# `TfidfVectorizer(stop_words=...)` in place of the built-in 'english' list.
custom_stopwords = [
    'a', 'about', 'an', 'and', 'are', 'as', 'at',
    'be', 'been', 'but', 'by',
    'can',
    'even', 'ever',
    'for', 'from',
    'get',
    'had', 'has', 'have', 'he', 'her', 'hers', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'it', 'its',
    'just',
    'me', 'my',
    'of', 'on', 'or',
    'see', 'seen', 'she', 'so',
    'than', 'that', 'the', 'their', 'there', 'they', 'this', 'to',
    'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with',
    'you',
]

In [43]:
# Build a pipeline using `TfidfVectorizer()` with `stop_words=custom_stopwords`, and `LinearSVC()`.
# `random_state` is pinned (matching the seed used for the train/test split) so that
# refitting yields the same classifier; the solver may otherwise shuffle the data
# differently from run to run.
text_clf_3 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=custom_stopwords)),
    ('clf', LinearSVC(random_state=42)),
])

# Fit the pipeline on the raw training reviews.
text_clf_3.fit(X_train, y_train)



In [44]:
# Compare training and test accuracy for the pipeline that uses the custom stopwords.
train_accuracy_3 = text_clf_3.score(X_train, y_train)
test_accuracy_3 = text_clf_3.score(X_test, y_test)
print('Train Accuracy: %.3f' % train_accuracy_3)
print('Test Accuracy: %.3f' % test_accuracy_3)

Train Accuracy: 0.998
Test Accuracy: 0.778


In [45]:
# Predict a label for every test review, then show only the first 30 predictions.
test_predictions_3 = text_clf_3.predict(X_test)
first_30_predictions_3 = test_predictions_3[:30]
print(first_30_predictions_3)

['pos' 'neg' 'neg' 'pos' 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'pos' 'pos'
 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'neg'
 'neg' 'pos' 'pos' 'neg' 'neg' 'neg']


In [46]:
# Confusion matrix of the true test labels against the third model's predictions.
custom_confusion = metrics.confusion_matrix(y_test, test_predictions_3)
print(custom_confusion)

# Classification report for the same labels and predictions.
custom_report = metrics.classification_report(y_test, test_predictions_3)
print(custom_report)

# Overall accuracy on the test split.
custom_accuracy = metrics.accuracy_score(y_test, test_predictions_3)
print(custom_accuracy)


# For comparison, the confusion matrix printed for the first model
# (no stopword filtering):
# [[72 39]
#  [19 95]]


# ...and for the second model (built-in 'english' stopwords):
# [[75 36]
#  [19 95]]

[[77 34]
 [16 98]]
              precision    recall  f1-score   support

         neg       0.83      0.69      0.75       111
         pos       0.74      0.86      0.80       114

    accuracy                           0.78       225
   macro avg       0.79      0.78      0.78       225
weighted avg       0.78      0.78      0.78       225

0.7777777777777778


In [47]:
# Classify the same sample review with the custom-stopword pipeline.
barbie_prediction_3 = text_clf_3.predict([barbie_review])
print(barbie_prediction_3)

['neg']


**Question:** Did the predicted classification of the review change?

**Answer:** Yes.

**Question:** If so, why do you think it changed? 

**Answer:** Many of the words in a stopword list can influence how a movie review is classified, so changing which words are filtered out changes the features the model sees. Using a custom or domain-specific stopword list can help improve the algorithm.