In [1]:
import nltk
# Fetch the NLTK resources this notebook relies on; packages that are already
# installed are reported as up-to-date rather than downloaded again.
# Stop-word lists, filtered out in preprocess_text
nltk.download('stopwords') 
# WordNet data, presumably required by the WordNetLemmatizer used in preprocess_text
nltk.download('wordnet')
# Labelled movie reviews that form the dataset
nltk.download('movie_reviews')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAHUL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RAHUL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\RAHUL\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

# 1. Data preparation 🔡 ➡ 🔢

In [7]:
import pandas as pd
import numpy as np
from nltk.corpus import movie_reviews, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score

We will transform movie_reviews tagged corpus from nltk to a pandas dataframe with the script below:

In [3]:
# Build one (label, raw text) record per corpus file. Each fileid has the form
# '<tag>/<filename>'; only the tag (the sentiment label) is needed, so the
# filename part is no longer bound to an unused variable.
reviews = [
    (fileid.split('/')[0], movie_reviews.raw(fileid))
    for fileid in movie_reviews.fileids()
]
sample = pd.DataFrame(reviews, columns=['target', 'document'])
print(f'Dimensions: {sample.shape}')
sample.head()

Dimensions: (2000, 2)


Unnamed: 0,target,document
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


You will see that the dataframe has 2 columns: a column for the targets (the sentiment polarity) and a column for the reviews (i.e. documents), covering 2000 reviews. Each review is tagged as either positive or negative. Let's check the counts of the target classes:

In [5]:
# Number of reviews per sentiment label, to check the class balance
sample['target'].value_counts()


neg    1000
pos    1000
Name: target, dtype: int64

In [8]:
# Encode the sentiment label as an integer: 1 for 'pos', 0 for anything else.
# The guard keeps the cell idempotent: once the column is numeric, comparing it
# with 'pos' would match nothing and a re-run would overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(sample['target']):
    sample['target'] = np.where(sample['target'] == 'pos', 1, 0)
sample['target'].value_counts()

0    1000
1    1000
Name: target, dtype: int64

In [9]:
# Preview the first rows to confirm the labels are now integer-encoded
sample.head()

Unnamed: 0,target,document
0,0,"plot : two teen couples go to a church party ,..."
1,0,the happy bastard's quick movie review \ndamn ...
2,0,it is movies like these that make a jaded movi...
3,0,""" quest for camelot "" is warner bros . ' firs..."
4,0,synopsis : a mentally unstable man undergoing ...


When it comes to partitioning data, we have 2 options:

1. Split the sample data into 3 groups: train, validation and test, where train is used to fit the model, validation is used to evaluate fitness of interim models, and test is used to assess final model fitness.

2. Split the sample data into 2 groups: train and test, where train is further split into train and validation set k times using k-fold cross validation, and test is used to assess final model fitness. With k-fold cross validation:

First: Train is split into k pieces.

Second: Take one piece for validation set to evaluate fitness of interim models after fitting the model to the remaining k-1 pieces.

Third: Repeat the second step k-1 times using a different piece for the validation set each time and the remaining for the train set such that each piece of train is used as validation set only once.

In [10]:
# Hold out 30% of the reviews as the test set; the fixed seed makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    sample['document'],
    sample['target'],
    test_size=0.3,
    random_state=123,
)
print(f'Train dimensions: {X_train.shape, y_train.shape}')
print(f'Test dimensions: {X_test.shape, y_test.shape}')
# Class balance of each partition, train first and then test
for labels in (y_train, y_test):
    print(labels.value_counts())

Train dimensions: ((1400,), (1400,))
Test dimensions: ((600,), (600,))
1    700
0    700
Name: target, dtype: int64
1    300
0    300
Name: target, dtype: int64


### Preprocess documents

Tokenise

Normalise

Remove stop words

Count vectorise

Transform to tf-idf representation

In [11]:
def preprocess_text(text):
    """Turn a raw document into a list of lower-cased, lemmatised keywords.

    The text is split into runs of word characters (which drops punctuation),
    every token is lower-cased and lemmatised as a verb, and English stop
    words are removed.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining lemmas in their original order; duplicates are kept.
    """
    # Tokenise words while ignoring punctuation
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Lowercase and lemmatise
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]

    # Load the stop-word list once per call and keep it in a set. The previous
    # version called stopwords.words('english') for every single lemma and
    # searched the resulting list linearly each time.
    stop_words = set(stopwords.words('english'))

    # Remove stop words
    return [lemma for lemma in lemmas if lemma not in stop_words]

In [12]:
# Create an instance of TfidfVectorizer
vectoriser = TfidfVectorizer(analyzer=preprocess_text)
# Fit to the data and transform to feature matrix
X_train_tfidf = vectoriser.fit_transform(X_train)
X_train_tfidf.shape

(1400, 27676)

## Modelling

In [13]:
# Baseline: SGD-trained linear classifier with default hyper-parameters,
# scored with 5-fold cross-validation on the training features.
sgd_clf = SGDClassifier(random_state=123)
sgf_clf_scores = cross_val_score(
    estimator=sgd_clf,
    X=X_train_tfidf,
    y=y_train,
    cv=5,
)
print(sgf_clf_scores)
# Summarise the folds as mean and two standard deviations
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(sgf_clf_scores), 2 * np.std(sgf_clf_scores)))

[0.82857143 0.85       0.84285714 0.81785714 0.81428571]
Accuracy: 0.83 (+/- 0.03)


In [17]:
# Out-of-fold predictions: every training document is predicted by a model
# fitted on the other folds
sgf_clf_pred = cross_val_predict(sgd_clf, X_train_tfidf, y_train, cv=5)
# Confusion matrix of true labels (first argument) against the out-of-fold predictions
print(confusion_matrix(y_train, sgf_clf_pred))

[[580 120]
 [117 583]]


In [14]:
# Same cross-validation as the baseline cell, with the accuracy metric spelled
# out explicitly; the scores displayed match the ones printed there
cross_val_score(sgd_clf, X_train_tfidf, y_train, cv=5, scoring='accuracy')

array([0.82857143, 0.85      , 0.84285714, 0.81785714, 0.81428571])

### Attempt to improve performance

In [19]:
# Hyper-parameter grid, searched exhaustively with 5-fold cross-validation
# (2 x 2 x 3 x 3 = 36 candidate settings).
grid = {'fit_intercept': [True, False],
        'early_stopping': [True, False],
        'loss': ['hinge', 'log_loss', 'squared_hinge'],
        # "No regularisation" is requested with the None object. Recent
        # scikit-learn releases reject the string 'none', which would make
        # every such candidate fail to fit instead of being evaluated.
        'penalty': ['l2', 'l1', None]}
search = GridSearchCV(estimator=sgd_clf, param_grid=grid, cv=5)
search.fit(X_train_tfidf, y_train)
# Best-scoring combination found by the search
search.best_params_

{'early_stopping': False,
 'fit_intercept': False,
 'loss': 'log_loss',
 'penalty': 'l1'}

In [20]:
# Re-score the tuned estimator with the same 5-fold cross-validation used for
# the baseline so the two results can be compared side by side.
grid_sgd_clf_scores = cross_val_score(
    estimator=search.best_estimator_,
    X=X_train_tfidf,
    y=y_train,
    cv=5,
)
print(grid_sgd_clf_scores)
# Summarise the folds as mean and two standard deviations
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(grid_sgd_clf_scores), 2 * np.std(grid_sgd_clf_scores)))

[0.85       0.85714286 0.83571429 0.84285714 0.82857143]
Accuracy: 0.84 (+/- 0.02)


### Final model

Now that we have finalised the model, let's put the data transformation step as well as the model in a pipeline:

In [21]:
# Chain the tf-idf vectoriser and the tuned classifier so raw documents can be
# passed straight to fit/predict
pipe = Pipeline([('vectoriser', vectoriser),
                 ('classifier', search.best_estimator_)])
# Fit both steps on the raw training documents.
# NOTE(review): the steps are the same objects created in earlier cells, so
# this presumably refits `vectoriser` and `search.best_estimator_` in place - confirm.
pipe.fit(X_train, y_train)

In the code shown above, the pipeline first transforms the unstructured data to a feature matrix, then fits the model to the preprocessed data. This is an elegant way of putting together the essential steps in a single pipeline.

Let's assess the predictive power of the model on the test set. Here, we will pass the test data to the pipeline, which will first preprocess the data then make predictions using the previously fitted model:

In [22]:
# Predict the held-out documents; the pipeline vectorises them before classifying
y_test_pred = pipe.predict(X_test)
# Overall accuracy on the test set, then the per-class breakdown
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy: %0.2f" % test_accuracy)
test_confusion = confusion_matrix(y_test, y_test_pred)
print(test_confusion)

Accuracy: 0.85
[[249  51]
 [ 37 263]]
