In this notebook, we train a text classifier to detect the sentiment of movie reviews.

## Load dataset

In [None]:
# Import the pandas package, then use the "read_csv" function to get the labeled training data
import pandas as pd       
# Labeled training data: tab-separated, first row is the header, and quoting=3
# tells the parser not to treat quote characters specially.
training = pd.read_csv(
    "../input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip",
    header=0,
    delimiter="\t",
    quoting=3,
)
print(training.shape)

# Separate the target column from the remaining columns
y_train = training['sentiment']
x_train = training.drop(columns=["sentiment"])

training.head()


## Data exploration

We check if the training dataset is balanced.

In [None]:
import seaborn as sns
# Pass the labels by keyword: recent seaborn releases interpret the first positional
# argument as `data`, so the series must be given explicitly as the x variable.
sns.countplot(x=y_train)

Let's compare the average number of words in positive and negative reviews.

In [None]:
import matplotlib.pyplot as plt 
import numpy as np
# Number of whitespace-separated words in each review, then the mean per sentiment class
words_per_review = training.review.str.split().str.len()
avg_pos_reviews = words_per_review[training.sentiment == 1].mean()
avg_neg_reviews = words_per_review[training.sentiment == 0].mean()

# Horizontal bar chart comparing both averages
fig, ax = plt.subplots(figsize=(10, 3))
ax.barh(['Positive', 'Negative'], [avg_pos_reviews, avg_neg_reviews], height=0.5)
ax.set_xticks(np.arange(0, 300, 25))
ax.set_xlabel('Average Number of words')
ax.set_ylabel('Sentiment')
plt.show()

## Data processing

In [None]:
import nltk
import numpy as np
import matplotlib.pyplot as plt

#### Stopwords

We load the NLTK stopword list and take out of it the words that we want to keep in the text. In particular, we keep negation words, because a review that contains many of them is likely to express a negative sentiment.

In [None]:
# Import list of stopwords from library NLTK
from nltk.corpus import stopwords

stopwords_list = set(stopwords.words("english"))
print(f'List of stopwords:\n{stopwords_list}\n')

# Negation words that must stay in the reviews, so they are taken out of the stopword set
no_stopwords = ["not","don't",'aren','don','ain',"aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
               'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
               "won't", 'wouldn', "wouldn't"]

# difference_update ignores words that are absent from the set, whereas set.remove
# raises KeyError as soon as one listed word is missing from the installed stopword list.
stopwords_list.difference_update(no_stopwords)

print(f'Final list of stopwords:\n{stopwords_list}')

#### Lemmatize reviews

In [None]:
# Import Lemmatizer from NLTK
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def lemma_stem_text(words_list):
    """Lemmatize every token of ``words_list`` and return the results as a list.

    Each token is lower-cased and lemmatized twice: first with the lemmatizer's
    default part of speech (e.g. messages -> message, drugs -> drug) and then
    as a verb (e.g. going -> go, started -> start, watching -> watch).
    """
    lemmas = []
    for token in words_list:
        first_pass = lemmatizer.lemmatize(token.lower())
        lemmas.append(lemmatizer.lemmatize(first_pass.lower(), "v"))
    return lemmas


word_example = "feet"
print(f'The word "{word_example}" is transformed to "{lemma_stem_text([word_example])[0]}"')

#### Negations

We create a function that uses a regular expression to expand abbreviated negations (e.g. "aren't") into their standard form ("are not").

In [None]:
import re
# Matches the contraction "n't" at the end of a word. The word boundary lets it match
# whatever follows (space, punctuation or end of text); the previous pattern required a
# trailing space and therefore missed cases such as "didn't." or a final "wasn't".
# Matching is case-insensitive so that "DON'T" is expanded as well.
re_negation = re.compile(r"n't\b", re.IGNORECASE)


def negation_abbreviated_to_standard(sent):
    """Expand abbreviated negations in ``sent`` to their standard form.

    Every "n't" that ends a word is replaced by " not", so "aren't" becomes
    "are not". The rest of the text, including the character that follows the
    contraction, is left untouched.

    Args:
        sent: Text to transform.

    Returns:
        The text with the contracted negations expanded.
    """
    return re_negation.sub(" not", sent)


word_example = "I aren't "
print(f'The sentence "{word_example}" is transformed to "{negation_abbreviated_to_standard(word_example)}"')

We create a function to clean the text of a review using the functions defined previously.

In [None]:
# Import function BeautifulSoup to clean text of HTML tags
from bs4 import BeautifulSoup 

def review_to_words(raw_review):
    """Clean the raw text of a review and return it as one normalized string.

    The review goes through these steps: HTML tags are stripped, abbreviated
    negations are expanded, every character that is not a letter, a digit or an
    underscore is replaced by a space, the text is lower-cased and split into
    words, stopwords are dropped and the remaining words are lemmatized.

    Args:
        raw_review: Text of the review, possibly containing HTML markup.

    Returns:
        The cleaned words joined by single spaces. The result is an empty string
        when no word survives the cleaning.
    """
    # 1. Remove HTML tags. The parser is named explicitly so that the result does not
    #    depend on which optional parsers are installed (and no warning is emitted).
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Transform abbreviated negations to the standard form.
    review_text = negation_abbreviated_to_standard(review_text)

    # 3. Replace everything except letters, digits and underscores by a space
    letters_numbers_only = re.sub("[^a-zA-Z_0-9]", " ", review_text)

    # 4. Convert to lower case and split into individual words (tokenization).
    #    Plain str methods also work when there are no words at all, a case in which
    #    np.char.lower fails because an empty list is not converted to a string array.
    words = letters_numbers_only.lower().split()

    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stopwords_list]

    # 6. Apply lemmatization function
    lemma_words = lemma_stem_text(meaningful_words)

    # 7. Join the words back into one string separated by space, and return the result.
    return " ".join(lemma_words)

Let's see how the first review in the training set is cleaned.

In [None]:
# Take the review labelled 0 and run it through the cleaning function
first_review = x_train["review"].loc[0]
clean_review = review_to_words(first_review)

# Show the raw text, its sentiment label and the cleaned text
print(f'Text of original review:\n{first_review}\n')
print(f'Sentiment review: {y_train.loc[0]}\n')
print(f'Text of cleaned review:\n{clean_review}')

We clean the text of all reviews in the training set.

In [None]:
# Text of the reviews in the training set
reviews = x_train['review']

# Clean every review, keeping the original order
cleaned_train_reviews = [review_to_words(raw_text) for raw_text in reviews]

## Vectorization

In [None]:
# Import tf-idf encoding from sklearn library
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF encoder hyperparameters: at most 20000 features, built from 1- and 2-word n-grams
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)

# Fit the encoder on the cleaned training reviews and encode them in one step
train_data_features = vectorizer.fit_transform(cleaned_train_reviews)

print(train_data_features.shape)

## Model

In [None]:
# Import the logistic regression model from sklearn 
from sklearn.linear_model import LogisticRegression

# Define the model. `multi_class` is left at its default: the parameter is deprecated in
# recent scikit-learn releases, and the target here is binary, so a multinomial
# formulation is not needed.
model = LogisticRegression(random_state=0, solver='lbfgs')
# Train model
model.fit(train_data_features, y_train)


## Predictions in Test dataset

We predict the sentiment of the reviews in the test dataset.

In [None]:
# Load the test reviews (tab-separated, header in the first row, quote characters kept as-is)
test = pd.read_csv(
    "../input/word2vec-nlp-tutorial/testData.tsv.zip",
    header=0,
    delimiter="\t",
    quoting=3,
)
print(test.shape)

# Clean the text of all reviews in the test set, one by one and in order
num_reviews = len(test["review"])
print("Cleaning and parsing the test set movie reviews...\n")
clean_test_reviews = [review_to_words(test["review"][idx]) for idx in range(num_reviews)]

# Encode the cleaned test reviews with the vectorizer fitted on the training reviews
test_data_features = vectorizer.transform(clean_test_reviews)

# Predict a sentiment label for every test review with the logistic regression model
result = model.predict(test_data_features)

# Collect the predictions in a dataframe with an "id" column and a "sentiment" column
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})

Finally, we save the dataframe of predictions as the submission file.

In [None]:
# Write the predictions to disk without the index column; quoting=3 means that
# pandas does not add quote characters around the written values.
output.to_csv(path_or_buf="submission.csv", index=False, quoting=3)

## Appendix

We split the data into training and validation datasets. We then train both a logistic regression and a random forest classifier and evaluate them on the validation dataset.

In [None]:
# We split train dataset
from sklearn.model_selection import train_test_split
# Split the encoded reviews and their labels into a training part (80%) and a held-out
# part (20%). The labels are read from `training` instead of `y_train` because this
# statement rebinds `y_train`: splitting from `y_train` itself makes the cell fail with
# a length mismatch as soon as it is run a second time.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_features, training["sentiment"], test_size=0.2, random_state=42
)

In [None]:
# We train two models: random forest and logistic regression
from sklearn.ensemble import RandomForestClassifier
# Random forest made of 500 trees, with a fixed seed for reproducibility
forest = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=0,
)
# Fit on the training split (fit returns the estimator itself)
forest.fit(X_train, y_train)
# Report the model's score on the held-out split
print(forest.score(X_test, y_test))

In [None]:
# Initialize a logistic regression model. `multi_class` is left at its default: the
# parameter is deprecated in recent scikit-learn releases, and the target here is
# binary, so a multinomial formulation is not needed.
logistic = LogisticRegression(random_state=0, solver='lbfgs')
# Train the model (fit returns the estimator itself)
logistic = logistic.fit(X_train, y_train)
# Print score of model (using the held-out split)
print(logistic.score(X_test, y_test))

In [None]:
# Predicted labels of each fitted classifier on the held-out split
y_pred_forest, y_pred_logistic = (clf.predict(X_test) for clf in (forest, logistic))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the random forest; rows and columns follow the order given in `labels`
confusion_matrix_forest = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred_forest,
    labels=[1, 0],
)
confusion_matrix_forest

In [None]:
import seaborn as sns
# Plot the confusion matrix. The matrix was computed with labels=[1, 0], so the ticks
# are labelled explicitly: the heatmap's default ticks (0, 1) are plain positions and
# would suggest the opposite class order. Rows are true labels, columns are predictions.
ax = plt.axes()
sns.heatmap(confusion_matrix_forest, annot=True, fmt="d",
            xticklabels=[1, 0], yticklabels=[1, 0], ax=ax)
ax.set_xlabel('Predicted sentiment')
ax.set_ylabel('True sentiment')
ax.set_title('Confusion matrix Random Forest')

In [None]:
# Confusion matrix of the logistic regression; rows and columns follow the order given in `labels`
confusion_matrix_logistic = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred_logistic,
    labels=[1, 0],
)
confusion_matrix_logistic


In [None]:
# Plot the confusion matrix. The matrix was computed with labels=[1, 0], so the ticks
# are labelled explicitly: the heatmap's default ticks (0, 1) are plain positions and
# would suggest the opposite class order. Rows are true labels, columns are predictions.
ax = plt.axes()
sns.heatmap(confusion_matrix_logistic, annot=True, fmt="d",
            xticklabels=[1, 0], yticklabels=[1, 0], ax=ax)
ax.set_xlabel('Predicted sentiment')
ax.set_ylabel('True sentiment')
ax.set_title('Confusion matrix Logistic Regression')
