In [11]:
import pandas as pd
import re
from textblob import TextBlob
from wordsegment import load, segment
load()

# Read the raw IMDB reviews into a dataframe.
df = pd.read_csv('IMDB Dataset.csv')

# Optional: uncomment the two lines below to work on a small random subset while testing.
# N_sample = 1000  # Testing sample size.
# df = df.sample(N_sample)

# Encode the labels numerically: 'positive' becomes 1, every other value becomes 0.
df['sentiment'] = df['sentiment'].eq('positive').astype(int)

# Keep the reviews as a plain list of strings alongside the dataframe.
text_list = df['review'].tolist()

# Number of reviews in the dataset.
N = len(text_list)

# Remove the tags and backslashes.
def remove_tags_backslashes(df, text_list, verbose=True):
    """Strip HTML-like tags and backslash escapes from every review.

    Each run of tags (for example ``<br /><br />``), together with any
    whitespace around it, is replaced by a single space. Simply deleting the
    tags would glue the neighbouring words together ("end.<br />Next" ->
    "end.Next"). A backslash is removed together with the character that
    follows it.

    Args:
        df: Dataframe whose 'review' column is overwritten with the cleaned
            texts (modified in place and also returned).
        text_list: List of review strings, one per row of ``df``.
        verbose: If true, print a confirmation message when done.

    Returns:
        Tuple ``(df, text_list)`` holding the dataframe and the cleaned list.
    """
    # One or more consecutive tags plus the whitespace surrounding them.
    tag_run = re.compile(r'\s*(?:<.*?>\s*)+')
    # A backslash followed by any single character (except a newline).
    escape = re.compile(r'\\.')

    cleaned = []
    for text in text_list:
        text = tag_run.sub(' ', text)  # Keep a word boundary where the tags were.
        text = escape.sub('', text)
        cleaned.append(text)

    if verbose:
        print('Tags and backslashes removal: success.')
    df['review'] = cleaned
    return (df, cleaned)

# Convert shortcuts like "who'll" to full sentences like "who will".
def expand_contractions(df, text_list, verbose=True):
    """Expand contractions such as "who'll" into "who will".

    All contractions are replaced in a single regex pass in which longer
    contractions are tried before shorter ones. This guarantees that the
    specific entries ("won't" -> "will not", "can't" -> "cannot") take
    precedence over the generic suffix rule ("n't" -> " not"), which would
    otherwise turn them into "wo not" and "ca not".

    Matching is case-sensitive and not restricted to word boundaries.

    Args:
        df: Dataframe whose 'review' column is overwritten with the expanded
            texts (modified in place and also returned).
        text_list: List of review strings, one per row of ``df``.
        verbose: If true, print a confirmation message when done.

    Returns:
        Tuple ``(df, text_list)`` holding the dataframe and the expanded list.
    """
    # Dictionary of contraction substitutions.
    contractions_dict = {
        "n't": " not", "'ll": " will", "'ve": " have", "'re": " are", "'d": " would",
        "'s": " is", "'m": " am", "'cause": " because", "y'all": "you all",
        "o'clock": "of the clock", "won't": "will not", "can't": "cannot",
        "gonna": "going to", "wanna": "want to", "gotta": "got to",
    }

    # Longest alternatives first, so whole-word contractions win over suffixes.
    ordered = sorted(contractions_dict, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(contraction) for contraction in ordered))

    def expand(text):
        """Return ``text`` with every known contraction expanded."""
        return pattern.sub(lambda match: contractions_dict[match.group(0)], text)

    text_list = [expand(text) for text in text_list]
    if verbose:
        print('Shortcuts expansion: success.')
    df['review'] = text_list
    return (df, text_list)

# Process the extraneous symbols.
def process_extra_symbols(df, text_list, verbose=True):
    """Drop non-alphanumeric symbols and record how frequent they were.

    For every review, all characters other than ASCII letters, digits and
    whitespace are removed, whitespace runs are collapsed to single spaces,
    and the text is stripped. The number of removed symbols divided by
    (1 + number of spaces in the cleaned text) is stored in the
    'f Punctuation' column.

    Args:
        df: Dataframe receiving the 'f Punctuation' and 'review' columns.
        text_list: List of review strings, one per row of ``df``.
        verbose: If true, print a confirmation message.

    Returns:
        Tuple ``(df, text_list)`` holding the dataframe and the cleaned list.
    """
    symbol_re = re.compile(r'[^a-zA-Z0-9\s]')
    whitespace_re = re.compile(r'\s+')

    cleaned_texts = []
    symbol_fractions = []
    for raw in text_list:
        # subn removes the symbols and reports how many were removed.
        without_symbols, n_symbols = symbol_re.subn('', raw)
        tidy = whitespace_re.sub(' ', without_symbols).strip()
        cleaned_texts.append(tidy)
        symbol_fractions.append(n_symbols / (1 + tidy.count(' ')))

    df['f Punctuation'] = symbol_fractions
    if verbose:
        print('Extra symbol removal: success.')
    df['review'] = cleaned_texts
    return (df, cleaned_texts)

# Process the capitalized letters.
def process_caps(df, text_list, verbose=True):
    """Lower-case every review and record how much of it was upper-case.

    For each review the number of whitespace-separated words for which
    ``str.isupper()`` is true is divided by the number of non-space
    characters in the review; the result is stored in the 'f Capitalized'
    column. A review without any non-space characters gets a fraction of
    0.0 instead of raising ``ZeroDivisionError``.

    Args:
        df: Dataframe receiving the 'f Capitalized' and 'review' columns.
        text_list: List of review strings, one per row of ``df``.
        verbose: If true, print a confirmation message.

    Returns:
        Tuple ``(df, text_list)`` holding the dataframe and the lower-cased list.
    """
    # Count the all-caps words before the case information is destroyed.
    upper_counts = [sum(1 for word in text.split() if word.isupper()) for text in text_list]
    text_list = [text.lower() for text in text_list]

    f_caps = []
    for count, text in zip(upper_counts, text_list):
        n_chars = len(text) - text.count(' ')  # Characters other than spaces.
        # Empty (or space-only) reviews have nothing capitalized.
        f_caps.append(count / n_chars if n_chars else 0.0)

    df['f Capitalized'] = f_caps
    if verbose:
        print('Capitalized letter removal: success.')
    df['review'] = text_list
    return (df, text_list)

# Separate the merged words.
def separate_merged_words(df, text_list, N, verbose=True, N_updates=50):
    """Split accidentally merged words in every review using ``segment``.

    Every whitespace-separated token of a review is passed to ``segment``
    and replaced by the pieces it returns. The number of extra pieces
    produced (i.e. the number of splits performed) is recorded per review as
    a non-negative error count.

    Args:
        df: Dataframe whose 'review' column is overwritten with the result.
        text_list: List of review strings; updated in place.
        N: Number of reviews to process (the first ``N`` entries).
        verbose: If true, print start, progress and completion messages.
        N_updates: Approximate number of progress messages to print; a value
            of 0 or less disables them.

    Returns:
        Tuple ``(df, text_list, f_errors)`` where ``f_errors[i]`` is the
        number of splits made in review ``i``.
    """
    if verbose:
        print("Initiating merged word separation...")
    f_errors = [0] * N
    # Progress period; at least 1 so that N < N_updates cannot cause a modulo by zero.
    T_updates = max(1, N // N_updates) if N_updates > 0 else None

    def fix_merged_words(text):
        """Return the re-segmented text and the number of splits made."""
        new_words = []
        n_splits = 0
        for word in text.split():
            pieces = segment(word)
            new_words.extend(pieces)
            # A token split into k pieces accounts for k - 1 missing spaces.
            n_splits += max(len(pieces) - 1, 0)
        return " ".join(new_words), n_splits

    for i in range(N):
        text_list[i], f_errors[i] = fix_merged_words(text_list[i])
        if verbose and T_updates is not None and (i + 1) % T_updates == 0:
            print(str(i + 1) + ' out of ' + str(N) + ' sentences processed.')
    if verbose:
        print('Merged word separation: success.')
    df['review'] = text_list
    return (df, text_list, f_errors)

# Correct spelling errors.
def correct_errors(df, text_list, N, f_errors=None, verbose=True, N_updates=50):
    """Spell-correct every review with TextBlob and record an error rate.

    For each review, the words that do not appear among the words of the
    corrected text are counted as misspelled. That count is added to the
    optional prior error count and divided by (5 + number of words) to give
    a regularized error fraction, stored in the 'f Errors' column.

    Args:
        df: Dataframe receiving the 'f Errors' and 'review' columns.
        text_list: List of review strings; updated in place with the
            lower-cased corrected texts.
        N: Number of reviews to process (the first ``N`` entries).
        f_errors: Optional list of prior per-review error counts (such as
            the one returned by ``separate_merged_words``). It is copied,
            not modified.
        verbose: If true, print start, progress and completion messages.
        N_updates: Approximate number of progress messages to print; a value
            of 0 or less disables them.

    Returns:
        Tuple ``(df, text_list)`` holding the dataframe and the corrected list.
    """
    if verbose:
        print("Initiating the error correction process...")
    # Work on a private copy so the caller's list is never altered.
    f_errors = [0] * N if f_errors is None else list(f_errors)
    # Progress period; at least 1 so that N < N_updates cannot cause a modulo by zero.
    T_updates = max(1, N // N_updates) if N_updates > 0 else None

    for i in range(N):
        words = text_list[i].split()
        # Spell correction is expensive, so run it once and reuse the result.
        corrected = TextBlob(text_list[i]).correct()
        corrected_words = {str(word) for word in corrected.words}
        n_misspelled = sum(1 for word in words if word not in corrected_words)
        text_list[i] = str(corrected).lower()
        # Regularized fraction of errors (prior count + misspellings).
        f_errors[i] = (f_errors[i] + n_misspelled) / (5 + len(words))
        if verbose and T_updates is not None and (i + 1) % T_updates == 0:
            print(str(i + 1) + ' out of ' + str(N) + ' sentences processed.')

    df['f Errors'] = f_errors
    if verbose:
        print('Error correction: success.')
    df['review'] = text_list
    return (df, text_list)

# Perform blob sentiment analysis.
def blob_analyze_sentiment(df, text_list, N, verbose=True):
    """Attach TextBlob sentiment scores for the first ``N`` reviews to ``df``.

    Two columns are written:
      * 'Blob subjectivity score': 1 minus TextBlob's subjectivity.
      * 'Blob polarity score': TextBlob's polarity mapped via (p + 1) / 2.

    Args:
        df: Dataframe receiving the two score columns.
        text_list: List of review strings.
        N: Number of reviews to score.
        verbose: If true, print a confirmation message.

    Returns:
        The dataframe with the score columns added.
    """
    polarity_scores = []
    subjectivity_scores = []
    for idx in range(N):
        sentiment = TextBlob(text_list[idx]).sentiment
        polarity_scores.append((sentiment.polarity + 1) / 2)
        subjectivity_scores.append(1 - sentiment.subjectivity)

    # Subjectivity goes in first so the column order stays the same.
    df['Blob subjectivity score'] = subjectivity_scores
    df['Blob polarity score'] = polarity_scores
    if verbose:
        print ('Blob processing: success.')
    return df
        
# RUN THE SELECTED PRE-PROCESSING PROCEDURES. UNCOMMENT THE ONES TO BE RUN.
# Every step receives the dataframe and the list of review strings and returns
# the updated versions, so the steps are chained by reassigning df and text_list.
df, text_list = remove_tags_backslashes(df, text_list)
df, text_list = expand_contractions(df, text_list)
df, text_list = process_extra_symbols(df, text_list)
df, text_list = process_caps(df, text_list)
# Default for the case where the merged-word step below is disabled:
# correct_errors treats f_errors=None as "no prior error counts".
f_errors = None
df, text_list, f_errors = separate_merged_words(df, text_list, N)
# Spell correction is currently disabled; uncomment the next line to run it.
#df, text_list = correct_errors(df, text_list, N, f_errors)
df = blob_analyze_sentiment(df, text_list, N)

# Save the pre-processed dataset (reviews plus the derived feature columns).
df.to_csv('IMDB_dataset_preprocessed.csv')

Tags and backslashes removal: success.
Shortcuts expansion: success.
Extra symbol removal: success.
Capitalized letter removal: success.
Initiating merged word separation...
20 out of 1000 sentences processed.
40 out of 1000 sentences processed.
60 out of 1000 sentences processed.
80 out of 1000 sentences processed.
100 out of 1000 sentences processed.
120 out of 1000 sentences processed.
140 out of 1000 sentences processed.
160 out of 1000 sentences processed.
180 out of 1000 sentences processed.
200 out of 1000 sentences processed.
220 out of 1000 sentences processed.
240 out of 1000 sentences processed.
260 out of 1000 sentences processed.
280 out of 1000 sentences processed.
300 out of 1000 sentences processed.
320 out of 1000 sentences processed.
340 out of 1000 sentences processed.
360 out of 1000 sentences processed.
380 out of 1000 sentences processed.
400 out of 1000 sentences processed.
420 out of 1000 sentences processed.
440 out of 1000 sentences processed.
460 out of 1000 