# Text-Summarization-for-Movie-Review

## Libraries

In [148]:
import numpy as np
import pandas as pd 
from bs4 import BeautifulSoup
import nltk
import re
import string
import spacy
from contractions import CONTRACTION_MAP
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import word_tokenize 



## Load CSV

In [149]:
data = pd.read_csv("../Data/IMDB_Dataset.csv")

In [150]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [151]:
data.describe(include='all')

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [152]:
data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# NLP Implementation

In [153]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) if contraction_mapping.get(match) else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
    
    try:
        expanded_text = contractions_pattern.sub(expand_match, text)
        expanded_text = re.sub("'", "", expanded_text)
    except:
        return text
    return expanded_text

In [154]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [155]:
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [156]:
def show_lemmas(text):
    print("\n\nWord         POS     Lemma")
    print("--------------------------")
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma_}')

In [157]:
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

In [158]:
pre_content = []

for text in data['review'][:10]:
    #Convert everything to lowercase
    pre_text = text.lower()

    #Remove HTML tags
    pre_text = strip_html_tags(pre_text)

    #Contraction mapping
    pre_text = expand_contractions(pre_text)

    # Remove (‘s)
    pre_text = re.sub('"','', pre_text)

    #Remove any text inside the parenthesis ( )
    pre_text = re.sub(r'\([^)]*\)', '', pre_text)

    #Eliminate punctuations and special characters
    pre_text = remove_special_characters(pre_text)

    #Remove stopwords
    pre_text = remove_stopwords(pre_text)
    
    #Remove short words

    pre_content.append(pre_text)

for view in pre_content[:5]:
    print(view)
    print(":::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::")

one reviewers mentioned watching 1 oz episode hooked right exactly happened methe first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use wordit called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far awayi would say main appeal show due fact goes shows would dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz mess around first episode ever saw struck nasty surreal could say ready watched developed taste oz got accustomed high levels graphic violence violence injustice watching oz may become comfortable uncomfortable viewingthats get touch darker side
::::::::::::::::::::::

In [162]:
for text in pre_content:
    getToken = word_tokenize(text)
    print(getToken)
    print('----------------------------')