**Pre-Processing**

In [1]:
import pandas as pd
import numpy as np
import re
import emoji
from spellchecker import SpellChecker

import string
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import OneHotEncoder
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [16]:
train = pd.read_csv('Train.csv')
train.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [18]:
train['text'][4]

'Im a die hard Dads Army fan and nothing will ever change that. I got all the tapes, DVD\'s and audiobooks and every time i watch/listen to them its brand new. <br /><br />The film. The film is a re run of certain episodes, Man and the hour, Enemy within the gates, Battle School and numerous others with a different edge. Introduction of a new General instead of Captain Square was a brilliant move - especially when he wouldn\'t cash the cheque (something that is rarely done now).<br /><br />It follows through the early years of getting equipment and uniforms, starting up and training. All in all, its a great film for a boring Sunday afternoon. <br /><br />Two draw backs. One is the Germans bogus dodgy accents (come one, Germans cant pronounced the letter "W" like us) and Two The casting of Liz Frazer instead of the familiar Janet Davis. I like Liz in other films like the carry ons but she doesn\'t carry it correctly in this and Janet Davis would have been the better choice.'

In [3]:
train.tail()

Unnamed: 0,text,label
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1
39999,"Why would this film be so good, but only gross...",1


In [4]:
train['label'].value_counts()

0    20019
1    19981
Name: label, dtype: int64

In [6]:
stopwords= nltk.corpus.stopwords.words('english')
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'URL',text)

def remove_HTML(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)


def remove_not_ASCII(text):
    text = ''.join([word for word in text if word in string.printable])
    return text

def remove_number(text):
    num = re.compile(r'[-+]?[.\d]*[\d]+[:,.\d]*')
    return num.sub(r'NUMBER', text)


def remove_stopwords(text):
    text_tokens = word_tokenize(text)
    tokens_without_sw = [word for word in text_tokens if not word in stopwords]
    text = ' '.join([str(elem) for elem in tokens_without_sw])
    return text

def lemma(text):
    tokens = word_tokenize(text)
    wnl = WordNetLemmatizer()
    text = ' '.join([wnl.lemmatize(words) for words in tokens])
    return text

def remove_punc(text):
    for ele in text:
        if ele in punc:
            text = re.sub(ele, "",text)
    return text
    
def cleanText(txt):
    txt = re.sub(r'[^\w\s]', '', txt)
    txt = re.sub(r'\n','',txt)
    # to remove emojis
    txt = re.sub(emoji.get_emoji_regexp(), r"", txt)
    txt = re.sub(r'https?:\/\/[A-Za-z0-9\.\/]+','',txt)
    txt = re.sub(r"https?://\S+|www\.\S+","",txt)
    txt = re.sub(r"<.*?>","",txt)
    
    txt = remove_URL(txt)
    txt = remove_HTML(txt)
    txt = remove_not_ASCII(txt)
    
    txt = txt.lower()
    
    txt = remove_number(txt)
    
    txt = remove_stopwords(txt)
    txt = lemma(txt)
    return txt  

In [7]:
train["clean_text"] = train["text"].apply(cleanText)

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        40000 non-null  object
 1   label       40000 non-null  int64 
 2   clean_text  40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


In [9]:
train.head()

Unnamed: 0,text,label,clean_text
0,I grew up (b. 1965) watching and loving the Th...,0,grew b NUMBER watching loving thunderbird mate...
1,"When I put this movie in my DVD player, and sa...",0,put movie dvd player sat coke chip expectation...
2,Why do people who do not know what a particula...,0,people know particular time past like feel nee...
3,Even though I have great interest in Biblical ...,0,even though great interest biblical movie bore...
4,Im a die hard Dads Army fan and nothing will e...,1,im die hard dad army fan nothing ever change g...


In [13]:
train['clean_text'][4]

'im die hard dad army fan nothing ever change got tape dvd audiobooks every time watchlisten brand new br br film film run certain episode man hour enemy within gate battle school numerous others different edge introduction new general instead captain square brilliant move especially wouldnt cash cheque something rarely done nowbr br follows early year getting equipment uniform starting training great film boring sunday afternoon br br two draw back one german bogus dodgy accent come one german cant pronounced letter w like u two casting liz frazer instead familiar janet davis like liz film like carry ons doesnt carry correctly janet davis would better choice'

In [14]:
train =  train.drop('text', axis=1)
train.head()

Unnamed: 0,label,clean_text
0,0,grew b NUMBER watching loving thunderbird mate...
1,0,put movie dvd player sat coke chip expectation...
2,0,people know particular time past like feel nee...
3,0,even though great interest biblical movie bore...
4,1,im die hard dad army fan nothing ever change g...


In [15]:
compression_opts = dict(method='zip',
                        archive_name='movie_reviews.csv')  
train.to_csv('movie_reviews.zip', index=False,
          compression=compression_opts) 