## Importing libraries

* dataset source: 

In [1]:
import pandas as pd
import numpy as np
import os
import re
import shutil
import contractions
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime
from joblib import Parallel, delayed

In [2]:
root_path = "E:/projects/ignou project/amazon review/"

In [3]:
data = pd.read_csv(root_path+'train.csv', names=['polarity', 'title', 'text'])
data

Unnamed: 0,polarity,title,text
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
...,...,...,...
3599995,1,Don't do it!!,The high chair looks great when it first comes...
3599996,1,"Looks nice, low functionality",I have used this highchair for 2 kids now and ...
3599997,1,"compact, but hard to clean","We have a small house, and really wanted two o..."
3599998,1,what is it saying?,not sure what this book is supposed to be. It ...


In [4]:
data['polarity'].unique()

array([2, 1], dtype=int64)

There are two types of polarity 'positive' and 'negative' represented by '2' and '1' respectively.
There are 3.6 Million datapoints with review title and text.

In [5]:
data['polarity'].value_counts()

1    1800000
2    1800000
Name: polarity, dtype: int64

* There are 1.8M positive and 1.8M negative reviews in our dataset

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   polarity  int64 
 1   title     object
 2   text      object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB


In [7]:
def change_polarity(polarity):
    if polarity == 1:
        return 0
    return 1

data['polarity'] = data['polarity'].apply(change_polarity)
data

Unnamed: 0,polarity,title,text
0,1,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,1,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,1,Amazing!,This soundtrack is my favorite music of all ti...
3,1,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,1,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
...,...,...,...
3599995,0,Don't do it!!,The high chair looks great when it first comes...
3599996,0,"Looks nice, low functionality",I have used this highchair for 2 kids now and ...
3599997,0,"compact, but hard to clean","We have a small house, and really wanted two o..."
3599998,0,what is it saying?,not sure what this book is supposed to be. It ...


* Now, the polarity 1, means positive review and polarity 0 means, negative review

## Data Preprocessing

In, this section i'm going to preprocess title and text data

In [8]:
import nltk
from nltk.corpus import stopwords
stopword = stopwords.words('english')

In [9]:

punctuation = """
!"#$%&'()*+, -/:;<=>?@[\]^_`{|}~—“”
"""

def remove_text_with_brackets(row):
    row=re.sub("\(.*?\)","",row)
    row=re.sub("\{.*?\}","",row)
    row=re.sub("\[.*?\]","",row)
    row=re.sub("\<.*?\>","",row)
    return row

def remove_stopwords(row):
    doc = ' '.join([word for word in row.split() if word not in stopword])
    return doc

def remove_punctuations(row):
    for punc in punctuation:
        row = row.replace(punc, ' ')
    return row

def remove_digits(row):
    doc = ' '.join([word for word in row.split() if word.isalpha()])
    return doc

def remove_urls(row):
    doc = []
    for word in row.split():
        if not word.startswith('http') and not word.startswith('www'):
            if not len(word)<=2 and not len(word)>12:
                doc.append(word)
    return ' '.join(doc)


def preprocess(row):
    row = str(row)
    row = remove_digits(row)
    row = row.lower()
    decontracted_word_sentences = ' '.join([contractions.fix(word) for word in row.split(' ')])
    row = decontracted_word_sentences.strip()
    row = remove_text_with_brackets(row)
    # row = remove_stopwords(row)
    row = remove_urls(row)
    row = remove_punctuations(row)
    doc = ' '.join([word for word in row.split()])
    
    return doc


In [10]:
def preprocessing(row):
    polarity = row[0]
    title = preprocess(row[1])
    text = preprocess(row[2])
    return polarity, title, text

In [11]:
def get_preprocesse_data(data):
    start_time = datetime.now()
    print(f'Preprocessing...')
    preprocessed_data = Parallel(n_jobs=-1)(delayed(preprocessing)(row) for row in data.values)

    polarity, text = [], []
    for datapoint in preprocessed_data:
        polarity.append(datapoint[0])
        text.append(datapoint[1] + ' ' + datapoint[2])

    df = pd.DataFrame()
    df['text'] = text
    df['polarity'] = polarity
    
    del preprocessed_data
    del polarity
    del text
    del data
    print(f'Time taken in preprocessing: {datetime.now() - start_time}')
    return df.sample(frac=1)

In [12]:
data = get_preprocesse_data(data)

Preprocessing...
Time taken in preprocessing: 0:02:22.801402


In [13]:
# text_length = data['text'].str.split(' ').apply(len)

In [14]:
# print(f'%tile\tNum of words')
# for i in range(0, 101, 10):
#     print(f'{i}\t{np.percentile(text_length, i)}')
    
# print(f'\n%tile\tNum of words')
# for i in range(90, 101, 1):
#     print(f'{i}\t{np.percentile(text_length, i)}')

In [15]:
data

Unnamed: 0,text,polarity
115192,what ross suggests you techniques are grounded...,1
3233207,organ awareness all the stops pretty concise o...,1
75350,for the imprisoned this item was given gift wh...,1
2671521,put this great this terrific saw have vhs put ...,1
1781563,the memory fonder somehow thought this film wa...,0
...,...,...
820793,had this tea for the first time today and abs...,1
1639747,funny book very funny never thought that gramm...,1
3220533,slow predicted and repetitive movie too long a...,0
3183042,more trouble than worth too much litter and ru...,0


In [16]:
# data['text_length'] = text_length
# data = data[data['text_length'] < 50]
# data.shape

In [17]:
# positive_data = data[data['polarity'] == 1].iloc[:500000,:]
# negative_data = data[data['polarity'] == 0].iloc[:500000,:]

In [18]:
# positive_data.shape, negative_data.shape

In [19]:
# final_data = pd.concat([positive_data, negative_data], axis=0).sample(frac=1)
# final_data.to_csv('final_data.csv', index=False)
data.to_csv('final_data.csv', index=False)

In [20]:
data.shape, data['polarity'].value_counts()

((3600000, 2),
 0    1800000
 1    1800000
 Name: polarity, dtype: int64)