In [1]:
import os
import json
import re
import string
import pandas as pd

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# Shared NLP resources, built once at import time and reused by basic_processing.
# NOTE(review): presumably the NLTK stopwords / WordNet data must already be
# downloaded for these calls to succeed -- confirm in the environment setup.

# English stop-word list from the NLTK stopwords corpus.
stop_words = stopwords.words('english')
# Snowball stemmer configured for English.
stemmer_ss = SnowballStemmer("english") 
# WordNet-based lemmatizer.
lemmatizer = WordNetLemmatizer() 

In [2]:
def read_file(file):
    """Load a JSON Lines file into a DataFrame, one row per JSON document.

    Parameters
    ----------
    file : str or path-like
        Path to a text file holding one JSON document per line.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, built with ``pd.DataFrame`` from the list
        of parsed documents. An input without any records yields an empty
        frame.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle even if a line fails to parse.
    # JSON text is UTF-8 by specification, so do not rely on the locale default.
    with open(file, 'r', encoding='utf-8') as handle:
        # Whitespace-only lines carry no record; skip them instead of crashing.
        reviews = [json.loads(line) for line in handle if line.strip()]

    return pd.DataFrame(reviews)

def drop_duplicates(df, subset=['review']):
    """Return a frame without rows that repeat in the ``subset`` columns.

    Delegates to ``DataFrame.drop_duplicates`` with pandas' defaults for
    every other option. The input frame is not modified and the surviving
    rows keep their original index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate.
    subset : list of column labels, default ``['review']``
        Columns compared to decide whether two rows are duplicates. The
        default list is only read here, never mutated.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated frame.
    """
    deduplicated = df.drop_duplicates(subset=subset)
    return deduplicated

def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is dropped;
    all other characters are kept in their original order.
    """
    kept_chars = (symbol for symbol in text if symbol not in string.punctuation)
    return "".join(kept_chars)

def remove_numbers(text):
    """Return ``text`` with every run of digits deleted.

    Nothing is inserted in place of the digits, so whitespace that
    surrounded a number is left as-is.
    """
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def basic_processing(df):
    """Derive progressively cleaned versions of the ``review`` text column.

    The returned frame holds every input column, with ``review`` replaced by
    its lower-cased form, plus these columns appended in order:

    * ``review_remove_numbers`` -- digits removed from ``review``.
    * ``review_remove_numbers+symbols`` -- punctuation removed as well.
    * ``reviews_remove_numbers+symbols+stopwords`` -- tokenized, stop words
      dropped, tokens re-joined with single spaces.
    * ``reviews_remove_numbers+symbols+stopwords+lemmatization`` -- each
      remaining token passed through ``lemmatizer.lemmatize``.
    * ``reviews_remove_numbers+symbols+stopwords+stemming`` -- each
      remaining token passed through ``stemmer_ss.stem`` (applied to the
      stop-word-filtered tokens, not to the lemmatized ones).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``review`` column of strings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df`` and the columns above.

    Raises
    ------
    KeyError
        If ``df`` has no ``review`` column.
    AttributeError
        If a ``review`` value has no ``lower`` method (e.g. a missing value).
    """
    # Work on a copy so the caller's frame is never altered as a side effect.
    result = df.copy()
    # Set membership is O(1) per token; the module-level list is scanned linearly.
    stop_lookup = set(stop_words)

    # Column-wise Series.map instead of row-wise DataFrame.apply: each step
    # reads a single column, and map also behaves on a frame with zero rows.
    result['review'] = result['review'].map(lambda text: text.lower())
    result['review_remove_numbers'] = result['review'].map(remove_numbers)
    result['review_remove_numbers+symbols'] = (
        result['review_remove_numbers'].map(remove_punctuation)
    )

    # Token lists stay in local Series rather than temporary 'tokens' /
    # 'tokens1' columns, so same-named input columns are not clobbered.
    tokens = result['review_remove_numbers+symbols'].map(word_tokenize)
    tokens = tokens.map(
        lambda words: [word for word in words if word not in stop_lookup]
    )
    result['reviews_remove_numbers+symbols+stopwords'] = tokens.map(
        lambda words: " ".join(words)
    )

    lemmas = tokens.map(
        lambda words: [lemmatizer.lemmatize(word) for word in words]
    )
    result['reviews_remove_numbers+symbols+stopwords+lemmatization'] = lemmas.map(
        lambda words: " ".join(words)
    )

    stems = tokens.map(
        lambda words: [stemmer_ss.stem(word) for word in words]
    )
    result['reviews_remove_numbers+symbols+stopwords+stemming'] = stems.map(
        lambda words: " ".join(words)
    )

    return result

In [3]:
# Load the raw reviews, drop rows whose review text repeats, then derive the
# cleaned-text columns; preview the first rows of the result.
df = (
    read_file('../dataset/b0013frnkg.json')
    .pipe(drop_duplicates)
    .pipe(basic_processing)
)
df.head()

Unnamed: 0,rating,product_id,review,review_remove_numbers,review_remove_numbers+symbols,reviews_remove_numbers+symbols+stopwords,reviews_remove_numbers+symbols+stopwords+lemmatization,reviews_remove_numbers+symbols+stopwords+stemming
0,5.0,B0013FRNKG,have ipad 1 and will probably get the ipad 2 b...,have ipad and will probably get the ipad but...,have ipad and will probably get the ipad but...,ipad probably get ipad seriously would pay wai...,ipad probably get ipad seriously would pay wai...,ipad probabl get ipad serious would pay wait b...
1,4.0,B0013FRNKG,so after picking it up friday and playing arou...,so after picking it up friday and playing arou...,so after picking it up friday and playing arou...,picking friday playing around weekend whats fi...,picking friday playing around weekend whats fi...,pick friday play around weekend what final wor...
2,4.0,B0013FRNKG,the ipad2 is a great device but the same model...,the ipad is a great device but the same model ...,the ipad is a great device but the same model ...,ipad great device model much cheaper elsewhere...,ipad great device model much cheaper elsewhere...,ipad great devic model much cheaper elsewher e...
3,4.0,B0013FRNKG,for anyone out there who is considering whethe...,for anyone out there who is considering whethe...,for anyone out there who is considering whethe...,anyone considering whether make leap purchase ...,anyone considering whether make leap purchase ...,anyon consid whether make leap purchas ipad re...
6,4.0,B0013FRNKG,first things first: i consider myself relative...,first things first: i consider myself relative...,first things first i consider myself relativel...,first things first consider relatively unbiase...,first thing first consider relatively unbiased...,first thing first consid relat unbias appl pro...
