## Best Practices
* Feature Engineering ---- Preprocessing and Cleaning
* Train Test Split
* BOW | TFIDF | Word2Vec
* Train ML Algorithm/ Model
* Get Prediction



Dataset used - Amazon product data


In [50]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("bharadwaj6/kindle-reviews")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/kindle-reviews


In [51]:
import pandas as pd

# Force Python engine (bypasses C parser completely)
try:
    df = pd.read_csv('/kindle_reviews.csv',
                     engine='python',
                     on_bad_lines='skip',
                     encoding='utf-8')
    print(f"Loaded {len(df)} rows")
except Exception as e:
    print(f"Error: {e}")

Loaded 982619 rows


In [52]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200
5,5,B000F83SZQ,"[0, 0]",4,A beautiful in-depth character description mak...,"05 26, 2014",A1RK2OCZDSGC6R,ubavka seirovska,Review,1401062400
6,6,B000F83SZQ,"[0, 0]",4,I enjoyed this one tho I'm not sure why it's c...,"06 10, 2014",A2HSAKHC3IBRE6,Wolfmist,Nice old fashioned story,1402358400
7,7,B000F83SZQ,"[1, 1]",4,Never heard of Amy Brewster. But I don't need ...,"03 22, 2014",A3DE6XGZ2EPADS,WPY,Enjoyable reading and reminding the old times,1395446400
8,8,B000FA64PA,"[0, 0]",5,Darth Maul working under cloak of darkness com...,"10 11, 2013",A1UG4Q4D3OAH3A,dsa,Darth Maul,1381449600
9,9,B000FA64PA,"[0, 0]",4,This is a short story focused on Darth Maul's ...,"02 13, 2011",AQZH7YTWQPOBE,Enjolras,"Not bad, not exceptional",1297555200


In [53]:
df=df[['reviewText','overall']]

In [54]:
df.head()

Unnamed: 0,reviewText,overall
0,I enjoy vintage books and movies so I enjoyed ...,5
1,This book is a reissue of an old one; the auth...,4
2,This was a fairly interesting read. It had ol...,4
3,I'd never read any of the Amy Brewster mysteri...,5
4,"If you like period pieces - clothing, lingo, y...",4


In [55]:
df.shape

(982619, 2)

In [56]:
df.isnull().sum()

Unnamed: 0,0
reviewText,22
overall,0


In [57]:
df['overall'].unique()

array([5, 4, 3, 2, 1])

In [58]:
df['overall'].value_counts()

Unnamed: 0_level_0,count
overall,Unnamed: 1_level_1
5,575264
4,254013
3,96194
2,34130
1,23018


## Preprocessing and Cleaning

In [59]:
df['overall'] = df['overall'].apply(lambda x: 1 if x>3 else 0)

In [60]:
df['overall'].value_counts()

Unnamed: 0_level_0,count
overall,Unnamed: 1_level_1
1,829277
0,153342


In [70]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [72]:
df=df.head(10000)

In [73]:
import re
import pandas as pd
import bs

# Fill NaN values with empty string before processing
df['reviewText'] = df['reviewText'].fillna('')

# Now apply the regex
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub('[^a-z A-z 0-9-]+', '', x))

# Remove stopwords (handle empty strings)
from nltk.corpus import stopwords
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join([y for y in x.split() if y not in stopwords.words('english')]))

In [74]:
# Removing special characters
df['reviewText']=df['reviewText'].apply(lambda x:re.sub('[^a-z A-z 0-9-]+', '',x))
# ## Remove the stopswords
df['reviewText']=df['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))
## Remove url
df['reviewText']=df['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))
## Remove html tags
df['reviewText']=df['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())
## Remove any additional spaces
df['reviewText']=df['reviewText'].apply(lambda x: " ".join(x.split()))

NameError: name 'BeautifulSoup' is not defined