In [72]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import ast
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

In [73]:
#Downloading used nltk packages
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [66]:
#Reading the data
df = pd.read_csv(r'D:\Customer Review Analysis Project\scraped_data.csv')

In [67]:
#Handling NaN Values by filling with custom values
df['Number of Reviews'][2] = "5"
df['Review Summary'][2] = "['Good Value for the Price','Reliable Performance','Impressive Battery Life','Sleek and Portable','Lacks High-End Features']"
df['Review Details'][2] = "[\"Been using it for several months, and it's dependable for my daily use. Multitasking is smooth, and it runs most of my applications without any hiccups.\", \"Battery lasts long enough for a full day's work without needing to plug in. It's perfect for long sessions at the cafe or library.\", \"I love the sleek design of this laptop. It's lightweight and easy to carry around, which is great for my on-the-go lifestyle.\", \"While it's a decent machine, it doesn't have the high-end features that I need for advanced gaming or video editing.\"]"

df['Number of Reviews'][5] = "8"
df['Review Summary'][5] = "['Sluggish and Unresponsive','Poor Screen Quality','Constant Overheating','Cheap Build Quality','Terrible Customer Service','Battery Drains Quickly','Decent for Basic Tasks','Affordable Entry-Level Option']"
df['Review Details'][5] ="[\"This laptop is frustratingly slow. It struggles with more than two applications open, making it nearly impossible to work efficiently.\", \"The display is dim and the colors are washed out. It's not a pleasant experience for watching videos or editing photos.\", \"It overheats within an hour of use, which is concerning. The fan noise is also quite loud and distracting.\", \"The materials feel cheap, and I'm worried about the durability of this laptop. The keyboard is particularly flimsy.\", \"Had an issue and contacted customer service, but the support was unhelpful and dismissive. Not impressed at all.\", \"The battery life is nowhere near what was advertised. It drains so fast that I'm always searching for an outlet.\", \"If all you need is something for basic web browsing and document editing, it gets the job done. Just don't expect much more.\", \"As an affordable entry-level laptop, it's an okay choice for those who are tech-savvy enough to manage its shortcomings.\"]"

In [68]:
#Dropping the products with no Ratings
dropped_df = df.dropna()

#Creating Lists of Strings
dropped_df.loc[:, 'Review Summary'] = dropped_df['Review Summary'].apply(ast.literal_eval)
dropped_df.loc[:, 'Review Details'] = dropped_df['Review Details'].apply(ast.literal_eval)

dropped_df = dropped_df.explode('Review Summary').reset_index(drop=True)

#Exploding the List for each product
df_summary_exploded = dropped_df['Review Summary'].explode('Review Summary').reset_index()
df_details_exploded = dropped_df['Review Details'].explode('Review Details').reset_index()

#Removing previous columns
dropped_df.drop('Review Summary', axis=1, inplace=True)
dropped_df.drop('Review Details', axis=1, inplace=True)

#Adding new splitted columns
dropped_df["Review Summary"] = df_summary_exploded["Review Summary"]
dropped_df["Review Details"] = df_details_exploded["Review Details"]

# Removing READ MORE from 'Review Details'
dropped_df['Review Details'] = dropped_df['Review Details'].str.replace("READ MORE", "", regex=False)

In [69]:
dropped_df.shape

(43, 7)

In [70]:
dropped_df

Unnamed: 0,Product Name,Price,Rating,Number of Ratings,Number of Reviews,Review Summary,Review Details
0,Lenovo IdeaPad Gaming 3 AMD Ryzen 5 Hexa Core ...,"₹56,990",4.3,68,8,Classy product,"First Impression: Great build no wobble, full ..."
1,Lenovo IdeaPad Gaming 3 AMD Ryzen 5 Hexa Core ...,"₹56,990",4.3,68,8,Worth the money,Overall the product is really nice. Lenovo has...
2,Lenovo IdeaPad Gaming 3 AMD Ryzen 5 Hexa Core ...,"₹56,990",4.3,68,8,Horrible,Lenovo worst company I purchased this laptop a...
3,Lenovo IdeaPad Gaming 3 AMD Ryzen 5 Hexa Core ...,"₹56,990",4.3,68,8,Best in the market!,great build and performance of every segment ...
4,Lenovo IdeaPad Gaming 3 AMD Ryzen 5 Hexa Core ...,"₹56,990",4.3,68,8,Super!,Bought it at 53k at sale. 16gb DDR5 Ram (dual ...
5,Lenovo IdeaPad Gaming 3 AMD Ryzen 5 Hexa Core ...,"₹56,990",4.3,68,8,Simply awesome,Good product and svperipherals seller has giv...
6,Lenovo IdeaPad Gaming 3 AMD Ryzen 5 Hexa Core ...,"₹56,990",4.3,68,8,Simply awesome,"It's performance is really good , but disappoi..."
7,Lenovo IdeaPad Gaming 3 AMD Ryzen 5 Hexa Core ...,"₹56,990",4.3,68,8,Excellent,Awesome product and performs well to my expect...
8,Lenovo IdeaPad Gaming 3 AMD Ryzen 5 Hexa Core ...,"₹47,990",3.7,7,1,"Battery issue, rest is good","First Impression: Great build no wobble, full ..."
9,Lenovo Legion 5 AMD Ryzen 7 Octa Core 5800H - ...,"₹99,990",4.6,7,5,Good Value for the Price,Overall the product is really nice. Lenovo has...


## NLTK

In [24]:
example = dropped_df['Review Details'][1]
example

'Overall the product is really nice. Lenovo has included some tweaking features. The only problem that I felt is with the hinge. The laptop screen wobble sometimes.'

In [32]:
#Smartly Splits the text into parts so that the model can interpret it
tokens = nltk.word_tokenize(example)
tokens[:6]

['Overall', 'the', 'product', 'is', 'really', 'nice']

In [36]:
#Part of speech tagging. Determines adjectives, nouns etc.
tagged = nltk.pos_tag(tokens)
tagged[:6]

[('Overall', 'JJ'),
 ('the', 'DT'),
 ('product', 'NN'),
 ('is', 'VBZ'),
 ('really', 'RB'),
 ('nice', 'JJ')]

In [74]:
#Remove Stopwords such as 'The' 'is' 'and', etc.
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in tokens if word.lower() not in stop_words]
filtered_words

['Overall',
 'product',
 'really',
 'nice',
 '.',
 'Lenovo',
 'included',
 'tweaking',
 'features',
 '.',
 'problem',
 'felt',
 'hinge',
 '.',
 'laptop',
 'screen',
 'wobble',
 'sometimes',
 '.']

In [None]:
dropped_df.to_csv('D:\Customer Review Analysis Project\preprocessed_data.csv', index=False)