In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Set visualization style
sns.set(style='whitegrid')


In [2]:
# Step 2: Load the Data
# Load the dataset
url = 'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/1429_1.csv'
# https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews
df = pd.read_csv(url)


HTTPError: HTTP Error 404: Not Found

In [None]:
# Step 3: Understand the Data
# Display the first few rows
print(df.head())

# Get basic information
print(df.info())

# Summary statistics
print(df.describe(include='all'))


In [None]:
# Step 4: Handle Missing Data
# Check for missing values
print(df.isnull().sum())

# Handle missing values
df.dropna(subset=['Text', 'Score'], inplace=True)


In [None]:
# Step 5: Data Visualization
# Distribution of review scores
sns.countplot(x='Score', data=df)
plt.title('Distribution of Review Scores')
plt.show()

# Most common words in reviews
all_words = ' '.join([text for text in df['Text']])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most Common Words in Reviews')
plt.show()

# Number of reviews per product
top_products = df['ProductId'].value_counts().head(10)
sns.barplot(y=top_products.index, x=top_products.values)
plt.title('Top 10 Products by Number of Reviews')
plt.show()

# Number of reviews per user
top_users = df['UserId'].value_counts().head(10)
sns.barplot(y=top_users.index, x=top_users.values)
plt.title('Top 10 Users by Number of Reviews')
plt.show()


In [None]:
# Step 6: Univariate Analysis
# Distribution of helpfulness numerator
sns.histplot(df['HelpfulnessNumerator'], kde=True, bins=30)
plt.title('Distribution of Helpfulness Numerator')
plt.show()

# Distribution of helpfulness denominator
sns.histplot(df['HelpfulnessDenominator'], kde=True, bins=30)
plt.title('Distribution of Helpfulness Denominator')
plt.show()


In [None]:
# Step 7: Bivariate Analysis
# Score vs Helpfulness numerator
sns.scatterplot(x='HelpfulnessNumerator', y='Score', data=df)
plt.title('Score vs Helpfulness Numerator')
plt.show()

# Score vs Helpfulness denominator
sns.scatterplot(x='HelpfulnessDenominator', y='Score', data=df)
plt.title('Score vs Helpfulness Denominator')
plt.show()

# Score vs Time
df['Time'] = pd.to_datetime(df['Time'], unit='s')
df['Year'] = df['Time'].dt.year
sns.boxplot(x='Year', y='Score', data=df)
plt.title('Score vs Year')
plt.xticks(rotation=90)
plt.show()


In [None]:
# Step 8: Multivariate Analysis
# Pair plot for selected features
sns.pairplot(df[['Score', 'HelpfulnessNumerator', 'HelpfulnessDenominator']])
plt.show()


In [None]:
# Step 9: Identify and Handle Outliers
# Box plot to identify outliers in HelpfulnessNumerator
sns.boxplot(x=df['HelpfulnessNumerator'])
plt.title('Boxplot of Helpfulness Numerator')
plt.show()

# Removing outliers from HelpfulnessNumerator
Q1 = df['HelpfulnessNumerator'].quantile(0.25)
Q3 = df['HelpfulnessNumerator'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['HelpfulnessNumerator'] < (Q1 - 1.5 * IQR)) | (df['HelpfulnessNumerator'] > (Q3 + 1.5 * IQR)))]

# Box plot to identify outliers in HelpfulnessDenominator
sns.boxplot(x=df['HelpfulnessDenominator'])
plt.title('Boxplot of Helpfulness Denominator')
plt.show()


In [None]:
# Step 10: Feature Engineering
# Create additional features or normalize data if needed
# For example, calculate helpfulness ratio
df['HelpfulnessRatio'] = df['HelpfulnessNumerator'] / df['HelpfulnessDenominator']
df['HelpfulnessRatio'].fillna(0, inplace=True)  # Fill NaN values with 0


In [None]:
# Step 11: Summary and Insights
# Summarize key findings
print("Key Insights:")

# Review score distribution
score_dist = df['Score'].value_counts()
print(f"Review Score Distribution:\n{score_dist}")

# Average helpfulness ratio
avg_helpfulness_ratio = df['HelpfulnessRatio'].mean()
print(f"Average Helpfulness Ratio: {avg_helpfulness_ratio}")

# Most reviewed products
top_products = df['ProductId'].value_counts().head(10)
print(f"Most Reviewed Products:\n{top_products}")

# Insights from word cloud
print("The most common words in the reviews provide insights into the frequent topics and sentiments expressed by customers.")


Findings:
1. Review Score Distribution: Understanding the distribution of review scores provides insights into customer satisfaction levels.
2. Helpfulness of Reviews: Analyzing the helpfulness numerator and denominator offers a view of how useful other users find the reviews.
3. Top Products and Users: Identifying the top products and users by number of reviews provides insights into the most popular items and active reviewers.
4. Common Review Words: The word cloud visualization helps in understanding the most frequent words used in reviews, giving insights into common topics and sentiments.