In [None]:
#Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import re

# Read the exported reviews from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='reviews.csv')

# Announce that loading finished and preview the first rows
print("Data loaded successfully.", df.head(), sep="\n")

In [None]:
# Normalise column types.
# Missing review text becomes an empty string so every entry is a str.
df['content'] = df['content'].fillna('').astype(str)
# errors='coerce' turns unparseable timestamps / scores into NaT / NaN
# instead of raising.
df['at'] = pd.to_datetime(df['at'], errors='coerce')
df['score'] = pd.to_numeric(df['score'], errors='coerce')

# Report the resulting dtypes and the per-column count of missing values
print("Data cleaned successfully.", df.dtypes, sep="\n")
print("\nMissing values:\n", df.isna().sum())

In [None]:
# Summary statistics: number of reviews per score (ordered by score value)
# and the overall mean score.
score_counts = df['score'].value_counts().sort_index()
avg_score = df['score'].mean()

print(f"Average Score: {avg_score:.2f}")
print("\nScore Distribution:", score_counts, sep="\n")

In [None]:
# Most frequent words in low-scoring reviews.
# `re` and `Counter` come from the import cell at the top of the notebook,
# so they are not re-imported here.

# Step 1: Select the text of reviews with a score of 2 or lower
# (.loc with a boolean mask avoids chained indexing)
negative_reviews = df.loc[df['score'] <= 2, 'content']

# Step 2: Merge all selected reviews into a single lowercase string
negative_text = ' '.join(negative_reviews).lower()

# Step 3: Drop every character that is neither a word character nor
# whitespace, then split on whitespace to get the individual words
words = re.sub(r"[^\w\s]", '', negative_text).split()

# Step 4: Count occurrences and keep the 20 most frequent words
common_words = Counter(words).most_common(20)

# Step 5: Display results
print("Top 20 Common Words in Negative Reviews (Score ≤ 2):")
for word, count in common_words:
    print(f"{word}: {count}")

In [None]:
# Bar chart of how many reviews received each score.
# matplotlib.pyplot is already imported as `plt` in the import cell at the top
# of the notebook, so no import is needed in this cell.

# Step 1: Create the figure and its axes at the desired size
fig, ax = plt.subplots(figsize=(8, 6))

# Step 2: Plot the distribution of review scores on those axes
score_counts.plot(kind='bar', color='skyblue', ax=ax)

# Step 3: Add chart title and labels
ax.set_title('Distribution of Review Scores')
ax.set_xlabel('Score')
ax.set_ylabel('Number of Reviews')

# Step 4: Display grid and tighten the layout
ax.grid(True)
fig.tight_layout()

# Step 5: Show the plot
plt.show()