In [None]:
# Amazon Sales Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

# Read the Amazon sales CSV into the working DataFrame used by the cells below.
df = pd.read_csv('amazon_sales_data.csv')

# Question 1: What is the average rating for each product category?
# Group the rating column by category and aggregate each group with its mean.
average_rating_by_category = df.groupby('category')['rating'].agg('mean')
print("Average Rating by Category:", average_rating_by_category, sep="\n")


In [None]:
# Question 2: Which product has the highest rating_count in each category?
# Rows without a rating_count can never be a category's maximum, so they are
# excluded before idxmax is taken. Otherwise a category whose rating_count
# values are all missing yields a NaN label from idxmax (an error in newer
# pandas versions), and the df.loc lookup below fails on that label.
top_rated_products = df.loc[
    df.dropna(subset=['rating_count'])
    .groupby('category')['rating_count']
    .idxmax()
]
print("Top Rated Products by Category:")
print(top_rated_products[['category', 'product_name', 'rating_count']])

In [None]:
# Question 3: What is the distribution of discounted prices vs. actual prices?
# Both price columns are drawn as histograms (each with a KDE curve) on the
# same axes so their distributions can be compared directly.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['discounted_price'], color='blue', label='Discounted Price', kde=True, ax=ax)
sns.histplot(df['actual_price'], color='red', label='Actual Price', kde=True, ax=ax)
ax.set(
    title='Distribution of Discounted Prices vs. Actual Prices',
    xlabel='Price',
    ylabel='Frequency',
)
ax.legend()
plt.show()

In [None]:
# Question 4: How does the average discount percentage vary across categories?
# Mean of discount_percentage within each category group.
average_discount_by_category = df.groupby('category')['discount_percentage'].agg('mean')
print("Average Discount Percentage by Category:", average_discount_by_category, sep="\n")

In [None]:
# Question 5: What are the most popular product names?
# Popularity is measured here as the summed rating_count per product name;
# the totals are sorted from largest to smallest and the first ten are kept.
most_popular_products = (
    df.groupby('product_name')['rating_count']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
print("Most Popular Product Names:", most_popular_products, sep="\n")

In [None]:
# Question 6: What are the most popular product keywords?
# Lower-case every non-missing about_product text and split it into word tokens.
# The tokens are collected in a single flat pass: summing a Series of lists
# concatenates them pairwise (quadratic in the total number of tokens) and
# returns the integer 0 when there is no text at all, which Counter cannot
# consume.
all_keywords = [
    keyword
    for description in df['about_product'].dropna()
    for keyword in re.findall(r'\b\w+\b', description.lower())
]
# most_common(10) returns (keyword, occurrences) pairs, most frequent first.
keyword_counts = Counter(all_keywords).most_common(10)
print("Most Popular Product Keywords:")
print(keyword_counts)

In [None]:
# Question 7: What are the most popular product reviews?
# Rows are grouped by their review_content text and rating_count is summed per
# distinct text; the ten largest totals are reported.
most_popular_reviews = (
    df.groupby('review_content')['rating_count']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
print("Most Popular Product Reviews:", most_popular_reviews, sep="\n")

In [None]:
# Question 8: What is the correlation between discounted_price and rating?
# Pearson correlation (the Series.corr default, spelled out here) between the
# two columns, reported to two decimal places.
correlation = df['discounted_price'].corr(df['rating'], method='pearson')
print(f'Correlation between Discounted Price and Rating: {correlation:.2f}')

In [None]:
# Question 9: What are the Top 5 categories based on the highest ratings?
# Rank categories by their mean rating, highest first, and keep the first five.
top_categories_by_rating = (
    df.groupby('category')['rating']
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
print("Top 5 Categories Based on Highest Ratings:", top_categories_by_rating, sep="\n")

In [None]:
# Spotify Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load the dataframe and ensure data quality by checking for missing values and duplicate rows.
# NOTE: this rebinds `df`, replacing the Amazon frame loaded earlier in the notebook.
df = pd.read_csv('spotify_hiphop_data.csv')

# Report how many missing values each column contains before any cleaning.
print("Missing Values:\n", df.isnull().sum())

# Remove exact duplicate rows, if any.
df = df.drop_duplicates()

# Drop every row that still has a missing value in any column.
df = df.dropna()

# DataFrame.info() writes its summary straight to stdout and returns None, so
# it is called as its own statement after the heading. Passing it to print()
# would emit the summary before the heading and then print a stray "None".
print("Data after cleaning:")
df.info()

In [None]:
# 2. What is the distribution of popularity among the tracks in the dataset? Visualize it using a histogram.
# 20-bin histogram of the popularity column with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['popularity'], bins=20, kde=True, ax=ax)
ax.set(
    title='Distribution of Track Popularity',
    xlabel='Popularity',
    ylabel='Frequency',
)
plt.show()

In [None]:
# 3. Is there any relationship between the popularity and the duration of tracks? Explore this using a scatter plot.
# One point per row: duration_ms on the x-axis against popularity on the y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='duration_ms', y='popularity', ax=ax)
ax.set(
    title='Relationship Between Track Duration and Popularity',
    xlabel='Duration (ms)',
    ylabel='Popularity',
)
plt.show()

In [None]:
# 4. Which artist has the highest number of tracks in the dataset? Display the count of tracks for each artist using a countplot.
# Row counts per artist, limited to the ten most frequent; the index of this
# Series fixes which artists are drawn and in what order.
top_artists = df['artist'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, y='artist', order=top_artists.index, ax=ax)
ax.set(
    title='Top 10 Artists by Number of Tracks',
    xlabel='Number of Tracks',
    ylabel='Artist',
)
plt.show()

In [None]:
# 5. What are the top 5 least popular tracks in the dataset? Provide the artist name and track name for each.
# Narrow the frame to the reported columns, then keep the five rows with the
# smallest popularity values.
least_popular_tracks = df[['artist', 'track_name', 'popularity']].nsmallest(5, 'popularity')
print("Top 5 Least Popular Tracks:\n", least_popular_tracks)

In [None]:
# 6. Among the top 5 most popular artists, which artist has the highest popularity on average?
# Calculate and display the average popularity for each artist.
# Here the "top 5" artists are the five with the most rows in the dataset
# (value_counts), not the five with the highest popularity scores.
top_5_artists = df['artist'].value_counts().head(5).index
avg_popularity = (
    df.loc[df['artist'].isin(top_5_artists)]
    .groupby('artist')['popularity']
    .mean()
    .sort_values(ascending=False)
)
print("Average Popularity for Top 5 Artists:\n", avg_popularity)

In [None]:
# 7. For the top 5 most popular artists, what are their most popular tracks? List the track name for each artist.
# Restrict to the artists in top_5_artists, order rows from most to least
# popular, then take the first row's track_name and popularity per artist.
most_popular_tracks = (
    df.loc[df['artist'].isin(top_5_artists)]
    .sort_values('popularity', ascending=False)
    .groupby('artist')[['track_name', 'popularity']]
    .first()
)
print("Most Popular Tracks for Top 5 Artists:\n", most_popular_tracks)

In [None]:
# 8. Visualize relationships between multiple numerical variables simultaneously using a pair plot.
# Only the popularity and duration_ms columns are passed to the pair plot.
sns.pairplot(data=df.loc[:, ['popularity', 'duration_ms']])
plt.show()

In [None]:
# 9. Does the duration of tracks vary significantly across different artists? Explore this visually using a box plot or violin plot.
# One box per artist in top_artists (the ten most frequent artists), drawn in
# that order.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='artist', y='duration_ms', order=top_artists.index, ax=ax)
ax.set(
    title='Track Duration Across Different Artists',
    xlabel='Artist',
    ylabel='Duration (ms)',
)
# Artist names are long; rotate the tick labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# 10. How does the distribution of track popularity vary for different artists? Visualize this using a swarm plot or a violin plot.
# One violin per artist in top_artists (the ten most frequent artists), drawn
# in that order.
fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(data=df, x='artist', y='popularity', order=top_artists.index, ax=ax)
ax.set(
    title='Distribution of Track Popularity Across Artists',
    xlabel='Artist',
    ylabel='Popularity',
)
# Artist names are long; rotate the tick labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()