In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Read the hip-hop tracks CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv(
    '/kaggle/input/hip-hop/top_hiphop_artists_tracks.csv'
)

# Preview the first rows (rendered as this cell's output)
df.head()


In [None]:
# Column names, dtypes and non-null counts (info() prints directly to stdout)
df.info()

# Summary statistics for the numerical columns.
# A notebook cell only renders its *last* bare expression, so a mid-cell
# `df.describe()` would be computed and silently thrown away; print it
# explicitly so the statistics actually appear in the output.
print(df.describe())

# Number of missing values per column (rendered as this cell's output)
df.isnull().sum()


In [None]:
import matplotlib.pyplot as plt

# Side-by-side box plots to eyeball outliers in the two numeric columns
fig, (ax_popularity, ax_duration) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: Popularity
ax_popularity.boxplot(df['Popularity'])
ax_popularity.set_title('Box Plot of Popularity')
ax_popularity.set_ylabel('Popularity Score')

# Right panel: Duration
ax_duration.boxplot(df['Duration (ms)'])
ax_duration.set_title('Box Plot of Duration')
ax_duration.set_ylabel('Duration (ms)')

fig.tight_layout()
plt.show()

# Calculate outlier thresholds using IQR method
def find_outliers(column, data=None, k=1.5):
    """Return the rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1/Q3 are the
    25th/75th percentiles of the column and ``IQR = Q3 - Q1``. Rows whose value
    is missing (NaN) compare False against both fences and are never returned.

    Parameters
    ----------
    column : str
        Name of the numeric column to inspect.
    data : pandas.DataFrame, optional
        Frame to search. When omitted, the notebook-level ``df`` is used,
        which preserves the original single-argument behaviour.
    k : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of rows (all columns, original index) flagged as outliers.
    """
    # Only touch the global frame when the caller did not supply one.
    frame = df if data is None else data
    values = frame[column]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr

    # Explicit comparisons (rather than ~between) so NaN rows stay excluded.
    return frame[(values < lower_bound) | (values > upper_bound)]

# Rows flagged as outliers on the Popularity column
popularity_outliers = find_outliers('Popularity')
print(
    "\nPopularity Outliers:",
    popularity_outliers[['Artist', 'Track Name', 'Popularity']],
    sep="\n",
)

# Rows flagged as outliers on the Duration (ms) column
duration_outliers = find_outliers('Duration (ms)')
print(
    "\nDuration Outliers:",
    duration_outliers[['Artist', 'Track Name', 'Duration (ms)']],
    sep="\n",
)


In [None]:
# Tally, per artist, how many tracks were flagged in each column
popularity_outlier_counts = popularity_outliers['Artist'].value_counts()
duration_outlier_counts = duration_outliers['Artist'].value_counts()

# Show the eight artists with the most Popularity outliers
print("Artists with Popularity Outliers:", popularity_outlier_counts.iloc[:8], sep="\n")

# Show the eight artists with the most Duration outliers
print("Artists with Duration Outliers:", duration_outlier_counts.iloc[:8], sep="\n")

In [None]:
# Combine the per-artist Popularity and Duration outlier counts computed in
# the previous cell (popularity_outlier_counts / duration_outlier_counts).
# Artists present in only one Series contribute 0 for the other. Aligning the
# two indexes upcasts the result to float, so cast back to int to keep the
# values reading as counts (3 rather than 3.0).
# Note: a track flagged in both columns is counted once per column.
total_outlier_counts = (
    popularity_outlier_counts
    .add(duration_outlier_counts, fill_value=0)
    .astype(int)
)

# Display top 8 artists by total outliers
print("Top artists with combined Popularity and Duration Outliers:")
print(total_outlier_counts.nlargest(8))
