# Introduction to Data Preprocessing

## Importance of Data Quality
Data quality is crucial because the performance of machine learning models depends heavily on the quality of the data used for training. Poor quality data can lead to inaccurate models, misleading conclusions, and suboptimal decision-making. High-quality data should be accurate, complete, consistent, and relevant.

Key aspects of data quality include:
1. Accuracy: Correctness of data values.
2. Completeness: All necessary data is present.
3. Consistency: Data should be consistent across different sources.
4. Relevance: Data should be relevant to the problem being solved.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load the Spotify Tracks Dataset
We'll use the Spotify Tracks Dataset available on Kaggle. This dataset contains information about Spotify songs across different genres, along with their audio features.

Link to the dataset: [Spotify Tracks](https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset?resource=download)

In [None]:
# Load the Spotify Tracks Dataset into a pandas DataFrame.
# NOTE(review): the path is relative, so 'spotify_songs.csv' must be in the
# notebook's working directory — confirm the Kaggle download was saved under this name.
df = pd.read_csv('spotify_songs.csv')

# Display the first few rows of the dataset.
# df.head() is the last expression of the cell, so the notebook renders its
# result (head() returns the first 5 rows by default).
print("First few rows of the Spotify Tracks Dataset")
df.head()

## 2. Initial Data Exploration

In [None]:
# Check the shape of the dataset: a (number_of_rows, number_of_columns) tuple
df.shape

In [None]:
# Check the columns in the dataset (the Index of column labels)
df.columns

In [None]:
# Check the data type pandas inferred for each column; columns reported as
# 'object' hold non-numeric data such as text
print("Data types of each column:\n", df.dtypes)

In [None]:
# Get basic information about the dataset: index range, per-column non-null
# counts and dtypes, and memory usage (info() prints directly to stdout)
df.info()

In [None]:
# Get a statistical summary of the dataset (count, mean, std, min, quartiles, max).
# With default arguments describe() summarises only the numeric columns.
df.describe()

## Common Data Issues

In [None]:
# Check for missing values: isnull() flags every NaN/None cell and sum()
# adds the flags up per column, giving the number of missing entries in each column
print("\nMissing values in each column:")
print(df.isnull().sum())

In [None]:
# Handle missing values (if any)
# Assuming 'track_name' and 'artists' should not have missing values:
# drop every row where either of those two columns is missing.
# inplace=True modifies df itself; missing values in other columns are left untouched.
df.dropna(subset=['track_name', 'artists'], inplace=True)

In [None]:
# Re-count missing values per column after the cleanup.
# Only rows missing 'track_name' or 'artists' were dropped, so any other
# column can still report a non-zero count here.
df.isnull().sum()

In [None]:
# Check for duplicate rows: duplicated() marks a row only when it repeats an
# earlier row in *every* column, and sum() counts the marked rows.
# NOTE(review): if the CSV carries a unique row-id/index column, no row will
# ever be flagged — confirm against the column list.
duplicate_rows = df.duplicated().sum()
print("Number of duplicate rows:", duplicate_rows)

In [None]:
# Drop duplicate rows, keeping the first occurrence of each (the default);
# inplace=True modifies df itself
df.drop_duplicates(inplace=True)

In [None]:
# Verify that duplicates are removed: the printed count should now be 0
print(df.duplicated().sum())

In [None]:
# Check the shape of the cleaned dataset; compare with the shape printed
# earlier to see how many rows the cleaning steps removed
df.shape

## 3. Visualization

In [None]:
# Histogram with a KDE overlay for each numerical feature, one subplot per
# feature on a 3x4 grid (11 features, so the last grid slot stays empty).
numerical_features = [
    'popularity', 'duration_ms', 'danceability', 'energy', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo',
]
plt.figure(figsize=(15, 10))
for position, column in enumerate(numerical_features, start=1):
    # Subplot positions are 1-based, hence start=1.
    plt.subplot(3, 4, position)
    sns.histplot(df[column], kde=True)
    plt.title(column)
# Keep titles and axis labels of neighbouring subplots from overlapping.
plt.tight_layout()
plt.show()

In [None]:
# Bar plot of the number of tracks for the 20 artists with the most tracks.
# value_counts() sorts by frequency, so the first 20 index entries are the top 20.
top_artists = df['artists'].value_counts().index[:20]

# Plot only the rows of those artists. 'artists' is also used as hue, and hue
# levels are inferred from the data passed in; restricting the data keeps the
# hue mapping to the 20 artists that are actually drawn instead of every
# artist in the dataset.
top_artist_tracks = df[df['artists'].isin(top_artists)]

plt.figure(figsize=(12, 6))
sns.countplot(
    y='artists',
    data=top_artist_tracks,
    order=top_artists,
    hue='artists',
    hue_order=top_artists,  # same order as the bars, so colors follow the ranking
)
plt.title('Top 20 Artists by Number of Tracks')
plt.xlabel('Count of Tracks')
plt.ylabel('Artists')
plt.tight_layout()
plt.show()

In [None]:
# Horizontal count plot of track genres, limited to the 25 most frequent
# genres and ordered from most to least common.
genre_order = df['track_genre'].value_counts().index[:25]

plt.figure(figsize=(12, 6))
sns.countplot(data=df, y='track_genre', hue='track_genre', order=genre_order)
plt.ylabel('Genre')
plt.xlabel('Count')
plt.title('Distribution of Music Genres')
plt.tight_layout()
plt.show()

In [None]:
# Count plot for 'explicit': number of explicit vs non-explicit tracks.
plt.figure(figsize=(10, 6))
# y='explicit' puts the categories on the vertical axis, so the counts run
# along the horizontal axis; the axis labels below follow that orientation.
sns.countplot(y='explicit', data=df, hue='explicit')
plt.xlabel('Count')
plt.ylabel('Explicit')
plt.title('Count Plot: Explicit vs Non-explicit Tracks')
plt.show()

In [None]:
# Number of tracks in each mode; the categories sit on the x-axis and the
# same column is used as hue so each bar gets its own color.
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='mode', hue='mode')
plt.title('Count Plot: Mode (Minor vs Major)')
plt.ylabel('Count')
plt.xlabel('Mode (0 = Minor, 1 = Major)')
plt.show()

In [None]:
# Number of tracks in each musical key; the same column drives both the
# x-axis categories and the bar colors.
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='key', hue='key')
plt.title('Count Plot: Distribution of Keys')
plt.ylabel('Count')
plt.xlabel('Key')
plt.show()

In [None]:
# Number of tracks per time signature; the same column drives both the
# x-axis categories and the bar colors.
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='time_signature', hue='time_signature')
plt.title('Count Plot: Distribution of Time Signatures')
plt.ylabel('Count')
plt.xlabel('Time Signature')
plt.show()

In [None]:
# Scatter plot of track duration (duration_ms column) against popularity.
# Only the first 100 rows are plotted.
first_rows = df.head(100)

plt.figure(figsize=(8, 6))
plt.scatter(first_rows['duration_ms'], first_rows['popularity'])
plt.xlabel('Duration')
plt.ylabel('Popularity')
plt.title('Duration vs Popularity')
plt.show()

In [None]:
# Box plot comparing the popularity distribution of the two modes
# (one box per value of 'mode').
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='mode', y='popularity')
plt.title('Box Plot: Popularity by Mode')
plt.ylabel('Popularity')
plt.xlabel('Mode (0 = Minor, 1 = Major)')
plt.show()

In [None]:
# Distribution of the popularity score: 20-bin histogram with a KDE curve on top.
popularity = df['popularity']
sns.histplot(popularity, kde=True, bins=20)
plt.title('Distribution of Popularity')
plt.ylabel('Frequency')
plt.xlabel('Popularity')
plt.tight_layout()
plt.show()

In [None]:
# Share of tracks in each musical key, shown as a pie chart.
# value_counts() returns one count per distinct key, most frequent first.
key_counts = df['key'].value_counts()

plt.figure(figsize=(8, 8))
plt.pie(
    key_counts,
    labels=key_counts.index,  # label each wedge with its key value
    startangle=140,
    autopct='%1.1f%%',        # print each wedge's percentage with one decimal
)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Percentage of Keys')
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# Only int64/float64 columns take part; every other dtype is left out.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

# Pairwise correlation between those columns (DataFrame.corr() defaults to Pearson).
numeric_frame = df[numeric_columns]
corr = numeric_frame.corr()

# Annotated heatmap: each cell shows its coefficient rounded to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Pair plot: scatter plots for every pair of the selected features, with a
# KDE curve on the diagonal.
numerical_columns = ['popularity', 'danceability', 'energy', 'valence', 'tempo']

# Only the first 50 rows are plotted.
pairplot_rows = df[numerical_columns].head(50)
sns.pairplot(pairplot_rows, plot_kws={'alpha': 0.5}, diag_kind='kde')
# y=1.02 places the figure title just above the grid of subplots.
plt.suptitle('Pair Plots', y=1.02)
plt.show()


In [None]:
# Bar plot of the mean popularity of tracks for each time signature
# (one plain bar per time signature).
time_signature_popularity = df.groupby('time_signature')['popularity'].mean()

signatures = time_signature_popularity.index
plt.bar(signatures, time_signature_popularity, color='skyblue')
plt.ylabel('Mean Popularity')
plt.xlabel('Time Signature')
plt.title('Popularity of Tracks per Time Signature')
plt.tight_layout()
plt.show()

In [None]:
# Kernel density estimate of the danceability scores, drawn as a filled blue curve.
danceability = df['danceability']

plt.figure(figsize=(10, 6))
sns.kdeplot(danceability, color='b', fill=True)
plt.ylabel('Density')
plt.xlabel('Danceability')
plt.title('KDE Plot: Danceability')
plt.show()

In [None]:
pip install wordcloud

In [None]:
from wordcloud import WordCloud

# Build one long string from every album name: missing names are dropped and
# each value is cast to str so join() never sees a non-string.
album_name = ' '.join(df['album_name'].dropna().astype(str))

# Generate a word cloud; words that occur more often in the text are drawn larger
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(album_name)

# Plot the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the image has no meaningful axes
# The cloud is built from the album_name column, so the title names albums.
plt.title('Word Cloud: Album Names')
plt.show()