In [2]:
import pandas as pd

# Load the input CSV (the file name suggests outliers were removed upstream).
df = pd.read_csv('cleaned_data_no_outliers.csv')

# Columns whose missing values are imputed from per-artist averages.
columns_to_fill = [
    'Total_Views',
    'Total_likes',
    'Total_streams',
    'Total_reach',
    'Total_counts',
    'Spotify Popularity',
    'Amazon Playlist Count',
]

# Rows without an artist cannot be matched to a per-artist average, so they
# are dropped (not filled) before the averages are computed.
if df['Artist'].isna().any():
    print("Warning: 'Artist' column has missing values. Dropping these rows before proceeding.")
    df = df.dropna(subset=['Artist'])

# One row per artist, one column per entry of columns_to_fill. mean() skips
# NaN by default, so an entry is NaN only when every value of that column is
# missing for the artist.
artist_averages = df.groupby('Artist')[columns_to_fill].mean()

def fill_missing_with_artist_avg(row, columns_to_fill, artist_averages):
    """Fill a row's missing values with the averages of the row's artist.

    Args:
        row: Row-like object (e.g. a pandas Series) with an 'Artist' entry
            and an entry for every name in ``columns_to_fill``.
        columns_to_fill: Names of the columns to check for missing values.
        artist_averages: DataFrame indexed by artist, with one column per
            name in ``columns_to_fill``.

    Returns:
        ``row`` itself when its artist is not in ``artist_averages`` or when
        none of the columns is missing; otherwise a copy of ``row`` in which
        each missing column holds the artist's average. A missing value stays
        missing when the artist's average for that column is itself NaN.
    """
    artist = row['Artist']
    if artist not in artist_averages.index:
        return row

    missing_columns = [column for column in columns_to_fill if pd.isna(row[column])]
    if not missing_columns:
        return row

    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    filled = row.copy()
    for column in missing_columns:
        filled[column] = artist_averages.loc[artist, column]
    return filled

# Stage 1: fill each gap with the average of the row's artist.
# Vectorized equivalent of applying fill_missing_with_artist_avg to every row:
# 'Artist' is mapped to that artist's average per column, and fillna only
# touches the entries that are missing. Artists without a usable average map
# to NaN, so those gaps remain for stage 2.
df = df.assign(**{
    column: df[column].fillna(df['Artist'].map(artist_averages[column]))
    for column in columns_to_fill
})

missing_after_fill = df[columns_to_fill].isna().sum()

print("\nMissing values after attempting to fill with artist averages:")
print(missing_after_fill)

# Stage 2: anything still missing falls back to the column-wide average.
# These averages are computed after stage 1, so they include the values
# imputed from artist averages.
if missing_after_fill.any():
    print("\nStill missing values found. Considering alternative strategies:")
    global_averages = df[columns_to_fill].mean()

    df[columns_to_fill] = df[columns_to_fill].fillna(global_averages)

    missing_after_global_fill = df[columns_to_fill].isna().sum()
    print("\nMissing values after filling with global averages:")
    print(missing_after_global_fill)



Missing values after attempting to fill with artist averages:
Total_Views               519
Total_likes               522
Total_streams            2005
Total_reach               852
Total_counts              921
Spotify Popularity        361
Amazon Playlist Count     707
dtype: int64

Still missing values found. Considering alternative strategies:

Missing values after filling with global averages:
Total_Views              0
Total_likes              0
Total_streams            0
Total_reach              0
Total_counts             0
Spotify Popularity       0
Amazon Playlist Count    0
dtype: int64


In [3]:
# Write the imputed DataFrame to disk; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='final.csv', index=False)