In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score

# Train and evaluate a binary "high popularity" classifier on Spotify track
# audio features.
#
# Pipeline: load CSV -> derive binary target -> train/test split ->
# mean-impute -> standardize -> random forest -> accuracy + per-class report.
# Every preprocessing step is fit on the training split only, so no statistic
# computed from the held-out rows can influence the model.

# Load the CSV file (path is relative to the working directory).
spotify_data = pd.read_csv("spotify-tracks.csv")

# Tracks whose 'popularity' is at or above this value are labelled positive.
popularity_threshold = 70

# Create the binary target variable 'high_popularity' (1 = popular, 0 = not).
spotify_data['high_popularity'] = (spotify_data['popularity'] >= popularity_threshold).astype(int)

# Select the numeric audio features used as model inputs.
features = ['duration_ms', 'danceability', 'energy', 'loudness', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
X = spotify_data[features]
y = spotify_data['high_popularity']

# Split BEFORE any fitting. The partition depends only on the number of rows
# and random_state, so it is the same row split as splitting an imputed copy.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Handle missing values with the column mean. The means are learned from the
# training rows only and then applied to the test rows; fitting on the full
# matrix would leak test-set information into the imputed training data.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Full imputed matrix (using the training means). Not used for training or
# evaluation below; kept only so existing references to X_imputed still work.
X_imputed = imputer.transform(X)

# Standardize the features, again fitting on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Random Forest classifier; random_state makes the run reproducible.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train_scaled, y_train)

# Predict labels for the held-out rows.
y_pred = classifier.predict(X_test_scaled)

# Evaluate the model.
# NOTE(review): the positive class is a small minority of the test rows, so
# overall accuracy is dominated by class 0 -- judge the model by the per-class
# precision/recall in the report rather than by accuracy alone.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)


Accuracy: 0.9656
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      4562
           1       0.92      0.23      0.36       207

    accuracy                           0.97      4769
   macro avg       0.94      0.61      0.67      4769
weighted avg       0.96      0.97      0.96      4769

