In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_excel('contest_data_copy (1).xlsx')

In [3]:
# Checking for any non-numeric values and converting them
df.replace('-', pd.NA, inplace=True)  

In [4]:
# Convert all feature columns to numeric
for col in ['BPM', 'energy', 'danceability', 'acousticness', 'instrumentalness', 'liveness', 'speechiness', 'happiness', 'loudness']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [5]:
# Imputing missing values 
df.fillna(method='ffill', inplace=True)

In [6]:
key_dummies = pd.get_dummies(df['key'], prefix='key')
df = pd.concat([df, key_dummies], axis=1)

In [7]:
base_features = ['BPM', 'energy', 'danceability', 'acousticness', 'instrumentalness', 'liveness', 'speechiness', 'happiness', 'loudness']
key_features = key_dummies.columns.tolist()
features = base_features + key_features
df['winner'] = (df['final_place'] == 1).astype(int)

In [8]:
# Separating the dataset into historical (2009-2023) and future (2024) data
historical_data = df[df['year'] < 2024]
future_data = df[df['year'] == 2024]

In [9]:
# Preparing the historical data (data before 2024)
X_historical = historical_data[features]
y_historical = historical_data['winner']

In [10]:
# Training the model on historical data
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_historical, y_historical)

In [11]:
# Preparing features for 2024 predictions
X_future = future_data[features]

In [12]:
# Predicting the winning chances for 2024 entries
future_data['winning_chance'] = model.predict_proba(X_future)[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data['winning_chance'] = model.predict_proba(X_future)[:, 1]


In [14]:
# Displaying the top 5 most likely winners for 2024
predicted_winners = future_data[['country', 'artist_name', 'song_name', 'winning_chance']]
predicted_winners.sort_values(by='winning_chance', ascending=False, inplace=True)
print(predicted_winners.head())

       country         artist_name  \
1      Armenia            Ladaniva   
10     Estonia  5MIINUST x Puuluup   
13     Georgia     Nutsa Buzaladze   
29  San Marino              MEGARA   
21   Lithuania      Silvester Belt   

                                            song_name  winning_chance  
1                                                Jako        0.180000  
10  (nendest) narkootikumidest ei tea me (küll) mi...        0.170000  
13                                        Firefighter        0.163333  
29                                           11:11:00        0.110000  
21                                            Luktelk        0.110000  


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_winners.sort_values(by='winning_chance', ascending=False, inplace=True)
