## Machine Learning Prediction for in_playlist 

In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from node2vec import Node2Vec
import ast 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from gensim.models import Word2Vec


## Preprocessing 

### collab pre preprocessing

In [95]:


# Function to parse JSON column and extract key-value data
def parse_json_column(json_str):
    try:
        json_data = ast.literal_eval(json_str)
        if isinstance(json_data, dict):
            names = list(json_data.keys())
            counts = [item['count'] for item in json_data.values()]
            urls = [item['uri'] for item in json_data.values()]
            return pd.Series([names, counts, urls])
        else:
            return pd.Series([[], [], []])  # Empty lists for rows with invalid data
    except (ValueError, SyntaxError):
        return pd.Series([[], [], []])  # Handle parsing errors
    

#function to generate the unique number of collaborators, total published, and the highest collaborations count 
def get_collaboration_counts(df):
    # Get unique collaborators
    unique_collaborators = df['collab_names'].explode().unique()
    num_unique_collaborators = len(unique_collaborators)
    
    # Get total number of collaborations
    total_published = df['counts'].explode().sum()
    
    # Get highest collaboration count
    highest_collaboration_count = df['counts'].explode().max()
    
    return num_unique_collaborators, total_published, highest_collaboration_count



### Genre preprocessing

In [96]:

# Create an embedding for each artist by averaging the embeddings of their genres
def get_genre_embedding(genre_list, model):
    if not genre_list:
        return [0] * model.vector_size
    vectors = [model.wv[genre] for genre in genre_list if genre in model.wv]
    if vectors:
        return list(sum(vectors) / len(vectors))  # Average embedding
    else:
        return [0] * model.vector_size  # Handle missing genres





In [125]:
# read in week1.csv
week1 = pd.read_csv('week1.csv')
week2 = pd.read_csv('week2.csv')
week3 = pd.read_csv('week3.csv')

#combine the two dataframes
#week = pd.concat([week1, week2])
week = week1

In [126]:
# Apply parsing function to the column
week[['collab_names', 'counts', 'urls']] = week['collaborators'].apply(parse_json_column)
#get the counts 
week[['num_collaborators', 'total_published', 'highest_collaboration_count']] = get_collaboration_counts(week)

#------------------------------------------------------------------------------------------------------------------------------

#genre embedding
#genres_list = week['genres'].tolist()  # List of lists of genres
#word2vec_model = Word2Vec(sentences=genres_list, vector_size=50, window=3, min_count=1, workers=4, sg=1, seed=42)
#week['genre_embedding'] = week['genres'].apply(lambda x: get_genre_embedding(x, word2vec_model))

#------------------------------------------------------------------------------------------------------------------------------
#drop the columns that are not needed
#remove the columns : Unamed: 0, source, img_info, link, and api_link
week = week.drop(columns=['Unnamed: 0', 'source', 'img_info', 'link', 'api_link', 'name', 'collab_names', 'collaborators',  'urls', 'genres', 'counts','pagerank', 'closeness_cent', 'clustering', 'deg_cent', 'degree'])
week.head()

Unnamed: 0,popularity,followers,in_playlist,num_collaborators,total_published,highest_collaboration_count
0,76,503600,True,39727,217229,50
1,75,32646,True,39727,217229,50
2,51,38998,False,39727,217229,50
3,63,155041,False,39727,217229,50
4,75,11840425,False,39727,217229,50


In [127]:
# Apply parsing function to the column
week3[['collab_names', 'counts', 'urls']] = week3['collaborators'].apply(parse_json_column)
#get the counts 
week3[['num_collaborators', 'total_published', 'highest_collaboration_count']] = get_collaboration_counts(week3)

#genre embedding
#genres_list_3 = week3['genres'].tolist()  # List of lists of genres
#word2vec_model3 = Word2Vec(sentences=genres_list_3, vector_size=50, window=3, min_count=1, workers=4, sg=1, seed=42)
#week3['genre_embedding'] = week3['genres'].apply(lambda x: get_genre_embedding(x, word2vec_model3))

week3 = week3.drop(columns=['Unnamed: 0', 'source', 'genres','img_info', 'link', 'api_link', 'name', 'collaborators', 'urls', 'collab_names', 'counts'])
week3.head()

Unnamed: 0,popularity,followers,in_playlist,num_collaborators,total_published,highest_collaboration_count
0,84,7766971,True,39581,216725,50
1,94,60240128,True,39581,216725,50
2,67,2918979,False,39581,216725,50
3,81,51430114,False,39581,216725,50
4,0,2551849,False,39581,216725,50


## Random Forests

#### irrespective of the source, name, 

removing img_info, api_link, link, 


predict in_playlist based on genres, popularity, followers, pagerank, closeness_cent, clustering, deg_cent, degree  and collaborators ( you must diesct the count of collabs with the collaborator, note there are self albums in that)

In [128]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split



# Combine genre_embedding_df with other features
# Drop the original 'genre_embedding' column and reset index for alignment
X = week.drop(columns=[ 'in_playlist'])
y = week['in_playlist']


# Drop the original 'genre_embedding' column and reset index for alignment
X_test = week3.drop(columns=['in_playlist'])
y_test = week3['in_playlist'].reset_index(drop=True)

# Train Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# Predict and evaluate
y_pred = rf.predict(X_test)

# Evaluate performance
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       1.00      1.00      1.00     14882
        True       0.92      0.90      0.91       135

    accuracy                           1.00     15017
   macro avg       0.96      0.95      0.95     15017
weighted avg       1.00      1.00      1.00     15017



In [123]:
#cross validation 
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, X, y, cv=5, scoring='f1_macro')
print('F1 scores:', scores)
print('Mean F1 score:', scores.mean())

# Get feature importances
feature_importances = rf.feature_importances_
feature_names = X.columns
feature_importances_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
feature_importances_df = feature_importances_df.sort_values('importance', ascending=False)
feature_importances_df


F1 scores: [0.97805179 0.9669972  0.91513153 0.98197563 0.96759734]
Mean F1 score: 0.9619506988468981


Unnamed: 0,feature,importance
1,followers,0.840993
0,popularity,0.159007
2,num_collaborators,0.0
3,total_published,0.0
4,highest_collaboration_count,0.0


## Logistic regression 

In [124]:
#conduct a logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Create a pipeline with a standard scaler and logistic regression
lr = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))

# Train the model
lr.fit(X, y)

# Predict and evaluate
y_pred = lr.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       0.99      1.00      1.00     14882
        True       0.00      0.00      0.00       135

    accuracy                           0.99     15017
   macro avg       0.50      0.50      0.50     15017
weighted avg       0.98      0.99      0.99     15017



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### SCRAP CODE  vvvvvvvvvv include the genre embedding

In [118]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Extract the genre_embedding as separate columns
genre_embedding_df = pd.DataFrame(
    np.vstack(week['genre_embedding']),  # Stack all embeddings into a 2D array
    columns=[f'genre_emb_{i}' for i in range(len(week['genre_embedding'].iloc[0]))]  # Name the columns
).reset_index(drop=True)

# Combine genre_embedding_df with other features
# Drop the original 'genre_embedding' column and reset index for alignment
X = week.drop(columns=['genre_embedding', 'in_playlist']).reset_index(drop=True)

# Concatenate the flattened embeddings with the other features
X = pd.concat([X, genre_embedding_df], axis=1)
y = week['in_playlist'].reset_index(drop=True)

# Perform the same for the test set
genre_embedding_test_df = pd.DataFrame(
    np.vstack(week3['genre_embedding']),
    columns=[f'genre_emb_{i}' for i in range(len(week3['genre_embedding'].iloc[0]))]
).reset_index(drop=True)


# Drop the original 'genre_embedding' column and reset index for alignment
X_test = week3.drop(columns=['genre_embedding', 'in_playlist']).reset_index(drop=True)

# Concatenate the flattened embeddings with the other features
X_test = pd.concat([X_test, genre_embedding_df], axis=1)
y_test = week3['in_playlist'].reset_index(drop=True)

# Train Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# Predict and evaluate
y_pred = rf.predict(X_test)

# Evaluate performance
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


ValueError: Found input variables with inconsistent numbers of samples: [15017, 30180]

In [66]:
#now predict on week3
X = week3.drop(columns=['in_playlist'])
y = week3['in_playlist']

y_pred = rf.predict(X)
accuracy_score(y, y_pred)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- closeness_cent
- clustering
- deg_cent
- degree
- pagerank
