In [1]:
import pandas as pd

In [2]:
from spotify.data_transformations import DataTransformations
from spotify.train_test_split import DatasetSplit

In [3]:
from sklearn.dummy import DummyClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score

In [4]:
df = pd.read_csv(
    "/home/sabyasachi/git/spotify_tracks/dataset/tracks.csv"
)  # Read dataframe

In [5]:
DF = (df
.pipe(DataTransformations.preprocess)
.pipe(DataTransformations.process)
)

In [6]:
train_set = DatasetSplit.return_train(DF)
test_set = DatasetSplit.return_test(DF)
print("The lengths of the train and test sets are {} and {} respectively.".format(len(train_set), len(test_set)))

The lengths of the train and test sets are 84941 and 21236 respectively.


In [7]:
# X_train, X_test, y_train, t_test split
X_train = train_set.copy().drop(columns=['popularity'])
y_train = train_set['popularity']
X_test = test_set.copy().drop(columns=['popularity'])
y_test = test_set['popularity']


In [8]:
# Dummy Classifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
dummy_train_pred = dummy_clf.predict(X_train)
print(dummy_clf.score(X_train, y_train))

# precision, recall
print(f"The precision score is {precision_score(dummy_train_pred, y_train)}.")

print(f"The recall score is {recall_score(dummy_train_pred, y_train)}.")
#print(f"The AUC score is {roc_auc_score(dummy_train_pred, y_train)}")




0.8744187141663037
The precision score is 0.0.
The recall score is 0.0.


  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Decision Tree
decision_clf = tree.DecisionTreeClassifier()
decision_clf.fit(X_train, y_train)
decision_train_pred = decision_clf.predict(X_train)
print(decision_clf.score(X_train, y_train))

# precision, recall
print(f"The precision score is {precision_score(decision_train_pred, y_train)}.")

print(f"The recall score is {recall_score(decision_train_pred, y_train)}.")


0.9930893208226886
The precision score is 0.9488140995593888.
The recall score is 0.9959653611493801.


In [10]:
# Logistic Regression
logistic_clf = LogisticRegression(random_state=0)
logistic_clf.fit(X_train, y_train)
logistic_train_pred = logistic_clf.predict(X_train)
print(logistic_clf.score(X_train, y_train))

# precision, recall
print(f"The precision score is {precision_score(logistic_train_pred, y_train)}.")

print(f"The recall score is {recall_score(logistic_train_pred, y_train)}.")
#print(f"The AUC score is {roc_auc_score(logistic_train_pred, y_train)}")


0.8744187141663037
The precision score is 0.0.
The recall score is 0.0.


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Random Forests
random_clf = RandomForestClassifier(max_depth=20, random_state=0, class_weight= "balanced")
random_clf.fit(X_train, y_train)
random_train_pred = random_clf.predict(X_train)
print(random_clf.score(X_train, y_train))

print(f"The precision score is {precision_score(random_train_pred, y_train)}.")

print(f"The recall score is {recall_score(random_train_pred, y_train)}.")

0.9651640550499758
The precision score is 0.986687916002625.
The recall score is 0.7888622395442962.
