In [1]:
# Import dependencies
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [2]:
# Read in hit csv

data_df = pd.read_csv("hits.csv")
data_df.head()

Unnamed: 0.1,Unnamed: 0,Spotify_track_id,artist_x,song_x,ID,text,peak_position,hit,hit_value
0,0,005lwxGU1tms6HGELIcUv9,Katy Perry,I Kissed A Girl,I Kissed A GirlKaty Perry,This was never the way I planned Not my intent...,1.0,Yes,1
1,1,745H5CctFr12Mo7cqa1BMH,The Temptations,My Girl,My GirlThe Temptations,I've got sunshine on a cloudy day When it's co...,1.0,Yes,1
2,2,4w2DQnTAbzmduTv0zCT5QD,Olivia Newton-John,I Honestly Love You,I Honestly Love YouOlivia Newton-John,Maybe I hang around here A little more than I ...,1.0,Yes,1
3,3,3CKCZ9pfwAfoMZlMncA1Nc,Adele,Set Fire To The Rain,Set Fire To The RainAdele,"I let it fall, my heart, And as it fell you ro...",1.0,Yes,1
4,4,74jZhGv0fdLaf9q8AZZ15k,Carpenters,Please Mr. Postman,Please Mr. PostmanCarpenters,"(Stop) Oh yes, wait a minute Mister Postman (W...",1.0,Yes,1


In [3]:
# Remove unecessary columns and rename remaining columns

data_df = data_df[["artist_x", "song_x", "text", "peak_position", "hit", "hit_value"]]
data_df = data_df.rename(columns={"artist_x": "artist", "song_x": "song"})
data_df.head()

Unnamed: 0,artist,song,text,peak_position,hit,hit_value
0,Katy Perry,I Kissed A Girl,This was never the way I planned Not my intent...,1.0,Yes,1
1,The Temptations,My Girl,I've got sunshine on a cloudy day When it's co...,1.0,Yes,1
2,Olivia Newton-John,I Honestly Love You,Maybe I hang around here A little more than I ...,1.0,Yes,1
3,Adele,Set Fire To The Rain,"I let it fall, my heart, And as it fell you ro...",1.0,Yes,1
4,Carpenters,Please Mr. Postman,"(Stop) Oh yes, wait a minute Mister Postman (W...",1.0,Yes,1


In [4]:
# Set X and y for Preprocessing

X = data_df.text
y = data_df.hit

print(X.shape)
print(y.shape)

(5462,)
(5462,)


In [5]:
# Split data into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Count number of unique words in training data using CountVectorizer
count_vect = CountVectorizer(stop_words="english")
X_train_counts = count_vect.fit_transform(X_train)
#X_train_counts.shape

# Count number of unique words in testing data
X_test_counts = count_vect.transform(X_test)
#X_test_counts.shape

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer

# Reduce weighting for common words using TF-IDF in training data
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
#X_train_tfidf.shape

# Reduce weighting for common words using TF-IDF in testing data
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
#X_test_tfidf.shape

In [8]:
from sklearn.naive_bayes import MultinomialNB

# Run Naive Bayes algorithm using training data
nb = MultinomialNB().fit(X_train_tfidf, y_train)

In [9]:
# Make predictions using test data
nb_predicted = nb.predict(X_test_tfidf)

In [43]:
# Show r2 score
nb_score = nb.score(X_test_tfidf, y_test)
print(f"Accuracy of Naive Bayes model: {nb_score}")

Accuracy of Naive Bayes model: 0.5980966325036603


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Run Random Forest algorithm with training data
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf = rf.fit(X_train_tfidf, y_train)

# Make predictions with test data
rf_predicted = rf.predict(X_test_tfidf)

In [44]:
# Show r2 score
rf_score = rf.score(X_test_tfidf, y_test)
print(f"Accuracy of Random Forest model: {rf_score}")

Accuracy of Random Forest model: 0.6193265007320644


In [15]:
from sklearn.linear_model import SGDClassifier

# Run Support Vector Machines algorithm
svm = SGDClassifier()
svm = svm.fit(X_train_tfidf, y_train)

# Make predictions
svm_predicted = svm.predict(X_test_tfidf)

In [45]:
# Show r2 score
svm_score = svm.score(X_test_tfidf, y_test)
print(f"Accuracy of Support Vector Machines model: {svm_score}")

Accuracy of Support Vector Machines model: 0.5834553440702782


In [20]:
 # Calculate classification report for each model
from sklearn.metrics import classification_report

print("Classification Reports for Natural Language Processing Models \n")
print("Naive Bayes Classification Report: \n" + classification_report(y_test, nb_predicted,
                            target_names=["Yes", "No"]))

print("Random Forest Classification Report: \n" + classification_report(y_test, rf_predicted,
                            target_names=["Yes", "No"]))

print("Support Vector Machines Classification Report: \n" + classification_report(y_test, svm_predicted,
                            target_names=["Yes", "No"]))

Classification Reports for Natural Language Processing Models 

Naive Bayes Classification Report: 
              precision    recall  f1-score   support

         Yes       0.66      0.37      0.47       668
          No       0.58      0.82      0.68       698

    accuracy                           0.60      1366
   macro avg       0.62      0.59      0.57      1366
weighted avg       0.62      0.60      0.58      1366

Random Forest Classification Report: 
              precision    recall  f1-score   support

         Yes       0.62      0.57      0.59       668
          No       0.62      0.67      0.64       698

    accuracy                           0.62      1366
   macro avg       0.62      0.62      0.62      1366
weighted avg       0.62      0.62      0.62      1366

Support Vector Machines Classification Report: 
              precision    recall  f1-score   support

         Yes       0.58      0.53      0.56       668
          No       0.59      0.63      0.61       6

# Classification reports for all three models

* Precision is true positives / (true positives + false positives). This measures the model's ability to not predict false positives.
    * Naive Bayes and Random Forest both have the highest precision with a score of 0.62
    

* Recall is true positives / (true positives + false negatives). This measures the model's ability to find all positive samples.
    * Random Forest has the highest recall score of 0.62
    

* f1-score is the average of recall and precision.
    * Precision and recall are equal for Random Forest and Support Vector Machines, so f1 is also the same.
    

* Support is the number of each label (Yes/No) in the test data


* With an average f1-score of 0.62, Random Forest is the most accurate of the three models.

In [47]:
from sklearn.metrics import confusion_matrix

# Print a confusiion matrix for each model

nb_confusion_matrix = confusion_matrix(y_test, nb_predicted)
rf_confusion_matrix = confusion_matrix(y_test, rf_predicted)
svm_confusion_matrix = confusion_matrix(y_test, svm_predicted)

print("Confusion Matrices for Natural Language Processing Models \n")
print("Matrix layout: true positive | false positive \n              false negative | true negative")
print("-" * 50)

print("Naive Bayes Confusion Matrix:")
print(nb_confusion_matrix)
print("\nRandom Forest Confusion Matrix:")
print(rf_confusion_matrix)
print("\nSupport Vector Machines Confusion Matrix:")
print(svm_confusion_matrix)


Confusion Matrices for Natural Language Processing Models 

Matrix layout: true positive | false positive 
              false negative | true negative
--------------------------------------------------
Naive Bayes Confusion Matrix:
[[246 422]
 [127 571]]

Random Forest Confusion Matrix:
[[380 288]
 [232 466]]

Support Vector Machines Confusion Matrix:
[[355 313]
 [256 442]]


# Classification reports for all three models

* Confusion matrices show the number of true and false positives and negatives in each model

* Top left and bottom right values are the correct predictions, top right and bottom left are incorrect

### Correct and incorrect predictions for each model

* Naive Bayes: 817 Correct predictions (246 tp, 571 tn), 549 Incorrect predictions (422 fp, 127 fn)
* Random Forest: 846 Correct predictions (380 tp, 466 tn), 520 Incorrect predictions (288 fp, 232 fn)
* Support Vector Machines: 797 Correct predictions (355 tp, 442 tn), 569 Incorrect predictions (313 fp, 256 fn)

### Conclusions

* As stated above, Random Forest was the most accurate of the three models with and f1-score of 0.62

* Random Forest correctly predicted the highest number of hit songs correctly and had the least false positives. However, it was not the best model at predicting songs that did not make the Billboard Hot 100.

* Naive Bayes was best at predicting non-hit songs with a total of 571 true negatives correctly predicted, 105 more than the next highest model (Random Forest). It also had the lowest number of false negatives with only 127.

* However, Naive Bayes also predicted a much higher amount of false positives with 422, 109 more than the next highest model (SVM)