**Loading data**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import seaborn as sns

import warnings 
warnings.filterwarnings('ignore')

In [2]:
#load data

df_train = pd.read_csv('../input/song-popularity-prediction/train.csv')
df_test = pd.read_csv('../input/song-popularity-prediction/test.csv')

**Basic EDA**

In [3]:
df_train.shape

In [4]:
df_test.shape

In [5]:
df_train.head()

In [6]:
df_test.head()

In [7]:
np.sum(df_train.isnull())

**Using SimpleIMputer from SKlearn for imputing data for missing values**

In [8]:
from sklearn.impute import SimpleImputer
imptr = SimpleImputer(strategy="median", add_indicator=False)

In [9]:
#Train data
imptr = imptr.fit(df_train[['song_duration_ms','acousticness','danceability','energy','instrumentalness','key','liveness','loudness']])
df_train[['song_duration_ms','acousticness','danceability','energy','instrumentalness','key','liveness','loudness']] = imptr.transform(df_train[['song_duration_ms','acousticness','danceability','energy','instrumentalness','key','liveness','loudness']])

In [10]:
#Test data
imptr = imptr.fit(df_test[['song_duration_ms','acousticness','danceability','energy','instrumentalness','key','liveness','loudness']])
df_test[['song_duration_ms','acousticness','danceability','energy','instrumentalness','key','liveness','loudness']] = imptr.transform(df_test[['song_duration_ms','acousticness','danceability','energy','instrumentalness','key','liveness','loudness']])

In [11]:
np.sum(df_train.isnull())

In [12]:
np.sum(df_test.isnull())

In [13]:
df_train.describe()

In [14]:
df_train.audio_mode.value_counts()

In [15]:
df_train.time_signature.value_counts()

audio mode and time signature are categorical values, they will be handled later using Encoding

In [16]:
df_train.song_popularity.value_counts()
#0 -- not popular
#1 -- popular

In [17]:
pop = df_train[df_train['song_popularity'] == 1]
not_pop = df_train[df_train['song_popularity'] == 0]

print("Percentage of Songs popular", len(pop['song_popularity']) * 100 / len(df_train))
print("Percentage of Songs not popular", len(not_pop['song_popularity']) * 100 / len(df_train))

In [18]:
corr = df_train.corr()
sns.heatmap(corr,
            fmt='.1f',
            linewidth=0.2,
            linecolor='black',
            annot = True,
            cmap="YlGnBu" 
            )

**Positive corr**

loudness and energy = 0.6

loudness and audio_valence = 0.4

danceability and audio_valence = 0.4

**Negative corr**

acousticness and energy = -0.6

acousticness and loudness = -0.5

**plotting highly coorelated attributes**

In [19]:
plt.scatter(df_train.loudness,df_train.energy)
sns.set_style("whitegrid")

In [20]:
plt.scatter(df_train.energy,df_train.acousticness)
sns.set_style("whitegrid")

In [21]:
plt.scatter(df_train.loudness,df_train.acousticness)
sns.set_style("whitegrid")

In [22]:
#testing low coorelated values
plt.scatter(df_train.audio_valence,df_train.tempo)
sns.set_style("whitegrid")

In [23]:
df_train.dtypes
#audio_mode and time_signature are categorical
#other are continuous

**Encoding both the values using "One-hot" encoder**

In [19]:
#train
df_train_encoded = pd.get_dummies(df_train, columns=["audio_mode", "time_signature"])

In [20]:
#test
df_test_encoded = pd.get_dummies(df_test, columns=["audio_mode", "time_signature"])

In [21]:
df_train_encoded.head(5)

In [22]:
#dropping id column as they are not necessary for predicting
df_train = df_train_encoded.drop(['id'], axis = 1)

In [23]:
df_test = df_test_encoded.drop(['id'], axis = 1)

In [24]:
df_train.shape

In [25]:
df_test.shape

**checking distribution of data if they are normal, right or left skewed**

In [26]:
sns.distplot(df_train['song_duration_ms'],rug=True,color='#38b000')

In [32]:
sns.distplot(df_train['loudness'],rug=True,color='#38b000')

In [33]:
sns.distplot(df_train['acousticness'],rug=True,color='#38b000')

In [34]:
sns.distplot(df_train['danceability'],rug=True,color='#38b000')

In [35]:
sns.distplot(df_train['energy'],rug=True,color='#38b000')

In [36]:
sns.distplot(df_train['instrumentalness'],rug=True,color='#38b000')

In [None]:
sns.distplot(df_train['key'],rug=True,color='#38b000')

skewed data will be handled later

**Using Random Forest, KNN and XGBoost to check on predictions, will be using the optimised algorithm later**

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [28]:
rf = RandomForestClassifier()

In [29]:
df_train

In [30]:
X = df_train.copy()
#X_test = df_test.copy()

In [31]:
#X,y split
y = X['song_popularity']
X = X.drop('song_popularity', axis=1)

In [32]:
X.shape

In [33]:
X.head()

In [34]:
y.head()

In [35]:
df_test.shape

In [36]:
#Scaling data
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()

col = X.columns
X = pd.DataFrame(scaler.fit_transform(X))
X.columns = col


col = df_test.columns
df_test = pd.DataFrame(scaler.fit_transform(df_test))
df_test.columns = col

In [37]:
sns.kdeplot(df_test.loudness, shade=True,color="g")
sns.kdeplot(df_test.energy, shade=True,color="r")

In [38]:
sns.kdeplot(df_test.acousticness, shade=True,color="g")
sns.kdeplot(df_test.energy, shade=True,color="r")

In [39]:
#Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)

In [40]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

**Using Random Forest**

In [41]:
#fitting on training data
rf.fit(X_train, y_train)

In [42]:
rf.score(X_train, y_train)

In [43]:
#this cannot be ran as we don't have y_test - in the Test data, we do not have the target variable
rf.score(X_test, y_test)

In [44]:
#now prdicting the RF model on the X_train features
pred_y_train_rf = rf.predict(X_train)

In [45]:
#now prdicting the RF model on the X_test features
pred_y_test_rf = rf.predict(X_test)

In [46]:
metrics.accuracy_score(pred_y_train_rf, y_train)

In [47]:
metrics.accuracy_score(pred_y_test_rf, y_test)

In [None]:
#print('Random Forest R2 Score', metrics.r2_score(y_train,rf.predict(X_train)))
#print('Random Forest OOB Score',rf.oob_score_ )

In [48]:
print('AUC scores on the training set: ', cross_val_score(rf, X_train,y_train, cv=5, scoring='roc_auc'),
      'Mean of AUC scores: ', np.mean(cross_val_score(rf, X_train,y_train, cv=5, scoring='roc_auc')))

In [49]:
print('AUC scores on the training set: ', cross_val_score(rf, X_train,pred_y_train_rf, cv=5, scoring='roc_auc'),
      'Mean of AUC scores: ', np.mean(cross_val_score(rf, X_train,pred_y_train_rf, cv=5, scoring='roc_auc')))

In [50]:
print('AUC scores on the test set: ', cross_val_score(rf, X_test,pred_y_test_rf, cv=5, scoring='roc_auc'),
      'Mean of AUC scores: ', np.mean(cross_val_score(rf, X_test,pred_y_test_rf, cv=5, scoring='roc_auc')))

In [51]:
actual_test_rf_preds = rf.predict(df_test)

In [52]:
actual_test_rf_preds.shape

In [53]:
print('AUC scores on the test set: ', cross_val_score(rf, df_test,actual_test_rf_preds, cv=5, scoring='roc_auc'),
      'Mean of AUC scores: ', np.mean(cross_val_score(rf, df_test,actual_test_rf_preds, cv=5, scoring='roc_auc')))

In [54]:
matrix = confusion_matrix(y_test, pred_y_test_rf, labels=[1,0])
print('Confusion matrix : \n',matrix)

In [55]:
width = 2
height= 2
plt.figure(figsize=(5,5))
plt.imshow(matrix, interpolation='nearest', cmap='Pastel1')
plt.title('Confusion matrix')
plt.colorbar()
tick_marks = np.arange(10)
#plt.xticks(["0", "1"])
#plt.yticks(["0", "1"])
plt.tight_layout()
plt.ylabel('actual', size = 15)
plt.xlabel('predicted', size = 15)
width, height = matrix.shape
for x in range(width):
    for y in range(height):
        plt.annotate(str(matrix[x][y]), xy=(y, x), 
        horizontalalignment='center',
        verticalalignment='center')

In [56]:
print(classification_report(y_test, pred_y_test_rf,labels=[1,0]))
#1 -- Popular
#0 -- Not popular

**Using XGBoost**

In [57]:
from xgboost import XGBClassifier
from sklearn import model_selection
clf = XGBClassifier()

In [58]:
clf.fit(X_train, y_train)

In [60]:
clf.score(X_test, y_test)

In [61]:
pred_y_test_clf = clf.predict(X_test)

In [62]:
actual_test_clf_preds = clf.predict(df_test)

In [63]:
print('AUC scores on the test set: ', cross_val_score(clf, df_test,actual_test_clf_preds, cv=5, scoring='roc_auc'),
      'Mean of AUC scores: ', np.mean(cross_val_score(clf, df_test,actual_test_clf_preds, cv=5, scoring='roc_auc')))

In [64]:
matrix = confusion_matrix(y_test, pred_y_test_clf, labels=[1,0])
print('Confusion matrix : \n',matrix)

In [65]:
width = 2
height= 2
plt.figure(figsize=(5,5))
plt.imshow(matrix, interpolation='nearest', cmap='Pastel1')
plt.title('Confusion matrix')
plt.colorbar()
tick_marks = np.arange(10)
#plt.xticks(["0", "1"])
#plt.yticks(["0", "1"])
plt.tight_layout()
plt.ylabel('actual', size = 15)
plt.xlabel('predicted', size = 15)
width, height = matrix.shape
for x in range(width):
    for y in range(height):
        plt.annotate(str(matrix[x][y]), xy=(y, x), 
        horizontalalignment='center',
        verticalalignment='center')

In [78]:
#from sklearn.neighbors import KNeighborsClassifier

In [79]:
#knn = KNeighborsClassifier(n_neighbors=5)

In [80]:
#knn.fit(X_train, y_train)

In [81]:
#pred_y_test_knn = knn.predict(X_test)

In [82]:
#print('AUC scores on the training set: ', cross_val_score(knn, X_train,y_train, cv=5, scoring='roc_auc'),
#     'Mean of AUC scores: ', np.mean(cross_val_score(knn, X_train,y_train, cv=5, scoring='roc_auc')))

In [83]:
#print('AUC scores on the test set: ', cross_val_score(rf, X_test,pred_y_test_knn, cv=5, scoring='roc_auc'),
#      'Mean of AUC scores: ', np.mean(cross_val_score(rf, X_test,pred_y_test_knn, cv=5, scoring='roc_auc')))

**Submission** ::
Random Forest and XGBoost seems to be providing the best scores **~.54**

Random Forest AUC on X_Test is **0.83** which is good, hence submitting for the same. Will be working further on XGBoost

In [84]:
sample_submission = pd.read_csv('../input/song-popularity-prediction/sample_submission.csv')

In [85]:
sample_submission.shape

In [86]:
sample_submission.head(5)

In [87]:
output = pd.DataFrame({'id': sample_submission.index,
                       'song_popularity': actual_test_rf_preds})

In [88]:
output

In [89]:
output.to_csv('./submission.csv', index=False)