In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the 1990s file of the Spotify hit-predictor dataset into a DataFrame.
csv_path = '../input/the-spotify-hit-predictor-dataset/dataset-of-90s.csv'
dataset = pd.read_csv(csv_path)
dataset.head()  # preview the first rows

## Exploratory Data Analysis

In [None]:
# Names of the columns that contain at least one missing value.
# The earlier `> 1` comparison silently skipped columns with exactly one NaN.
missing_counts = dataset.isnull().sum()
features_with_na = missing_counts[missing_counts > 0].index.tolist()
features_with_na

In [None]:
# Number of missing values in each column of `dataset`.
dataset.isnull().sum() # author's note: no null values were observed

In [None]:
# Every column whose dtype is not `object` is treated as numerical.
numerical_feature = [
    column for column, dtype in dataset.dtypes.items() if dtype != "O"
]
numerical_feature

In [None]:
# Numerical columns with fewer than 15 distinct values (NaN counted as a value)
# are treated as discrete.
discrete_feature = [
    column
    for column in numerical_feature
    if dataset[column].nunique(dropna=False) < 15
]
discrete_feature

In [None]:
# One bar chart per discrete feature: median of `target` for each level of the feature.
for feature in discrete_feature:
    data = dataset.copy()  # plot from a copy so `dataset` is never touched
    median_target = data.groupby(feature)["target"].median()
    median_target.plot.bar()
    plt.xlabel(feature)
    plt.ylabel("target")
    plt.show()

In [None]:
# Numerical columns that were not classified as discrete are treated as continuous.
discrete_lookup = set(discrete_feature)
continuous_feature = [
    column for column in numerical_feature if column not in discrete_lookup
]
continuous_feature

In [None]:
# Histogram (20 bins) of every continuous feature.
for feature in continuous_feature:
    data = dataset.copy()  # plot from a copy so `dataset` is never touched
    column_values = data[feature]
    column_values.hist(bins=20)
    plt.xlabel(feature)
    plt.ylabel("count")
    plt.show()

In [None]:
# Preview how a log transform reshapes each continuous feature.
# Start from a fresh copy so this cell does not rely on (or keep mutating)
# whatever `data` frame an earlier cell happened to leave behind.
data = dataset.copy()
for feature in continuous_feature:
    # log is undefined for zero and negative values; skip such columns
    # instead of filling them with -inf / NaN.
    if (data[feature] <= 0).any():
        continue
    data[feature] = np.log(data[feature])
    data[feature].hist(bins=20)
    plt.xlabel(feature)
    plt.ylabel("counts")
    plt.show()

In [None]:
# Box plot of each log-transformed continuous feature, to eyeball outliers.
for feature in continuous_feature:
    data = dataset.copy()  # fresh copy per feature; `dataset` stays untouched
    # log is undefined for zero and negative values; skip such columns
    # instead of plotting -inf / NaN.
    if (data[feature] <= 0).any():
        continue
    data[feature] = np.log(data[feature])
    data.boxplot(column=feature)
    plt.ylabel(feature)
    plt.title(feature)
    plt.show()

In [None]:
# Columns stored with the `object` dtype are treated as categorical.
categorical_features = [
    column for column, dtype in dataset.dtypes.items() if dtype == "O"
]
categorical_features

In [None]:
# Report how many distinct values each categorical column holds.
message = "The Feature is {} and the no of categories are: {}"
for feature in categorical_features:
    n_categories = len(dataset[feature].unique())
    print(message.format(feature, n_categories))

## Feature Engineering

In [None]:
# Pairwise Pearson correlation of the numeric columns.
# The object columns are still in `dataset` here, and recent pandas raises when
# asked to correlate them, so restrict to numeric dtypes explicitly.
dataset.select_dtypes(include="number").corr()

In [None]:
# Heat map of the pairwise Pearson correlations between numeric columns.
# Object columns are excluded explicitly because recent pandas raises on them.
sns.heatmap(dataset.select_dtypes(include="number").corr())

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Only numeric columns are considered. For every pair of numeric columns the
    one that appears later in the frame is reported when the absolute Pearson
    correlation of the pair is at least ``threshold``, so strong negative
    relationships are detected as well as positive ones.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose numeric columns are compared pairwise.
    threshold : float
        Minimum absolute correlation for a column to be reported.

    Returns
    -------
    set
        Names of the columns that are candidates for removal.
    """
    # Non-numeric columns cannot be correlated (recent pandas raises on them).
    corr_matrix = dataset.select_dtypes(include="number").corr().abs()
    columns = corr_matrix.columns
    col_corr = set()
    for i in range(len(columns)):
        # Look only at the strictly-lower triangle: pairs (i, j) with j < i.
        if (corr_matrix.iloc[i, :i] >= threshold).any():
            col_corr.add(columns[i])
    return col_corr

In [None]:
# Columns whose correlation with an earlier column reaches the 0.7 cut-off.
correlation_features = correlation(dataset, threshold=0.7)
correlation_features

In [None]:
# Log-transform the skewed continuous features of `dataset` in place and plot the result.
# NOTE: this cell mutates `dataset`; running it twice applies the log twice, so
# re-load the data before re-running it.
for feature in continuous_feature:
    # log is undefined for zero and negative values; leave such columns
    # untouched instead of filling them with -inf / NaN.
    if (dataset[feature] <= 0).any():
        continue
    dataset[feature] = np.log(dataset[feature])
    dataset[feature].hist(bins=20)
    plt.xlabel(feature)
    plt.ylabel("counts")
    plt.show()

In [None]:
# Remove the two columns the author judged to be highly correlated with others.
dataset = dataset.drop(columns=["loudness", "sections"])

In [None]:
# Summary statistics of the remaining numeric columns.
dataset.describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Everything except the label and the three identifier/text columns is scaled.
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split that happens later in the notebook.
non_feature_columns = ["target", "artist", "uri", "track"]
feature = [column for column in dataset.columns if column not in non_feature_columns]
scaler = MinMaxScaler()
scaler.fit(dataset[feature])

In [None]:
# Apply the fitted MinMaxScaler to the selected feature columns.
# NOTE(review): `x` is not referenced again in the visible cells; the later
# concat cell calls `scaler.transform` a second time.
x = scaler.transform(dataset[feature]) #scaling data

In [None]:
# Preview `dataset`; the scaler output is not written back into this frame.
dataset.head()

In [None]:
# Rebuild the modelling frame: label and identifier columns first, then the scaled features.
id_and_label = dataset[["target", "artist", "uri", "track"]].reset_index(drop=True)
scaled_features = pd.DataFrame(scaler.transform(dataset[feature]), columns=feature)
data = pd.concat([id_and_label, scaled_features], axis=1)

In [None]:
# Preview the combined frame (label, identifier columns, scaled features).
data.head()

In [None]:
# The identifier/text columns are not used as model inputs; remove them.
data = data.drop(columns=["artist", "uri", "track"])

In [None]:
# Preview the modelling frame after the identifier columns were removed.
data.head()

## Selecting Threshold for Classification

In [None]:
# Feature matrix: every column except the label. Selecting by name instead of
# position keeps this correct even if the column order changes.
X = data.drop(columns="target")
# Label as a 1-D Series. A one-column DataFrame makes scikit-learn estimators
# emit a "column-vector y was passed" DataConversionWarning on every fit.
Y = data["target"]
Y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=30)
x_train, x_test, y_train, y_test = split_parts

In [None]:
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (liblinear solver), scored by ROC-AUC on both splits.
lr = LogisticRegression(solver='liblinear')
lr.fit(x_train, y_train)
ytrain_pred, ytest_pred = lr.predict_proba(x_train), lr.predict_proba(x_test)
train_auc = roc_auc_score(y_train, ytrain_pred[:, 1])  # column 1 = probability of class 1
test_auc = roc_auc_score(y_test, ytest_pred[:, 1])
print("roc train score :", train_auc)
print("roc test score :", test_auc)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest, scored by ROC-AUC on both splits.
# A fixed seed makes the forest (and every number derived from it further
# down, such as the tuned threshold) reproducible across runs.
random_model = RandomForestClassifier(random_state=30)
random_model.fit(x_train,y_train)
ytrain_pred = random_model.predict_proba(x_train)
ytest_pred = random_model.predict_proba(x_test)
print("roc train score :",roc_auc_score(y_train,ytrain_pred[:,1]))
print("roc test score :",roc_auc_score(y_test,ytest_pred[:,1]))

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default settings, scored by ROC-AUC on both splits.
ada_model = AdaBoostClassifier()
ada_model.fit(x_train, y_train)
ytrain_pred, ytest_pred = ada_model.predict_proba(x_train), ada_model.predict_proba(x_test)
train_auc = roc_auc_score(y_train, ytrain_pred[:, 1])  # column 1 = probability of class 1
test_auc = roc_auc_score(y_test, ytest_pred[:, 1])
print("roc train score :", train_auc)
print("roc test score :", test_auc)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with default settings, scored by ROC-AUC on both splits.
# This cell previously built a second AdaBoostClassifier under the name
# `knn_model`, so the "KNN" scores and its vote in the ensemble were just
# AdaBoost counted twice.
knn_model = KNeighborsClassifier()
knn_model.fit(x_train,y_train)
ytrain_pred = knn_model.predict_proba(x_train)
ytest_pred  = knn_model.predict_proba(x_test)
print("roc train score :",roc_auc_score(y_train,ytrain_pred[:,1]))
print("roc test score :",roc_auc_score(y_test,ytest_pred[:,1]))

In [None]:
# Positive-class probabilities of every fitted model on the test split,
# averaged row-wise into a single ensemble score.
ensemble_members = [lr, random_model, ada_model, knn_model]
pred = [
    pd.Series(member.predict_proba(x_test)[:, 1]) for member in ensemble_members
]
final_prediction = pd.concat(pred, axis=1).mean(axis=1)
print("test ROC-AUC", roc_auc_score(y_test, final_prediction))

In [None]:
# Side-by-side table of the per-model positive-class probabilities (one column per model).
pd.concat(pred,axis=1)

In [None]:
# ROC curve of the averaged ensemble score on the test split.
# NOTE(review): `auc` is imported but not used in the visible cells.
from sklearn.metrics import roc_curve, auc #false-positive rates, true-positive rates and thresholds
fpr,tpr,threshold = roc_curve(y_test,final_prediction)
# Candidate decision thresholds returned by roc_curve.
threshold

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the averaged ensemble score at every candidate ROC threshold.
# A sample is labelled positive when its score is >= the threshold: that is
# how roc_curve defines its thresholds and how the final cell applies the
# chosen cut-off, so the comparison here must not be the strict `>`.
accuracies = [
    accuracy_score(y_test, np.where(final_prediction >= thres, 1, 0), normalize=True)
    for thres in threshold
]

# Threshold/accuracy table, best accuracy first.
accuracy_ls = pd.DataFrame(
    {'threshold': threshold, 'accuracy': accuracies}
).sort_values(by='accuracy', ascending=False)
accuracy_ls.head()


In [None]:
# Full accuracy-vs-threshold table, as sorted by the previous cell.
# NOTE(review): this renders every row; `.head()` is usually enough.
accuracy_ls #accuracy vs threshold

## Final Model and Results

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Final model: a random forest with the Gini split criterion.
# A fixed seed keeps the reported scores reproducible across runs.
random_model = RandomForestClassifier(criterion='gini', random_state=30)
random_model.fit(x_train,y_train)
# Hard 0/1 labels, reused by the precision/recall/confusion-matrix cells.
y_pred = random_model.predict(x_test)
# ROC-AUC must be computed from a ranking score (the positive-class
# probability), not from hard labels, which reduce the curve to one point.
y_score = random_model.predict_proba(x_test)[:, 1]
print("roc test score :",roc_auc_score(y_test,y_score))

In [None]:
# Apply a fixed probability cut-off to the random forest and report test accuracy.
# NOTE(review): the value is hard-coded (recorded by the author as the best
# threshold) and this rebinds `threshold`, which earlier held the roc_curve array.
threshold = 0.493468

predicted_proba = random_model.predict_proba(x_test)
positive_proba = predicted_proba[:, 1]  # column 1 = probability of class 1
predicted = (positive_proba >= threshold).astype('int')

accuracy = accuracy_score(y_test, predicted)
print(accuracy)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Precision, recall and F1 of the hard predictions held in `y_pred`.
p_score, r_score, f1 = (
    metric(y_test, y_pred) for metric in (precision_score, recall_score, f1_score)
)
print('p_score', p_score)
print('r_score', r_score)
print('f1 score', f1)

In [None]:
# Confusion matrix of the hard predictions in `y_pred` against the test labels
# (rows are true classes, columns are predicted classes).
from sklearn.metrics import confusion_matrix # confusion matrix
confusion_matrix(y_test,y_pred)