In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# Collect the path of every file below the Kaggle input directory.
# os.walk yields directories and files in an arbitrary, filesystem-dependent order,
# so the result is sorted: positional access such as data_paths[0] then resolves to
# the same file on every run.
data_paths = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)

In [None]:
# Columns removed right after loading. Judging by their names they look like
# identifiers, links and titles rather than audio features -- confirm against the CSV.
dropped_columns = ['title', 'Unnamed: 0', 'id', 'uri', 'track_href', 'analysis_url', 'type', 'song_name']

# Read the first discovered file and discard the columns above in one chained step.
data = pd.read_csv(data_paths[0]).drop(columns=dropped_columns)
print(data.columns)
data.head()


In [None]:
# Number of missing values per column (isnull is the pandas alias of isna).
data.isnull().sum()

## Data Analysis

In [None]:
# (rows, columns) of the frame after the unused columns were dropped.
data.shape

In [None]:
# Separate the numeric columns from the genre column.
# select_dtypes is the public API for this; the original relied on the private
# DataFrame._get_numeric_data(). 'bool' is included because that private helper
# also keeps boolean columns, so the selected set stays the same.
numeric = data.select_dtypes(include=['number', 'bool'])
genre = data['genre']
print(numeric.head())
print("Numeric columns: ",end=" ")
print(numeric.columns)
print(len(numeric.columns))


# Summary statistics (count, mean, std, quartiles, ...) for every numeric column.
numeric.describe()


In [None]:
# One histogram per numeric column, arranged on a 3 x 5 grid of subplots.
num_hist = numeric.hist(
    figsize=(20, 10),
    layout=(3, 5),
)
plt.show()

In [None]:
# Sorted array of the distinct genre labels present in the data.
np.unique(genre)

### Features of each genre

In [None]:
# Group the rows by genre once; the group object is reused by the cells below.
grouped_genre = data.groupby('genre')

# For every numeric column draw one figure with the per-genre histograms overlaid.
for col in numeric.columns:
    # Figure size is a property of the figure, so it is set once here instead of
    # being re-applied by every per-genre hist() call.
    fig, ax = plt.subplots(figsize=(10, 4))

    for genre_name, genre_rows in grouped_genre:
        # alpha < 1 keeps overlapping genres visible through each other.
        genre_rows[col].hist(alpha=0.4, ax=ax, label=genre_name)

    # Title and axis labels do not depend on the genre, so they are set once per figure.
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('count')
    ax.legend()
    plt.show()
    # Release the figure so a long column list does not accumulate open figures.
    plt.close(fig)

In [None]:
# Per-genre mean of every remaining column.
grouped_genre.mean()

In [None]:
# Per-genre standard deviation of every remaining column.
grouped_genre.std()

In [None]:
# Per-genre minimum of every remaining column.
grouped_genre.min()

In [None]:
# Per-genre maximum of every remaining column.
grouped_genre.max()

In [None]:
# Number of rows per genre, keyed by genre name in sorted order.
# value_counts tallies every genre in a single pass; the original re-filtered the
# whole frame once per genre. sort_index keeps the keys in the same sorted order
# that np.unique produced, and int() stores plain Python integers.
genre_count = {
    name: int(count)
    for name, count in genre.value_counts().sort_index().items()
}
genre_count

In [None]:
# Bar chart of the per-genre row counts: one bar per genre, height = number of rows.
fig = plt.figure(figsize=(20, 10))
genre_names = list(genre_count)
genre_sizes = list(genre_count.values())
plt.bar(x=genre_names, height=genre_sizes)
plt.show()

#### The dataset is slightly imbalanced across genres

### Correlation Analysis

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from imblearn.over_sampling import SMOTE

# Replace the genre names with integer codes so the column can take part in corr().
# The fitted encoder is kept (instead of being thrown away) so that codes and
# predictions can be mapped back to genre names with genre_encoder.inverse_transform
# or genre_encoder.classes_.
genre_encoder = LabelEncoder()
data['genre'] = genre_encoder.fit_transform(data['genre'])

# Pairwise correlation matrix of all columns, including the encoded genre.
# NOTE(review): LabelEncoder assigns codes by sorted label order, so the correlation
# of a feature with 'genre' reflects that arbitrary ordering -- treat it as a rough
# hint, not as a measure of how informative the feature is.
corr = data.corr()
print(corr)

In [None]:
# Correlation of every column with the integer-encoded genre column.
corr['genre']

### Select features

In [None]:
# Columns left out of the model inputs: four audio attributes plus the target itself.
excluded_columns = ['key', 'mode', 'time_signature', 'danceability', 'genre']

# Everything that remains is used as a model feature.
features = data.drop(columns=excluded_columns)
features

In [None]:

# Build the train/test sets without leaking information from the test rows.
#
# The split happens FIRST. Previously the scaler and SMOTE were fitted on the full
# dataset and the split came afterwards, which meant that
#   * the scaling statistics were computed from rows that later became test rows,
#   * SMOTE synthesised training samples by interpolating between future test rows,
#   * the test set itself contained synthetic samples,
# so the reported accuracy was optimistic and not measured on real data only.

# Same seed value the cross-validation cell uses; makes split and oversampling repeatable.
SEED = 1

labels = data['genre']

# Stratify so both splits keep the genre proportions of the (imbalanced) dataset.
xtrain, xtest, ytrain, ytest = train_test_split(
    features,
    labels,
    test_size=0.2,
    shuffle=True,
    stratify=labels,
    random_state=SEED,
)

# Learn mean/std from the training rows only, then apply the same transform to both splits.
scaler = StandardScaler().fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)

# Oversample the minority genres in the training split only; the test split stays
# untouched so it contains nothing but real samples.
oversample = SMOTE(random_state=SEED)
xtrain, ytrain = oversample.fit_resample(xtrain, ytrain)

In [None]:
# Inspect the target vector built in the preparation cell above.
labels

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier

# Seeded so the bootstrap samples drawn by the ensemble -- and therefore the scores --
# are the same on every run; an unseeded model reports different numbers each time.
model = BaggingClassifier(random_state=1)

# 10 stratified folds repeated 3 times -> 30 accuracy values in n_scores.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# NOTE(review): xtrain already contains SMOTE-generated samples, so a synthetic point
# and the real points it was interpolated from can land in different folds. Treat
# these scores as an optimistic estimate.
# error_score='raise' surfaces a failing fold immediately instead of recording NaN.
n_scores = cross_val_score(model, xtrain, ytrain, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

In [None]:
# Accuracy of each individual cross-validation fit (10 splits x 3 repeats).
n_scores

In [None]:
# Average accuracy across all cross-validation fits.
n_scores.mean()

In [None]:
# Train a fresh ensemble on the whole training split and predict the held-out split.
# The model is seeded so the predictions (and the accuracy computed from them) are
# reproducible from one run to the next.
model = BaggingClassifier(random_state=1)
model.fit(xtrain,ytrain)
pred = model.predict(xtest)
pred

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out samples whose predicted genre code equals the true one.
accuracy_score(ytest,pred)