# Music Recommendation System

In [24]:
# Load Python libraries
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import ensemble, metrics
import xgboost as xgb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import warnings

### Importing Datasets

In [25]:
# Read train.csv from the musicdata input folder into the working frame `df`.
df = pd.read_csv(filepath_or_buffer='../input/musicdata/train.csv')

In [26]:
# Preview the first five rows of the freshly loaded frame.
df.head()

In [27]:
# (rows, columns) of the full frame, before it is subsampled in the next cell.
df.shape

In [28]:
# Keep a random 10% of the rows (presumably to keep the rest of the notebook
# fast enough to iterate on).
# random_state is pinned so the sample -- and every statistic, importance and
# score computed from it -- is identical after Restart & Run All.
df = df.sample(frac=0.1, random_state=42)

In [29]:
# Column dtypes and non-null counts of the sampled frame.
df.info()

In [30]:
# Read songs.csv; it is joined onto df by song_id further down.
songs = pd.read_csv(filepath_or_buffer='../input/musicdata/songs.csv')

In [31]:
# Column dtypes and non-null counts of the songs table.
songs.info()

In [32]:
# Attach the songs columns to each row of df. A left join keeps every row of
# df even when its song_id has no match in songs (those columns become NaN).
df = df.merge(songs, how='left', on='song_id')
# The songs table is not used again after the join, so drop the name.
del songs

In [33]:
# Re-inspect dtypes and non-null counts now that the songs columns are attached.
df.info()

In [34]:
# Read members.csv; it is joined onto df by msno in the next cell.
members = pd.read_csv(filepath_or_buffer='../input/musicdata/members.csv')

In [35]:
# Attach the members columns to each row of df. A left join keeps every row of
# df even when its msno has no match in members (those columns become NaN).
df = df.merge(members, how='left', on='msno')
# The members table is not used again after the join, so drop the name.
del members

In [36]:
# Re-inspect dtypes and non-null counts now that the members columns are attached.
df.info()

##### Replace NA

In [37]:
# Replace NA
# Text (object) columns get the placeholder 'unknown'; whatever is still
# missing afterwards (the numeric columns) is filled with 0.
# Each filled column is assigned back to df instead of using the chained
# df[col][mask] = ... form: chained assignment writes through a temporary,
# triggers SettingWithCopyWarning, and is a silent no-op under pandas
# copy-on-write, which would leave the NaNs in place.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna('unknown')
df = df.fillna(value=0)

In [38]:
# Confirm via the non-null counts that no missing values remain after the fill.
df.info()

##### Create Dates

In [39]:
# Create Dates
# Parse the two YYYYMMDD columns into datetimes and expose their year, month
# and day as separate integer feature columns named <column>_year/_month/_day.
#
# errors='coerce' replaces errors='ignore': with 'ignore', a single value that
# does not match the format (for example the 0 written by the NA fill) makes
# to_datetime hand the column back unparsed, and the .dt accessors below then
# raise AttributeError. 'ignore' is also deprecated in recent pandas releases.
for date_col in ('registration_init_time', 'expiration_date'):
    parsed = pd.to_datetime(df[date_col], format='%Y%m%d', errors='coerce')
    df[date_col] = parsed
    # Unparseable dates become NaT; store their parts as 0 so the derived
    # columns stay integer-valued and NaN-free for the models fitted later.
    df[date_col + '_year'] = parsed.dt.year.fillna(0).astype(int)
    df[date_col + '_month'] = parsed.dt.month.fillna(0).astype(int)
    df[date_col + '_day'] = parsed.dt.day.fillna(0).astype(int)

In [40]:
# Dates to category
# Recast both date columns as categoricals so the encoding cell below turns
# them into integer codes alongside the text columns.
for date_col in ('registration_init_time', 'expiration_date'):
    df[date_col] = df[date_col].astype('category')

In [41]:
# Object data to category
# Every text (object) column is recast as a pandas categorical in one pass.
df = df.astype({name: 'category'
                for name in df.select_dtypes(include=['object']).columns})

# Encoding categorical features
# Each categorical column -- including any that were already categorical
# before this cell -- is replaced by its integer category codes.
for name in df.select_dtypes(include=['category']).columns:
    df[name] = df[name].cat.codes

In [42]:
# Pairwise correlation matrix of the columns (pandas default: Pearson).
# NOTE(review): the text/date columns are arbitrary integer codes at this
# point, so their correlation values carry little meaning -- read with care.
df.corr()

In [43]:
# Draw the pairwise column correlations as a heatmap on a 7x5 inch figure.
corr_matrix = df.corr()
plt.figure(figsize=(7, 5))
sns.heatmap(corr_matrix)
plt.show()

### RandomForest

In [44]:
# Model with the best estimator
model = ensemble.RandomForestClassifier(n_estimators=250, max_depth=25)
model.fit(df[df.columns[df.columns != 'target']], df.target)

In [45]:
# Pair each feature column (everything except the label) with the importance
# the fitted model assigned to it, most important first.
feature_names = df.columns[df.columns != 'target']
df_plot = pd.DataFrame(
    {'features': feature_names, 'importances': model.feature_importances_}
).sort_values('importances', ascending=False)

In [46]:
# Horizontal bar chart of the feature importances, in df_plot's sorted order.
plt.figure(figsize=(11, 5))
sns.barplot(data=df_plot, x='importances', y='features')
plt.title('Importances of Features Plot')
plt.show()

In [47]:
# Raw importance array of the fitted model, in the order of the feature
# columns it was trained on (not sorted like df_plot).
model.feature_importances_

In [48]:
# Drop every feature whose importance is below 0.04.
# The columns are passed by keyword: the positional axis argument
# (df.drop(labels, 1)) was removed in pandas 2.0 and raises TypeError there.
low_importance = df_plot.loc[df_plot['importances'] < 0.04, 'features'].tolist()
df = df.drop(columns=low_importance)

In [49]:
# Columns that survived the importance filter (the label is still among them).
df.columns.tolist()

In [50]:
# Split the label off: pop removes the 'target' column from df in place and
# returns it, leaving df with feature columns only.
# Not idempotent -- re-running this cell raises KeyError because the column
# is already gone.
target = df.pop('target')

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
# Hold out 30% of the rows for evaluation and fit a gradient-boosted
# classifier on the remaining 70%.
# random_state is pinned on both the split and the model so the reported
# metrics are reproducible; unseeded, every run scores a different split.
train_data, test_data, train_labels, test_labels = train_test_split(
    df, target, test_size=0.3, random_state=42)
model = xgb.XGBClassifier(learning_rate=0.1, max_depth=15, min_child_weight=5,
                          n_estimators=250, random_state=42)
model.fit(train_data, train_labels)

In [53]:
# Score the held-out rows and print the per-class precision / recall / F1 table.
predict_labels = model.predict(test_data)
report = metrics.classification_report(test_labels, predict_labels)
print(report)