# Task 3 Music Recommendation

### A music recommender system suggests songs to users based on their listening patterns.

## Author: Pratyush Srivastava

## LGM VIP September 2021


In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive; the Kaggle
# config directory and the dataset files used below live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')




In [None]:
import os
# Tell the Kaggle CLI where to look for its configuration
# (presumably the folder holding kaggle.json — confirm).
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

# Switch the working directory so the download below, and the relative file
# names used by later cells, resolve inside this folder.
%cd /content/gdrive/My Drive/Kaggle

# Fetch the files of the KKBox music recommendation competition.
!kaggle competitions download -c kkbox-music-recommendation-challenge

In [None]:
!pip install pyunpack
!pip install patool
from pyunpack import Archive
Archive('train.csv.7z').extractall("/content/gdrive/My Drive/Kaggle")
Archive('test.csv.7z').extractall("/content/gdrive/My Drive/Kaggle")

Archive('members.csv.7z').extractall("/content/gdrive/My Drive/Kaggle")

Archive('songs.csv.7z').extractall("/content/gdrive/My Drive/Kaggle")

Successfully connected Kaggle with Colab, then imported and unzipped the dataset.

Now we import all required libraries for our task

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import warnings
# Silence every Python warning for the rest of the session. Note that this also
# hides pandas deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')
# Draw all following figures with matplotlib's 'ggplot' style sheet.
plt.style.use('ggplot')

In [None]:
# Read the four extracted CSV files (relative to the current working directory)
# into DataFrames named after the files.
train, test, members, songs = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'members.csv', 'songs.csv')
)

In [None]:
# Preview the first rows of the train table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Preview the first rows of the members table.
members.head()

In [None]:
# Preview the first rows of the songs table.
songs.head()

Successfully imported all required datasets.

In [None]:
# Keep a 1% random sample of the train rows.
# The fixed random_state makes the draw repeatable, so every figure and metric
# computed from this sample is the same after Restart & Run All.
train = train.sample(frac=0.01, random_state=0)

In [None]:
# (rows, columns) of the sampled train frame.
train.shape

In [None]:
# Left-join the songs table onto train by song_id, so every train row is kept
# even when no matching song row exists.
train = train.merge(songs, how='left', on='song_id')

In [None]:
# (rows, columns) of train after the merge with songs.
train.shape

In [None]:
# Preview train with the song columns attached.
train.head()

In [None]:
# Left-join the members table onto train by msno, so every train row is kept.
train = pd.merge(train, members, on='msno', how='left')
# A notebook cell renders only its last expression, so a bare `train.shape` in
# the middle of the cell is evaluated and thrown away. Print it to make the
# shape visible next to the preview.
print(train.shape)
train.head()

In [None]:
# Summarise the merged frame: column dtypes and non-null counts.
train.info()

In [None]:
# Percentage of missing values in each column (the mean of the boolean null
# mask is the fraction of nulls).
train.isnull().mean() * 100

In [None]:
# Most columns are complete, but some contain nulls. Fill them: text (object)
# columns get the placeholder 'unknown', every remaining column gets 0.
#
# Each column is reassigned as a whole instead of using the chained form
# `train[col][mask] = value`. The chained form writes through an intermediate
# object that pandas may treat as a copy: it raises SettingWithCopyWarning and,
# with Copy-on-Write enabled, leaves `train` unchanged.
for col in train.select_dtypes(include=['object']).columns:
    train[col] = train[col].fillna('unknown')
train = train.fillna(value=0)

In [None]:
# Re-check the percentage of missing values per column after filling.
train.isnull().mean() * 100

There are no null values left in our dataset.

Now we change the date and time format.

In [None]:
def _add_date_parts(frame, column):
    """Parse a YYYYMMDD column and add ``<column>_year/_month/_day`` columns.

    The column is replaced in ``frame`` by its parsed datetime version.
    ``errors='coerce'`` is used instead of ``errors='ignore'``: the latter is
    deprecated in pandas and, on a parse failure, hands back the unparsed
    input, which makes the ``.dt`` accessor below raise. With ``coerce`` an
    unparseable value becomes NaT; its year/month/day are then stored as 0,
    the same placeholder the notebook uses for other missing numeric values.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that is modified in place.
    column : str
        Name of the column holding dates written as YYYYMMDD.
    """
    parsed = pd.to_datetime(frame[column], format='%Y%m%d', errors='coerce')
    frame[column] = parsed
    for part in ('year', 'month', 'day'):
        frame[f'{column}_{part}'] = getattr(parsed.dt, part).fillna(0).astype('int64')


# Same treatment, and same resulting column order, for both date columns.
for date_col in ('registration_init_time', 'expiration_date'):
    _add_date_parts(train, date_col)

In [None]:
# Preview train with the parsed dates and the new year/month/day columns.
train.head()

In [None]:

# Cast both date columns to the pandas categorical dtype.
for date_col in ('registration_init_time', 'expiration_date'):
    train[date_col] = train[date_col].astype('category')

In [None]:
# Preview train after the date columns were cast to category.
train.head()

In [None]:
# Step 1: give every text (object) column the categorical dtype.
text_cols = train.select_dtypes(include=['object']).columns
train = train.astype({name: 'category' for name in text_cols})

# Step 2: replace every categorical column with its integer category codes.
# This covers the columns converted in step 1 as well as any column that was
# already categorical before this cell.
categorical_cols = train.select_dtypes(include=['category']).columns
train = train.assign(**{name: train[name].cat.codes for name in categorical_cols})

In [None]:
# Preview train after the categorical columns were replaced by integer codes.
train.head()

Now drop the columns that are not required for training.

In [None]:

# Drop columns
# `columns=` is used rather than a positional axis (`drop(labels, 1)`), because
# pandas 2.0 and later accept only `labels` positionally and raise a TypeError
# for the old form.
train = train.drop(columns=['expiration_date', 'lyricist'])

In [None]:
# (rows, columns) of train after dropping the two columns.
train.shape

The data cleaning process is done; now we start training the model.

In [None]:
# Work on a copy so the cleaned `train` frame itself stays untouched.
train1=train.copy()

In [None]:
# Split the copy into a label vector (the 'target' column) and a feature matrix
# (every other column), both as NumPy arrays.
Y_train = train1['target'].to_numpy()
X_train = train1.drop(columns=['target']).to_numpy()

In [None]:
# Put 25% of the rows into the test split.
# NOTE(review): the random-forest cell that follows fits on the full
# X_train/Y_train, and x_train/x_test/y_train/y_test are reassigned by the
# later split before the XGBoost model, so this split is not consumed as written.
x_train,x_test,y_train,y_test=train_test_split(X_train,Y_train,test_size=0.25)

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Random forest with 250 trees, depth capped at 25 and a fixed seed. Its
# feature importances are read by the plotting cell below. It is fitted on the
# full X_train/Y_train arrays, not on the x_train/y_train split.
clf = RandomForestClassifier(n_estimators=250, max_depth=25, random_state=0)
clf.fit(X_train,Y_train)

In [None]:
# Pair each feature column (everything except 'target') with the importance the
# random forest assigned to it, ordered from most to least important.
feature_names = train.columns[train.columns != 'target']
train_plot = pd.DataFrame(
    {'features': feature_names, 'importances': clf.feature_importances_}
).sort_values('importances', ascending=False)

# Horizontal bar chart: one bar per feature, bar length is the importance.
fig, ax = plt.subplots(figsize=[11,5])
sns.barplot(x=train_plot.importances, y=train_plot.features, ax=ax)
ax.set_title('Importances of Features')
plt.show()

In [None]:
# Copy of the cleaned frame; low-importance columns are removed from it in the next cell.
train2=train.copy()

In [None]:
# Drop columns with importances < 0.04
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 and later reject with a TypeError.
low_importance = train_plot.loc[train_plot.importances < 0.04, 'features'].tolist()
train2 = train2.drop(columns=low_importance)

# Selected columns
train2.columns

In [None]:
# Train & Test split
# NOTE(review): this copies `train`, not the reduced `train2` built in the
# feature-selection cell, so the low-importance columns are still part of the
# features — confirm which frame was intended.
train3=train.copy()

X_train1 = train3.drop(columns=['target']).values
Y_train1 = train3['target'].values

# A fixed random_state yields the same 75/25 partition on every run, so the
# metrics reported below can be reproduced and compared between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_train1, Y_train1, test_size=0.25, random_state=0
)

In [None]:
# Create model
model2 = xgb.XGBClassifier(learning_rate=0.1, max_depth=15, min_child_weight=5, n_estimators=250)
# Fit on the training split only. X_train1/Y_train1 hold ALL rows, including
# the x_test rows the model is evaluated on afterwards; training on them leaks
# the test data into the model and inflates the reported scores.
model2.fit(x_train, y_train)

In [None]:
# Predict class labels for the rows of the x_test split.
Prediction = model2.predict(x_test)

In [None]:
from sklearn import metrics

# Text report comparing the predictions with the true y_test labels
# (per-class precision, recall and F1, plus overall accuracy).
print(metrics.classification_report(y_test, Prediction))

An accuracy of 97% was recorded, the highest score, using XGBoost.