In [1]:
#importing all required packages

import numpy as np 
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing


In [2]:
#Load data

df_train = pd.read_csv("train.csv")
df_songs = pd.read_csv("songs.csv")

In [3]:
df_train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1


In [4]:
df_songs.head()

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0


In [5]:
#Merge training data and songs data 
df_train = pd.merge(df_train, df_songs, how='inner', on=['song_id'])
df_train.shape[0]

7377304

In [6]:
#identifing the number of languages
a = df_train.language.unique()
print(a)
df_train.language.value_counts()

[ 52.  -1.  31.   3.  17.  10.  24.  59.  45.  38.  nan]


 3.0     4044643
 52.0    1864789
 31.0     656623
-1.0      308752
 17.0     245136
 10.0     171904
 24.0      78621
 59.0       4193
 45.0       2397
 38.0        210
Name: language, dtype: int64

In [7]:
#filter songs in language = 3
df_train = df_train.loc[df_train['language'] == -1.0]

In [8]:
#Remove data without artist name
df_train = df_train.loc[df_train['artist_name'] != 'Various Artists']
df_train.shape[0]

300932

In [9]:
#REMOVING NULL VALUES as they hinder the training
df_train = df_train.dropna(how='any')

In [10]:
df_train.shape[0]

28770

In [12]:
#create a new entry for songs with multiple genre_ids - good training 

(df_train.set_index(df_train.columns.drop('genre_ids',1).tolist()).genre_ids.str.split('|', expand=True).stack().reset_index().rename(columns={0:'genre_ids'}).loc[:, df_train.columns])
(df_train.set_index(df_train.columns.drop('composer',1).tolist()).composer.str.split('|', expand=True).stack().reset_index().rename(columns={0:'composer'}).loc[:, df_train.columns])
(df_train.set_index(df_train.columns.drop('lyricist',1).tolist()).lyricist.str.split('|', expand=True).stack().reset_index().rename(columns={0:'lyricist'}).loc[:, df_train.columns])
(df_train.set_index(df_train.columns.drop('artist_name',1).tolist()).artist_name.str.split('|', expand=True).stack().reset_index().rename(columns={0:'artist_name'}).loc[:, df_train.columns])

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language
0,zRfOEH4Bl09lWEbWtYVEp5CffQ5VBDcSNFLumK291As=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,my library,Local playlist more,local-playlist,1,192679,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
1,FYdHbSh9d8bEuHHkTO0fhVMybVwZCJz9hdGK6RFnmag=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,my library,Local playlist more,local-playlist,1,192679,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
2,z9E+9+h+/UBSUD+pB/w7gvhe5L6e7VG8o5c/fIkfbCM=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,my library,Local playlist more,local-playlist,1,192679,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
3,/NH6bPENe6jOUheEl65TNbsL/vCw5N6Er+wZKg6Bg7M=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,my library,Local playlist more,local-playlist,1,192679,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
4,7Ufso2/kfKOQuMCZl1UMyYL2Xu9uZwseNCPeYbqbuHY=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,my library,Album more,album,1,192679,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
5,JF+HD7AEy57Vc1NIrfe/on/mcnS2rBeRx6UVLua01Fw=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,my library,Local playlist more,local-library,1,192679,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
6,DwQLXL3Ts0a8+l9GRTBCqsBCbe95sln859meMhcrVIU=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,my library,Local playlist more,local-library,1,192679,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
7,nwDyh5Z5+UdqmoSoIt7EqLbob0PGHVxu5UVt/0mh01Q=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,my library,Local playlist more,local-library,1,192679,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
8,6Hd5xT+nOqkeiwc0i/qF6hhHGutBigH1Cx3x2L9hoTw=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,my library,Local playlist more,local-library,1,192679,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
9,9ijgLhtFIb0vwRnRCe91ZJmx4mZCLa9x9XN2GpLT0Co=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,my library,Local playlist more,local-library,1,192679,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0


In [13]:
df_train = df_train.drop('source_system_tab', 1)
df_train = df_train.drop('source_screen_name', 1)
df_train = df_train.drop('source_type', 1)
df_train = df_train.drop('song_length', 1)
df_train.head(5)

Unnamed: 0,msno,song_id,target,genre_ids,artist_name,composer,lyricist,language
1248882,zRfOEH4Bl09lWEbWtYVEp5CffQ5VBDcSNFLumK291As=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,1,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
1248883,FYdHbSh9d8bEuHHkTO0fhVMybVwZCJz9hdGK6RFnmag=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,1,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
1248884,z9E+9+h+/UBSUD+pB/w7gvhe5L6e7VG8o5c/fIkfbCM=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,1,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
1248885,/NH6bPENe6jOUheEl65TNbsL/vCw5N6Er+wZKg6Bg7M=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,1,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0
1248886,7Ufso2/kfKOQuMCZl1UMyYL2Xu9uZwseNCPeYbqbuHY=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,1,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0


In [14]:
from sklearn import preprocessing
# Create a label (category) encoder object
le = preprocessing.LabelEncoder()

# Fit the encoder to the pandas column
le.fit(df_train['msno'])

# Apply the fitted encoder to the pandas column
df_train['msno_num'] = le.transform(df_train['msno']) 

In [15]:
# Fit the encoder to the pandas column
le.fit(df_train['genre_ids'])

# Apply the fitted encoder to the pandas column
df_train['genre_ids_num'] = le.transform(df_train['genre_ids']) 

In [16]:
# Fit the encoder to the pandas column
le.fit(df_train['lyricist'])

# Apply the fitted encoder to the pandas column
df_train['lyricist_num'] = le.transform(df_train['lyricist']) 

In [17]:
# Fit the encoder to the pandas column
le.fit(df_train['composer'])

# Apply the fitted encoder to the pandas column
df_train['composer_num'] = le.transform(df_train['composer']) 

In [18]:
# Fit the encoder to the pandas column
le.fit(df_train['artist_name'])

# Apply the fitted encoder to the pandas column
df_train['artist_name_num'] = le.transform(df_train['artist_name']) 

In [19]:
df_train.head()

Unnamed: 0,msno,song_id,target,genre_ids,artist_name,composer,lyricist,language,msno_num,genre_ids_num,lyricist_num,composer_num,artist_name_num
1248882,zRfOEH4Bl09lWEbWtYVEp5CffQ5VBDcSNFLumK291As=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,1,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0,7406,56,1712,67,745
1248883,FYdHbSh9d8bEuHHkTO0fhVMybVwZCJz9hdGK6RFnmag=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,1,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0,2021,56,1712,67,745
1248884,z9E+9+h+/UBSUD+pB/w7gvhe5L6e7VG8o5c/fIkfbCM=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,1,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0,7372,56,1712,67,745
1248885,/NH6bPENe6jOUheEl65TNbsL/vCw5N6Er+wZKg6Bg7M=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,1,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0,147,56,1712,67,745
1248886,7Ufso2/kfKOQuMCZl1UMyYL2Xu9uZwseNCPeYbqbuHY=,ilNsh4b995pdJD3LVYCQB6JdWAM7AH/w4QnvbxWR2XE=,1,242|726,寶寶水晶音樂,Alan Irwin Menken,Stephen Laurence Schwartz,-1.0,1104,56,1712,67,745


In [None]:
#translation of languages to english

from googletrans import Translator
translator = Translator()

for i in range(0, len(df_train)):
    df_songs.composer.iloc[i] = translator.translate(df_songs.composer.iloc[i])
    print(i)

In [None]:
print(df_songs.composer[670:1000])

In [None]:
df_songs['composer'] = df_songs['composer'].astype('str') 


In [None]:
s = df_songs.composer[0]
start = s.find('text=') +5
end = s.find(',', start)
s[start:end]


In [None]:
for i in range(680, 1000):
    start = df_songs.composer[i].find('text=') +5
    end = df_songs.composer[i].find(',', start)
    df_songs.composer[i] = df_songs.composer[i][start:end]

In [20]:
x = df_train[['lyricist_num','composer_num','artist_name_num','genre_ids_num']].copy()
# x = df_train[['composer_num','artist_name_num','genre_ids_num']].copy()
x.head(8)

Unnamed: 0,lyricist_num,composer_num,artist_name_num,genre_ids_num
1248882,1712,67,745,56
1248883,1712,67,745,56
1248884,1712,67,745,56
1248885,1712,67,745,56
1248886,1712,67,745,56
1248887,1712,67,745,56
1248888,1712,67,745,56
1248889,1712,67,745,56


In [21]:
y = df_train[['target']].copy()
y.head()

Unnamed: 0,target
1248882,1
1248883,1
1248884,1
1248885,1
1248886,1


In [22]:
# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()

# create the RFE model and select 3 attributes
rfe = RFE(model, 3)
rfe = rfe.fit(x, y.values.ravel())

# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[False  True  True  True]
[2 1 1 1]


In [23]:
#split train test data

from sklearn.model_selection import train_test_split

trainx, testx = train_test_split(x, test_size=0.2)
trainy, testy = train_test_split(y, test_size=0.2)

In [24]:
# try1 SVM 
from sklearn import svm
clf = svm.LinearSVC()
clf.fit(trainx , trainy.values.ravel())
pred = clf.predict(testx)
print(pred)

[1 1 1 ..., 0 0 1]


In [25]:
#accuracy calculation SVM
from sklearn.metrics import accuracy_score
acc = accuracy_score(testy, pred)
print(acc)

0.496176572819


In [26]:
# Try2 logistic regression

from sklearn import linear_model
h = .02  # step size in the mesh
logreg = linear_model.LogisticRegression(C=1e5)
# we create an instance of Neighbours Classifier and fit the data.
logreg.fit(trainx, trainy.values.ravel())
pred = logreg.predict(testx)

In [30]:
#accuracy calculation
from sklearn.metrics import accuracy_score
acc = accuracy_score(testy, pred)
print(acc)

0.554570733403


In [28]:
#try3 GradientBoostingClassifier

from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
max_depth=1, random_state=0).fit(trainx, trainy.values.ravel())
    

In [29]:
pred = clf.predict(testx)
print(pred)

[0 0 0 ..., 0 0 0]


In [27]:
#accuracy calculation logistic Regression
from sklearn.metrics import accuracy_score
acc = accuracy_score(testy, pred)
print(acc)


0.555787278415
