In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from tqdm import tqdm 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error 
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.decomposition import TruncatedSVD
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    if filenames:
        # One print per directory; sep="\n" still yields exactly one path per line.
        print(*(os.path.join(dirname, filename) for filename in filenames), sep="\n")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training rows; the "score" column is split off as the target below.
train = pd.read_csv("train.csv")
train

In [None]:
# Separate the training frame into features (X) and the regression target (y).
X = train.drop(columns=["score"])
y = train["score"]
X

In [None]:
# Load the rows for which scores have to be predicted.
test = pd.read_csv("test.csv")
test

In [None]:
# Number of training rows; used later to split the stacked frame back into train/test.
train_idx = len(train)
# Stack the train features on top of the test rows so both receive identical
# feature engineering. DataFrame.append was removed in pandas 2.0; pd.concat is
# the supported equivalent and likewise keeps the original row labels.
X_tot = pd.concat([X, test])
X_tot

In [None]:
# Load the save-for-later table; it is inner-joined against the stacked features below.
save_for_later = pd.read_csv("save_for_later.csv")
save_for_later

In [None]:
# Rows of X_tot that also appear in save_for_later (inner join on the shared columns).
t1 = pd.merge(X_tot, save_for_later, how="inner")
# Flag every matched row; a scalar broadcasts to the whole column, so there is
# no need to build a Python list of ones.
t1["save_for_later"] = 1
t1

In [None]:
# Attach the save_for_later flag with ONE left join. The join keys are all of
# X_tot's columns (t1 carries every one of them). t1 is de-duplicated on those
# keys first so the join can never multiply rows: the train/test split further
# down is positional (first train_idx rows), so the row count and order of
# X_tot must be preserved exactly.
key_cols = list(X_tot.columns)
X_tot1 = X_tot.merge(
    t1.drop_duplicates(subset=key_cols),
    how="left",
    on=key_cols,
    sort=False,
)
# Rows without a match were not saved for later.
X_tot1["save_for_later"] = X_tot1["save_for_later"].fillna(0)
X_tot1

In [None]:
# Load per-song attributes; joined onto the feature frame via "song_id" below.
songs = pd.read_csv("songs.csv")
songs

In [None]:
# Add the song attributes with a single left join on song_id. A left join keeps
# every row of X_tot1 in its original order and never adds rows for songs that
# only exist in songs.csv, so the outer-join-then-self-join round trip is not
# needed. Keeping one row per song_id guarantees the row count is unchanged,
# which the positional train/test split relies on.
X_tot2 = X_tot1.merge(
    songs.drop_duplicates(subset="song_id"),
    how="left",
    on="song_id",
    sort=False,
)
X_tot2

In [None]:
# Load the label counts; the aggregation below reads its "platform_id" and "count" columns.
labels = pd.read_csv("song_labels.csv")
labels

In [None]:
# Per platform_id, collect the total of "count" plus the values found in the
# second and third columns of the row with the largest "count".
# NOTE(review): columns 1 and 2 are presumably the label id and its count — confirm
# against song_labels.csv.
platforms = labels["platform_id"].unique()
df = pd.DataFrame()
sum_count, max_label, max_count = [], [], []
for l in tqdm(platforms):
    temp_df = labels[labels["platform_id"] == l]
    sum_count.append(temp_df["count"].sum())
    # argmax returns the position (not the label) of the largest count.
    idx = temp_df["count"].argmax()
    # Index (row, column) positionally in one step. `iloc[idx][1]` relied on
    # integer keys being treated as positions by Series.__getitem__, which is
    # deprecated in pandas 2.x, and it coerced the whole row to a single dtype.
    max_label.append(temp_df.iloc[idx, 1])
    max_count.append(temp_df.iloc[idx, 2])

In [None]:
# Assemble the per-platform aggregates into a single lookup table.
df = pd.DataFrame(
    {
        "platform_id": platforms,
        "count_sum": sum_count,
        "max_label": max_label,
        "max_count": max_count,
    }
)
df

In [None]:
# Add the per-platform aggregates with a single left join on platform_id.
# df holds one row per platform_id (it was built from unique()), so the join
# keeps X_tot2's row count and order, which the positional train/test split
# needs; the outer-join-then-self-join round trip is unnecessary.
X_tot3 = X_tot2.merge(df, how="left", on="platform_id", sort=False)
X_tot3

In [None]:
# One-hot encode the "language" column; the remaining columns are passed through.
X_tot4 = pd.get_dummies(X_tot3, columns=["language"])
X_tot4

In [None]:
# Undo the earlier stacking positionally: the first train_idx rows are the
# training rows, everything after them belongs to test.
X_train = X_tot4.iloc[:train_idx]
X_test = X_tot4.iloc[train_idx:]

In [None]:
# Build the customer x song score matrix. Test rows have no score, so their
# cells are NaN. Duplicate (song, customer) pairs are dropped (first kept)
# because pivot requires unique index/column pairs.
song_matrix = pd.concat([train, test]).drop_duplicates(subset=['song_id', 'customer_id'], keep='first')
# DataFrame.pivot takes keyword-only arguments since pandas 2.0.
song_matrix = song_matrix.pivot(index='customer_id', columns='song_id', values='score')
song_matrix

In [None]:
# Average known score overall, per song (matrix column) and per customer (matrix row).
global_mean = y.mean()
song_means = song_matrix.mean(axis=0)
customer_means = song_matrix.mean(axis="columns")

In [None]:
# Turn the per-song means into a two-column table: the song index plus "song_means".
s_temp = song_means.to_frame(name="song_means").reset_index()
s_temp

In [None]:
# Re-stack train and test features so the mean features are added to both at once.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
X_total = pd.concat([X_train, X_test])
X_total

In [None]:
# Add each song's mean score with a single left join on song_id. s_temp has one
# row per song (it comes from the pivot's columns), so X_total's row count and
# order are preserved for the positional train/test split further down.
X_ = X_total.merge(s_temp, on="song_id", how="left", sort=False)
X_

In [None]:
# Turn the per-customer means into a two-column table: the customer index plus "customer_means".
c_temp = customer_means.to_frame(name="customer_means").reset_index()
c_temp

In [None]:
# Add each customer's mean score with a single left join on customer_id. c_temp
# has one row per customer (it comes from the pivot's index), so X_'s row count
# and order are preserved for the positional train/test split further down.
X_1 = X_.merge(c_temp, on="customer_id", how="left", sort=False)
X_1

In [None]:
# Treat release years before 1595 as missing. A single .loc assignment writes to
# X_1 itself; the chained form df[col][mask] = ... may write to a temporary copy
# (SettingWithCopyWarning) and is a silent no-op under pandas Copy-on-Write.
X_1.loc[X_1["released_year"] < 1595, "released_year"] = np.nan

In [None]:
# Ratio features: each count-like column divided by the release year.
X_1["per_comment"] = X_1["number_of_comments"].div(X_1["released_year"])
X_1["per_max_count"] = X_1["max_count"].div(X_1["released_year"])
X_1["per_count_sum"] = X_1["count_sum"].div(X_1["released_year"])
X_1

In [None]:
# Work on a copy, then remove the id columns and the raw counts that were
# already turned into ratio features.
X_2 = X_1.copy()
X_3 = X_2.drop(
    columns=[
        "song_id",
        "customer_id",
        "platform_id",
        "number_of_comments",
        "count_sum",
        "max_count",
    ]
)
X_3

In [None]:
# Positional split of the final feature frame: the first len(train) rows are train.
X_train_new, X_test_new = X_3.iloc[:len(train)], X_3.iloc[len(train):]

In [None]:
#ypred14 (df1)
# Gradient-boosted regressor with depth-3 trees, fitted on the full training set.
xgb_model_1 = xgb.XGBRegressor(
    colsample_bytree=0.4,
    gamma=0,
    learning_rate=0.07,
    max_depth=3,
    min_child_weight=1.5,
    n_estimators=10000,
    reg_alpha=0.75,
    reg_lambda=0.45,
    subsample=0.6,
    # random_state is the scikit-learn-style name for the former `seed` argument.
    random_state=42,
    # eval_metric is configured on the estimator: passing it to fit() was
    # deprecated in XGBoost 1.6 and removed in 2.0.
    eval_metric="rmse",
)
xgb_model_1.fit(X_train_new, y)

In [None]:
# Score the test rows with the depth-3 model and keep them as a one-column frame.
y_xgb_1 = xgb_model_1.predict(X_test_new)
df1 = pd.DataFrame({"score": y_xgb_1})
df1

In [None]:
#ypred17 df2
# Bagging: fit a small XGBoost model on 50 different random 80% subsets of the
# training data and store each model's test predictions in one column.
y_pred_table1 = np.zeros((test.shape[0], 50))
# Iterate over the table's columns so the loop cannot drift from its width.
for i in tqdm(range(y_pred_table1.shape[1])):
    # The split index doubles as the random seed; the held-out 20% (X_val, y_val) is not used.
    X_temp, X_val, y_temp, y_val = train_test_split(X_train_new, y, test_size=0.2, random_state=i)
    # eval_metric is configured on the estimator: passing it to fit() was
    # deprecated in XGBoost 1.6 and removed in 2.0.
    reg1 = xgb.XGBRegressor(n_estimators=50, eval_metric="rmse")
    reg1.fit(X_temp, y_temp)
    y_pred_table1[:, i] = reg1.predict(X_test_new)

In [None]:
# Average the bagged predictions per test row. mean(axis=1) divides by the
# actual number of columns, so it stays correct if the number of bags changes.
y_xgb_2 = y_pred_table1.mean(axis=1)
df2 = pd.DataFrame({"score": y_xgb_2})
df2

In [None]:
#ypred22
# Gradient-boosted regressor with depth-4 trees, fitted on the full training set.
xgb_model_3 = xgb.XGBRegressor(
    colsample_bytree=0.4,
    gamma=0,
    learning_rate=0.07,
    max_depth=4,
    min_child_weight=1.5,
    n_estimators=10000,
    reg_alpha=0.75,
    reg_lambda=0.45,
    subsample=0.6,
    # random_state is the scikit-learn-style name for the former `seed` argument.
    random_state=42,
    # eval_metric is configured on the estimator: passing it to fit() was
    # deprecated in XGBoost 1.6 and removed in 2.0.
    eval_metric="rmse",
)
xgb_model_3.fit(X_train_new, y)

In [None]:
# Score the test rows with the depth-4 model and keep them as a one-column frame.
y_xgb_3 = xgb_model_3.predict(X_test_new)
df3 = pd.DataFrame({"score": y_xgb_3})
df3

In [None]:
#ypred25
# Gradient-boosted regressor with depth-5 trees, fitted on the full training set.
xgb_model_4 = xgb.XGBRegressor(
    colsample_bytree=0.4,
    gamma=0,
    learning_rate=0.07,
    max_depth=5,
    min_child_weight=1.5,
    n_estimators=10000,
    reg_alpha=0.75,
    reg_lambda=0.45,
    subsample=0.6,
    # random_state is the scikit-learn-style name for the former `seed` argument.
    random_state=42,
    # eval_metric is configured on the estimator: passing it to fit() was
    # deprecated in XGBoost 1.6 and removed in 2.0.
    eval_metric="rmse",
)
xgb_model_4.fit(X_train_new, y)

In [None]:
# Score the test rows with the depth-5 model and keep them as a one-column frame.
y_xgb_4 = xgb_model_4.predict(X_test_new)
df4 = pd.DataFrame({"score": y_xgb_4})
df4

In [None]:
#ypred7
# Prepare the inputs for the SVD-based matrix completion.
song_means = song_matrix.mean()
customer_means = song_matrix.mean(axis=1)
# Centre every song column on its mean; unknown scores stay NaN here.
song_shifted_temp = song_matrix - song_means
# Starting point for the SVD: unknown cells set to 0, i.e. to the song mean.
song_shifted = song_shifted_temp.fillna(0)
# True where a score is known. notnull() states this directly instead of
# relying on unary minus acting as logical NOT on a boolean frame.
mask = song_shifted_temp.notnull()

In [None]:
def matrix_reconstruction_method(pcs, iters):
    """Fill the unknown cells of the customer x song matrix by iterated low-rank SVD.

    Starts from the module-level ``song_shifted`` (song-mean-centred scores with
    unknown cells set to 0). Each iteration projects the current matrix onto
    ``pcs`` components and reconstructs it, prints the mean squared error between
    the reconstruction and the known scores, writes the known scores (``mask``)
    back from ``song_shifted_temp``, and feeds the result into the next iteration.

    The module-level frames are only read, never rebound, so every call starts
    from the same matrix regardless of earlier calls or cell re-runs.

    Args:
        pcs: number of TruncatedSVD components.
        iters: number of reconstruct-and-restore iterations.

    Returns:
        DataFrame with the index and columns of ``song_shifted``, with
        ``song_means`` added back and values clipped to the range [1, 5].
    """
    # Local working copy of the reference: rebinding the global here would make
    # a later call continue from an earlier call's result.
    current = song_shifted
    for i in range(iters):
        SVD = TruncatedSVD(n_components=pcs, random_state=107)
        SVD.fit(current)
        song_represented = pd.DataFrame(
            SVD.inverse_transform(SVD.transform(current)),
            columns=current.columns,
            index=current.index,
        )
        # Cells outside the mask become NaN and are zeroed on both sides, so only
        # the known scores contribute to the error.
        loss = mean_squared_error(song_represented[mask].fillna(0), song_shifted_temp[mask].fillna(0))
        print('Iteration: {} , Loss: {} '.format(i, loss))
        # Restore the known scores; only the unknown cells keep the SVD estimate.
        song_represented[mask] = song_shifted_temp[mask]
        current = song_represented
    # Undo the centring and keep the values inside the [1, 5] range.
    song_mat = current + song_means
    song_mat = song_mat.clip(lower=1, upper=5)
    return song_mat
print("Starting truncated svd with number of components as 20")
representative_matrix_20 = matrix_reconstruction_method(20,10)
print("-----Done-----")
print("Starting truncated svd with number of components as 15")
representative_matrix_15 = matrix_reconstruction_method(15,10)
print("-----Done------")
# Blend the two reconstructions with equal weight.
score_matrix = (representative_matrix_15+representative_matrix_20)/2

In [None]:
# Look up the reconstructed score for every test row.
# NOTE(review): assumes the first two columns of test are customer_id and
# song_id, in that order (score_matrix is indexed by customer and has one
# column per song) — confirm against test.csv.
trunc_prediction = np.zeros(len(test))
test_pairs = zip(test.iloc[:, 0], test.iloc[:, 1])
for i, (customerid, songid) in enumerate(tqdm(test_pairs, total=len(test))):
    # .at is a direct label lookup; the previous boolean filter scanned the whole
    # customer index once per test row.
    trunc_prediction[i] = score_matrix.at[customerid, songid]

In [None]:
# Keep the matrix-completion predictions as a one-column frame, like the other models.
df5 = pd.DataFrame({"score": trunc_prediction})
df5

In [None]:
# Weighted blend of the five prediction sets (weights 1, 1, 3, 1, 2; total 8),
# limited to the range [1, 5].
x = (
    (
        df1["score"]
        + df2["score"]
        + 3 * df3["score"]
        + df4["score"]
        + 2 * df5['score']
    )
    / 8
).clip(lower=1, upper=5)
x

In [None]:
# Write the blended scores as the submission file; the row index is exported
# under the header "test_row_id".
df = pd.DataFrame({"score": x})
df.to_csv("ypred.csv", index_label="test_row_id")

In [None]:
# Display the frame that was written to ypred.csv.
df