In [20]:
# librerias:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
import numpy as np
from tqdm.notebook import tqdm
import gc

import warnings
warnings.filterwarnings("ignore")

from lightgbm import LGBMClassifier,LGBMRegressor
from xgboost import XGBClassifier,XGBRegressor
from catboost import CatBoostClassifier,CatBoostRegressor

import os
import sweetviz as sv

from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold, train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_log_error,mean_squared_error,f1_score
from sklearn import preprocessing

import matplotlib.pyplot as plt
import seaborn as sns

import datetime

In [3]:
# Move to the project root so the relative "Data/..." and "Envios/..." paths
# used by the other cells resolve. The location can be overridden with the
# QUALIFIER8_DIR environment variable; the original local path is kept as the
# default so an existing setup keeps working unchanged.
os.chdir(os.environ.get("QUALIFIER8_DIR", "D:/Script/KaggleDaysQualifiers/Qualifier8/"))

In [4]:
# Load the training ratings (columns: id, user_id, book_id, rating) and
# preview the first rows.
train_ratings = pd.read_csv("Data/train_ratings.csv")
train_ratings.head()

Unnamed: 0,id,user_id,book_id,rating
0,5c52cc76216e68f8,9db527ea34,69173ee3b6,5
1,9ecc1d4a2e9a7476,3db2595a13,58d33fe06a,8
2,953317c1edbe1e56,375781e597,512b5d69de,10
3,617ef107e6ffed5a,f00ee6360d,98c1419160,9
4,0e2be280941c4d1f,8b893fb104,99c56ce036,9


In [5]:
# Load the test ratings (same keys as train, without the rating column) and
# preview the first rows.
test_ratings = pd.read_csv("Data/test_ratings.csv")
test_ratings.head()

Unnamed: 0,id,user_id,book_id
0,f38f854b31509133,8073cd1ef5,5d228c95dc
1,aadb34054d6c8a12,41d6240ef5,6bbcd07e36
2,87b031bdd906e26d,fcb9105c5c,4a88d029fa
3,371562c1c36bb8d5,942e3a5ae8,4127c00651
4,3ebb7df64a2896be,da3c9dfa9c,34de200846


In [6]:
# Load the book metadata (book_id, title, author, year, publisher) and
# preview the first rows.
books = pd.read_csv("Data/books.csv")
books.head()

Unnamed: 0,book_id,title,author,year,publisher
0,8a2f2c390c,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,dce235e3f8,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,624ad92bef,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,9418f71f8f,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,a9908c598a,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [7]:
# Load the user metadata (user_id, age, city, province, country) and preview
# the first rows.
users = pd.read_csv("Data/users.csv")
users.head()

Unnamed: 0,user_id,age,city,province,country
0,c4ca4238a0,,nyc,new york,usa
1,c81e728d9d,18.0,stockton,california,usa
2,eccbc87e4b,,moscow,yukon territory,russia
3,a87ff679a2,17.0,porto,v.n.gaia,portugal
4,e4da3b7fbb,,farnborough,hants,united kingdom


### Realizando los joins:

In [8]:
# Left-join the user attributes onto every rating row (train and test) by
# user_id, so each rating row is kept whether or not its user is found.
train_ratings = train_ratings.merge(users, how="left", on="user_id")
test_ratings = test_ratings.merge(users, how="left", on="user_id")

In [9]:
def preprocess_book(df, shifted_markers=('DK Publishing Inc', 'Gallimard')):
    """Repair rows whose 'author' and 'year' values are swapped, then make 'year' numeric.

    A row is treated as swapped when its 'year' value is one of
    ``shifted_markers``. For those rows the values of 'author' and 'year' are
    exchanged. Afterwards the whole 'year' column is cast to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'author' and 'year'. It is modified in
        place (and also returned).
    shifted_markers : iterable of str, optional
        Values of 'year' that identify a swapped row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    ValueError
        If a 'year' value cannot be converted to float after the repair.
    """
    # Build the mask ONCE, before touching any column. Recomputing it after
    # 'year' has been overwritten selects no rows, which previously left the
    # 'author' column unrepaired.
    swapped = df['year'].isin(list(shifted_markers))
    if swapped.any():
        # Snapshot both sides first so the second assignment does not read
        # values already overwritten by the first.
        year_vals = df.loc[swapped, 'author'].to_numpy()
        author_vals = df.loc[swapped, 'year'].to_numpy()
        df.loc[swapped, 'year'] = year_vals
        df.loc[swapped, 'author'] = author_vals
    df['year'] = df['year'].astype('float')
    return df

In [10]:
# Run preprocess_book on the books table (handles the rows whose 'year' holds
# a publisher name and casts 'year' to float).
books = preprocess_book(books)

In [11]:
# Left-join the book attributes onto every rating row (train and test) by
# book_id, so each rating row is kept whether or not its book is found.
train_ratings = train_ratings.merge(books, how="left", on="book_id")
test_ratings = test_ratings.merge(books, how="left", on="book_id")

### PreProcessing:

In [12]:
def label_encoder(train, test, vars_cat):
    """Integer-encode the object-typed columns listed in ``vars_cat``.

    For each column in ``vars_cat`` whose dtype is ``object`` in ``train`` or
    in ``test``, one ``LabelEncoder`` is fitted on the train and test values
    together and then applied to both frames, so a given value gets the same
    code in each. Columns that are not ``object`` in either frame are left
    untouched.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain every column in ``vars_cat``. They are
        modified in place (and also returned).
    vars_cat : list of str
        Candidate column names to encode.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``train`` and ``test`` objects, in that order.
    """
    for col in vars_cat:
        if train[col].dtype == 'object' or test[col].dtype == 'object':
            train_vals = list(train[col].values)
            test_vals = list(test[col].values)
            # Fit on the union of both frames so that values appearing only
            # in test are known to the encoder when it transforms them.
            encoder = preprocessing.LabelEncoder()
            encoder.fit(train_vals + test_vals)
            train[col] = encoder.transform(train_vals)
            test[col] = encoder.transform(test_vals)
    return train, test

In [13]:
# Text columns coming from the users table (city, province, country) and the
# books table (title, author, publisher) that are treated as categorical.
vars_cat = ["city","province","country","title","author","publisher"]

In [14]:
# Replace the categorical text columns with integer codes, using one encoder
# per column fitted on train and test together.
train_ratings, test_ratings = label_encoder(train_ratings,test_ratings,vars_cat)

### Feature Engineering:

### LightGBM:

In [17]:
# Working names for the modelling cells. These are plain references to the
# same DataFrames, not copies.
data = train_ratings
test = test_ratings

In [18]:
# Model input features: every column except the identifiers and the target.
vars_in = [col for col in data.columns if col not in ('id', 'user_id', 'book_id', 'rating')]

In [23]:
# Split the labelled data into train (75%) and validation (25%), shuffled
# with a fixed seed and stratified by rating.
train_x, valid_x, train_y, valid_y = train_test_split(data[vars_in],
                                                      data["rating"],
                                                      test_size=0.25,
                                                      shuffle=True,
                                                      random_state=456,
                                                      stratify=data["rating"])

In [24]:
# LightGBM multiclass classifier: each distinct rating value is one class.
# num_class is deliberately not passed. The scikit-learn wrapper derives it
# from the labels seen in fit(), and the previously hard-coded value of 3 did
# not match the rating values present in the data.
# NOTE(review): subsample=.8 only takes effect when subsample_freq > 0 (the
# LightGBM default is 0) -- confirm whether row bagging was intended.
param = LGBMClassifier(
        n_estimators=10000,      # upper bound; early stopping picks the real count
        learning_rate=0.03,
        boosting_type= 'gbdt',
        objective='multiclass',
        colsample_bytree=.8,
        subsample=.8,
        max_depth=5,
        )

# Training: evaluate on both the training and the validation split, log every
# 25 iterations, and stop once 100 rounds pass without improvement.
# NOTE(review): the `verbose` and `early_stopping_rounds` keywords of fit()
# were removed in LightGBM 4.0 in favour of callbacks -- confirm the installed
# version before upgrading.
lgbm_model = param.fit(train_x, train_y,
                eval_set= [(train_x, train_y), (valid_x, valid_y)],
                categorical_feature = vars_cat,
                eval_metric=['multi_error'],
                verbose=25,
                early_stopping_rounds=100,
               )

[25]	training's multi_error: 0.696166	training's multi_logloss: 1.79866	valid_1's multi_error: 0.704412	valid_1's multi_logloss: 1.82486
[50]	training's multi_error: 0.681898	training's multi_logloss: 1.75814	valid_1's multi_error: 0.698959	valid_1's multi_logloss: 1.80449
[75]	training's multi_error: 0.670416	training's multi_logloss: 1.72916	valid_1's multi_error: 0.696313	valid_1's multi_logloss: 1.7935
[100]	training's multi_error: 0.660377	training's multi_logloss: 1.70742	valid_1's multi_error: 0.693427	valid_1's multi_logloss: 1.78773
[125]	training's multi_error: 0.652941	training's multi_logloss: 1.68923	valid_1's multi_error: 0.692079	valid_1's multi_logloss: 1.784
[150]	training's multi_error: 0.646494	training's multi_logloss: 1.6737	valid_1's multi_error: 0.692079	valid_1's multi_logloss: 1.78173
[175]	training's multi_error: 0.640512	training's multi_logloss: 1.65954	valid_1's multi_error: 0.691326	valid_1's multi_logloss: 1.77977
[200]	training's multi_error: 0.63491	tra

In [25]:
# Predict on the validation split. Despite its name, `predict_train` holds
# the predictions for valid_x, not for the training rows.
predict_train = lgbm_model.predict(valid_x)

In [27]:
# Micro-averaged F1 of the validation predictions, scaled to a percentage.
f1_score(valid_y,predict_train,average="micro")*100

30.99892554163927

In [28]:
# Predict the rating class for every test row, using the same feature
# columns the model was trained on.
predict_test = lgbm_model.predict(test[vars_in])

In [29]:
# Build the submission table: the test row id next to its predicted rating.
submission_test = pd.DataFrame({'id':test["id"],
                                 'rating':predict_test})
submission_test.head()

Unnamed: 0,id,rating
0,f38f854b31509133,8
1,aadb34054d6c8a12,8
2,87b031bdd906e26d,8
3,371562c1c36bb8d5,8
4,3ebb7df64a2896be,8


In [30]:
# Write the submission without the index column. The number in the file name
# is hard-coded; it matches the validation score printed earlier in this run.
# NOTE(review): assumes the "Envios" directory already exists -- confirm.
submission_test.to_csv("Envios/test_30.9989.csv",index=False)