In [1]:
import math
import pickle

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder

In [2]:
def make_train_df(rating,clothes,user):
    """Assemble the training features and target, fitting a new one-hot encoder.

    The three tables are inner-joined (rating with user on 'R_id', then with
    clothes on 'image'). A fixed list of columns is discarded and '선호여부'
    is split off as the target.

    Every column from 'r_gender' through '분위기' (label slice, so it follows
    the joined column order) is one-hot encoded with a freshly fitted
    ``OneHotEncoder``. The encoded columns are named ``col{i}_{category}``,
    where ``i`` is the position of the source column inside that slice.
    Columns from '멋있다' onward are appended unchanged.

    Args:
        rating: ratings table; must provide 'R_id', 'image' and '선호여부'.
        clothes: clothing table keyed by 'image'.
        user: user table keyed by 'R_id'.

    Returns:
        tuple: ``(features, target, encoder)`` — the feature DataFrame, the
        '선호여부' Series and the fitted encoder (needed to encode other
        splits the same way).
    """
    unused_columns = ['스타일선호','mar','job','income','r_style1','r_style2','r_style3','r_style4','r_style5']

    joined = pd.merge(
        pd.merge(rating, user, how='inner', on='R_id'),
        clothes,
        how='inner',
        on='image',
    )
    joined = joined.drop(columns=unused_columns)

    target = joined['선호여부']
    predictors = joined.drop(columns=['선호여부'])

    encoder = OneHotEncoder(sparse_output=False)
    onehot_values = encoder.fit_transform(predictors.loc[:, 'r_gender':'분위기'])

    # One output column per (source column position, category) pair.
    onehot_names = []
    for position, levels in enumerate(encoder.categories_):
        for level in levels:
            onehot_names.append(f"col{position}_{level}")
    onehot_frame = pd.DataFrame(onehot_values, columns=onehot_names)

    # The merged frame carries a fresh 0..n-1 index, so the column-wise concat
    # lines up row for row with the encoder output.
    features = pd.concat([onehot_frame, predictors.loc[:, '멋있다':]], axis=1)

    return features, target, encoder

In [3]:
def make_test_df(rating,clothes,user,encoder):
    """Assemble evaluation features and target using an already fitted encoder.

    Applies the same join, column removal and target split as the training
    builder, but calls ``encoder.transform`` instead of fitting, so the
    one-hot columns match the ones the encoder was fitted with.

    Args:
        rating: ratings table; must provide 'R_id', 'image' and '선호여부'.
        clothes: clothing table keyed by 'image'.
        user: user table keyed by 'R_id'.
        encoder: fitted encoder exposing ``transform`` and ``categories_``.

    Returns:
        tuple: ``(features, target)`` — the feature DataFrame and the
        '선호여부' Series.
    """
    unused_columns = ['스타일선호','mar','job','income','r_style1','r_style2','r_style3','r_style4','r_style5']

    user_ratings = pd.merge(rating, user, how='inner', on='R_id')
    joined = pd.merge(user_ratings, clothes, how='inner', on='image').drop(columns=unused_columns)

    target = joined['선호여부']
    predictors = joined.drop(columns=['선호여부'])

    # Columns 'r_gender'..'분위기' are the categorical ones the encoder handles.
    onehot_values = encoder.transform(predictors.loc[:, 'r_gender':'분위기'])
    onehot_names = []
    for position, levels in enumerate(encoder.categories_):
        onehot_names.extend(f"col{position}_{level}" for level in levels)
    onehot_frame = pd.DataFrame(onehot_values, columns=onehot_names)

    # Everything from '멋있다' onward is passed through untouched.
    passthrough = predictors.loc[:, '멋있다':]
    return pd.concat([onehot_frame, passthrough], axis=1), target

In [4]:
# Load the three TL_* tables that feed make_train_df. `user` and `clothes` are
# indexed by their join keys ('R_id' / 'image'); pd.merge's `on=` accepts index
# level names as well as column names, so the joins still resolve.
user = pd.read_csv('preprocessed/TL_woman_user_2019.csv', index_col='R_id')
clothes = pd.read_csv('preprocessed/TL_woman_clothes_2019.csv', index_col='image')
rating = pd.read_csv('preprocessed/TL_woman_rating_2019.csv')

# Keep the fitted encoder: later cells reuse it to encode other data.
train_x, train_y, encoder = make_train_df(rating=rating, clothes=clothes, user=user)

In [5]:
# Persist the encoded training features and their target as CSV files.
# index=False leaves the row index out of the files.
# NOTE(review): to_csv does not create missing directories — confirm ./train exists.
train_x.to_csv('./train/train_x_woman.csv',index=False)
train_y.to_csv('./train/train_y_woman.csv',index=False)

In [6]:
# Serialize the fitted one-hot encoder; recommend_internal loads it back from
# this path so inference uses the same category-to-column mapping.
with open('encoder/onehot_encoder_woman.pkl','wb') as f:
    pickle.dump(encoder,f)

In [7]:
# Load the VL_* tables. This rebinds `rating`, `clothes` and `user`, which
# previously held the TL_* tables, so cells below see the VL_* data.
user = pd.read_csv('preprocessed/VL_woman_user_2019.csv', index_col='R_id')
clothes = pd.read_csv('preprocessed/VL_woman_clothes_2019.csv', index_col='image')
rating = pd.read_csv('preprocessed/VL_woman_rating_2019.csv')

In [8]:
# Encode the currently loaded tables with the encoder fitted on the training
# data, so test_x gets the same one-hot columns as train_x.
# NOTE(review): `rating`/`clothes`/`user` are whatever the most recently run
# load cell assigned — presumably the VL_* tables; run the cells in order.
test_x,test_y = make_test_df(rating,clothes,user,encoder)

In [9]:
# Fit a random-forest regressor on the encoded training data.
# random_state=0 fixes the seed; n_jobs=-1 lets scikit-learn use all cores.
# `reg` is a notebook-level name that later cells (metrics, recommendation,
# model export) read directly.
reg = RandomForestRegressor(random_state=0,n_jobs=-1)
reg.fit(train_x,train_y)

In [10]:
# Label the forest's importance scores with the training column names and
# keep the 20 highest-scoring features for display.
ftr_importances_values = reg.feature_importances_
ftr_importances = pd.Series(data=ftr_importances_values, index=train_x.columns)
ftr_top = ftr_importances.sort_values(ascending=False).iloc[:20]

ftr_top

독특하다             0.117350
깔끔하다             0.040489
실용적이다            0.026553
col1_30대         0.023268
col1_20대         0.022865
트렌디하다            0.022454
활동적이다            0.022421
무난하다             0.021904
col1_40대         0.021363
편안하다             0.021287
세련되다             0.021219
도시적이다            0.021113
발랄하다             0.020726
col2_summer      0.019403
여성적이다            0.018959
멋있다              0.018670
부드럽다             0.018660
col5_normcore    0.018266
col7_봄/가을        0.017830
col4_triangle    0.017278
dtype: float64

In [11]:
# RMSE on the data the model was fitted on (an optimistic, in-sample figure).
# `math` and `mean_squared_error` are imported in the notebook's first cell.
train_predict = reg.predict(train_x)
# scikit-learn's signature is mean_squared_error(y_true, y_pred): ground truth
# goes first. The value is the same either way for MSE, but keeping the
# documented order avoids surprises if the metric is ever swapped.
print("RMSE:{}".format(math.sqrt(mean_squared_error(train_y, train_predict))) )

RMSE:0.3136099496099953


In [12]:
# RMSE on the held-out frame built by make_test_df.
# `math` and `mean_squared_error` are imported in the notebook's first cell,
# so this cell no longer depends on the previous cell having been run.
test_predict = reg.predict(test_x)
# Ground truth first, predictions second, matching mean_squared_error(y_true, y_pred).
print("RMSE':{}".format(math.sqrt(mean_squared_error(test_y, test_predict))) )

RMSE':0.7729973117505007


In [13]:
def recommend_internal(clothes_path,encoder_path,gender,age,color,face,body,model=None):
    """Rank every garment in a catalogue by its predicted rating for one user.

    The user's five attributes are repeated on every catalogue row, the
    categorical columns ('r_gender' through '분위기') are encoded with the
    pickled encoder, the columns from '멋있다' onward are cast to int8, and
    the resulting frame is scored by the model.

    Args:
        clothes_path: CSV readable by ``pd.read_csv``. It must contain an
            'image' column plus the clothing columns spanning '분위기' and
            '멋있다' onward.
        encoder_path: path to a pickled, fitted encoder exposing
            ``transform`` and ``categories_``.
        gender, age, color, face, body: the user's 'r_gender', 'age',
            'personal_color', 'faceshape' and 'bodyshape' values.
        model: optional fitted estimator exposing ``predict``. When omitted,
            the notebook-level ``reg`` is used, as before.

    Returns:
        DataFrame with columns 'image' and 'rating', sorted by 'rating' in
        descending order. An empty catalogue yields an empty frame with the
        same two columns.

    Raises:
        Whatever the encoder or the int8 cast raises for values they cannot
        handle. Missing clothing values are passed through as-is rather than
        being filled in from the previous garment.
    """
    df_clothes = pd.read_csv(clothes_path)
    if df_clothes.empty:
        # Nothing to score; avoid handing a zero-row frame to the encoder/model.
        return pd.DataFrame({'image': pd.Series(dtype=object), 'rating': pd.Series(dtype=float)})

    # Repeat the user's attributes on every catalogue row. Building the frame
    # on the catalogue's index broadcasts the scalars directly, instead of
    # concatenating a one-row frame and forward-filling the whole table, which
    # would also overwrite genuinely missing clothing values with the previous
    # garment's.
    df_user = pd.DataFrame(
        {'r_gender': gender, 'age': age, 'personal_color': color, 'faceshape': face, 'bodyshape': body},
        index=df_clothes.index,
    )
    df = pd.concat([df_user, df_clothes.drop(columns=['image'])], axis=1)

    # NOTE(security): pickle.load can execute arbitrary code; only point
    # encoder_path at files this project produced itself.
    with open(encoder_path,'rb') as f:
        encoder = pickle.load(f)

    df_encoded = encoder.transform(df.loc[:,'r_gender':'분위기'])
    onehot_columns = [f"col{i}_{elem}" for i,sublist in enumerate(encoder.categories_) for elem in sublist]
    # Share df's index so the column-wise concat below lines up row for row.
    df_encoded = pd.DataFrame(df_encoded, columns=onehot_columns, index=df.index)
    df_test = pd.concat([df_encoded,df.loc[:,'멋있다':].astype(np.int8)],axis=1)

    # Fall back to the notebook-level model only when none was supplied.
    estimator = reg if model is None else model
    predict = estimator.predict(df_test)

    ranking = pd.DataFrame({'image': df_clothes['image'], 'rating': predict})
    return ranking.sort_values(by=['rating'], axis=0, ascending=False)

In [14]:
# Example: the ten highest-scoring garments from the TL_* clothing catalogue for
# a user described as 여성 / 20대 / spring / round face / round body.
recommend_internal('preprocessed/TL_woman_clothes_2019.csv','encoder/onehot_encoder_woman.pkl','여성','20대','spring','round','round').head(n=10)

Unnamed: 0,image,rating
4827,W_07708_19_normcore_W.jpg,3.71
4615,W_05429_19_normcore_W.jpg,3.71
1028,T_06247_19_normcore_W.jpg,3.706667
1030,T_06252_19_normcore_W.jpg,3.675
11928,W_69663_19_normcore_W.jpg,3.656667
3826,T_14752_19_normcore_W.jpg,3.626667
3944,T_15674_19_normcore_W.jpg,3.625
657,T_05226_19_normcore_W.jpg,3.61
7606,W_37378_19_normcore_W.jpg,3.6075
4531,W_04139_19_normcore_W.jpg,3.606667


In [15]:
# Same catalogue and encoder, but for a winter / oval-face profile, to show how
# the ranking changes with the user's attributes.
recommend_internal('preprocessed/TL_woman_clothes_2019.csv','encoder/onehot_encoder_woman.pkl','여성','20대','winter','oval','round').head(n=10)

Unnamed: 0,image,rating
4010,T_16402_19_normcore_W.jpg,3.695
591,T_05113_19_normcore_W.jpg,3.686667
2222,T_10262_19_normcore_W.jpg,3.65
5068,W_09078_19_normcore_W.jpg,3.6
5908,W_20074_19_normcore_W.jpg,3.56
3234,T_13333_19_normcore_W.jpg,3.55
2500,T_10726_19_normcore_W.jpg,3.54
13019,W_92359_19_normcore_W.jpg,3.54
3680,T_14527_19_normcore_W.jpg,3.54
5433,W_13147_19_normcore_W.jpg,3.53


In [16]:
# Persist the fitted forest. `joblib` is imported in the notebook's first cell
# with the other dependencies. joblib.dump returns the list of files it wrote,
# which is what this cell displays.
# NOTE(review): confirm ./model exists before running this cell.
joblib.dump(reg, './model/random_woman.pkl')

['./model/random_woman.pkl']