LR模型在推荐场景下的应用

In [25]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# CSV inputs read by load_data(); both paths are relative to the notebook's working directory.
# NOTE(review): the doubled '/' before movies.csv looks like a typo. POSIX path resolution
# treats repeated slashes as one, so it is left unchanged here.
movies_path = '../../data/ai-coder-book/ml-latest-small//movies.csv'
ratings_path = '../../data/ai-coder-book/ml-latest-small/ratings.csv'

In [26]:
def convert_2_one_hot(df):
    '''
    Append one-hot (0/1) genre columns to a DataFrame.

    Each value of ``df['genres']`` is split on '|' and every distinct piece
    becomes a new column holding 1 where the row lists that genre, else 0.

    Parameters:
        df: DataFrame with a string column named 'genres'.

    Returns:
        A new DataFrame made of the original columns followed by the genre
        columns in alphabetical order. ``df`` itself is not modified and the
        result keeps ``df``'s index.
    '''
    genres_per_row = [row.split('|') for row in df['genres'].values.tolist()]

    # Sort the distinct genres so the column order is the same on every run;
    # iterating a set of strings directly gives an order that can vary
    # between interpreter sessions.
    genres_list = sorted({name for names in genres_per_row for name in names})
    column_of = {name: pos for pos, name in enumerate(genres_list)}

    encoded_rows = []
    for names in genres_per_row:
        flags = [0] * len(genres_list)
        for name in names:
            flags[column_of[name]] = 1
        encoded_rows.append(flags)

    # Build the frame in one go (row-by-row .loc assignment is very slow) and
    # reuse df's index so the column-wise concat lines up row for row even
    # when df does not have a default 0..n-1 index.
    df_new = pd.DataFrame(encoded_rows, columns=genres_list, index=df.index)

    df_update = pd.concat([df, df_new], axis=1)
    return df_update


def convert_rating_2_labels(ratings, threshold=3.0):
    '''
    Turn numeric ratings into binary class labels.

    Parameters:
        ratings: pandas Series (or any object exposing ``.values.tolist()``)
            of numeric ratings.
        threshold: ratings greater than or equal to this value are labelled
            1, all others 0. Defaults to 3.0, the cut-off this function
            always used before the parameter existed.

    Returns:
        A list of ints (0 or 1), one per rating, in the input order.
    '''
    return [1 if rate >= threshold else 0 for rate in ratings.values.tolist()]


def training_lr(X, y):
    '''
    Fit an L2-regularised logistic regression and print train/test ROC AUC.

    The data is split 90% / 10% into train and test sets with a fixed seed,
    the model is fitted on the training part, and the AUC of the predicted
    positive-class probability is printed for both parts.

    Parameters:
        X: feature rows (as produced by load_data: a list of per-sample lists).
        y: binary labels (0/1), one per row of X.

    Returns:
        None. The scores are only printed.
    '''
    # The 'sag' solver shuffles the data while fitting, so the model needs its
    # own seed. Without it the fit, and therefore the printed AUCs, change
    # from run to run even though the train/test split is seeded.
    # NOTE(review): the run log recorded in this notebook shows a single
    # parallel task ("Done 1 out of 1"), so n_jobs=8 appears to give no
    # speed-up for this binary problem. Kept as-is; TODO confirm.
    model = LogisticRegression(penalty='l2', C=1, solver='sag', max_iter=500, verbose=1, n_jobs=8,
                               random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class (label 1).
    train_pred = model.predict_proba(X_train)
    train_auc = roc_auc_score(y_train, train_pred[:, 1])

    test_pred = model.predict_proba(X_test)
    test_auc = roc_auc_score(y_test, test_pred[:, 1])

    print('lr train auc score: ' + str(train_auc))
    print('lr test auc score: ' + str(test_auc))


In [27]:
def load_data():
    '''
    Load the movie and rating CSVs and build the training inputs.

    Reads the files at the module-level ``movies_path`` and ``ratings_path``,
    one-hot encodes the movie genres, joins the ratings to the encoded movies
    on "movieId", and separates the rating column (converted to binary
    labels) from the remaining genre indicator columns.

    Returns:
        (trainx, labels): ``trainx`` is a list of per-rating feature rows
        (plain Python lists), ``labels`` is the matching list of 0/1 labels.
    '''
    movies_frame = pd.read_csv(movies_path)
    ratings_frame = pd.read_csv(ratings_path)

    movies_encoded = convert_2_one_hot(movies_frame)
    merged = pd.merge(ratings_frame, movies_encoded, on="movieId")

    # Labels come from the rating column; everything that is not a genre
    # indicator is removed from the feature matrix.
    labels = convert_rating_2_labels(merged["rating"])
    non_feature_columns = ['userId', 'movieId', 'timestamp', 'title', 'genres', 'rating']
    feature_frame = merged.drop(columns=non_feature_columns)

    return feature_frame.values.tolist(), labels

In [28]:
# Build the feature rows and binary labels from the CSV files, then fit the
# logistic-regression model and print its train/test AUC.
trainx, labels = load_data()
training_lr(trainx, labels)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


Epoch 1, change: 1.00000000
Epoch 2, change: 1.19669647
Epoch 3, change: 0.99005859
Epoch 4, change: 0.57897982
Epoch 5, change: 0.34149755
Epoch 6, change: 0.25703338
Epoch 7, change: 0.13461587
Epoch 8, change: 0.20277361
Epoch 9, change: 0.10547674
Epoch 10, change: 0.07455049
Epoch 11, change: 0.04961816
Epoch 12, change: 0.04026811
Epoch 13, change: 0.01800596
Epoch 14, change: 0.00951108
Epoch 15, change: 0.00846731
Epoch 16, change: 0.00921798
Epoch 17, change: 0.00263897
Epoch 18, change: 0.00249077
Epoch 19, change: 0.00218788
Epoch 20, change: 0.00151322
Epoch 21, change: 0.00085741
Epoch 22, change: 0.00056500
Epoch 23, change: 0.00042417
Epoch 24, change: 0.00030208
Epoch 25, change: 0.00024478
convergence after 26 epochs took 1 seconds


[Parallel(n_jobs=8)]: Done   1 out of   1 | elapsed:    1.0s finished


lr train auc score: 0.6203025337645236
lr test auc score: 0.6082834353762431
