## МГТУ им. Н. Э. Баумана
## Факультет: Информатика, искусственный интеллект и системы управления
## Кафедра: Системы обработки информации и управления
## Дисциплина: Методы машинного обучения
## Лабораторная работа №4 "Создание рекомендательной модели"
## Выполнил: Солохов И. Р. ИУ5-23М

Задание:

1. Выбрать произвольный набор данных (датасет), предназначенный для построения рекомендательных моделей.
2. Опираясь на материалы лекции, сформировать рекомендации для одного пользователя (объекта) двумя произвольными способами.
3. Сравнить полученные рекомендации (если это возможно, то с применением метрик).

In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import MissingIndicator
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.datasets import load_wine
from sklearn.datasets import load_linnerud
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from IPython.display import Image
import matplotlib.pyplot as plt
import scipy.stats as stats
%matplotlib inline 
sns.set(style="ticks")

In [5]:
data = pd.read_csv('laptops.csv')
data.head()

Unnamed: 0,brand,laptop_name,display_size,processor_type,graphics_card,disk_space,discount_price,old_price,ratings_5max
0,HP,Notebook 14-df0008nx,14.0,Intel Celeron N4000,Intel HD Graphics 600,64 GB (eMMC),1259.0,1259.0,0 / 5
1,Lenovo,IdeaPad 330S-14IKB,14.0,Intel Core i5-8250U,Intel UHD Graphics 620,1 TB HDD,1849.0,2099.0,3.3 / 5
2,Huawei,MateBook D Volta,14.0,Intel Core i5-8250U,NVIDIA GeForce MX150 (2 GB),256 GB SSD,2999.0,3799.0,0 / 5
3,Dell,Inspiron 15 3567,15.6,Intel Core i3-7020U,Intel HD Graphics 620,1 TB HDD,1849.0,1849.0,0 / 5
4,Asus,VivoBook 15 X510UR,15.6,Intel Core i7-8550U,NVIDIA GeForce 930MX (2 GB),1 TB HDD,2499.0,3149.0,0 / 5


In [6]:
data.shape

(205, 9)

In [10]:
brand = data[data['brand'].notnull()]['brand']

In [13]:
brand_isin = data[data['brand'].isin(brand)]

In [14]:
processor = brand_isin[brand_isin['processor_type'].notnull()]
processor = processor[~processor['processor_type'].str.isspace()]

In [16]:
gc = processor['graphics_card'].values
gc[0:5]

array([' Intel HD Graphics 600', ' Intel UHD Graphics 620',
       ' NVIDIA GeForce MX150 (2 GB)', ' Intel HD Graphics 620',
       ' NVIDIA GeForce 930MX (2 GB)'], dtype=object)

In [17]:
ds = processor['disk_space'].values
ds[0:5]

array([' 64 GB (eMMC)', ' 1 TB HDD', ' 256 GB SSD', ' 1 TB HDD',
       ' 1 TB HDD'], dtype=object)

In [18]:
br = processor['brand'].values
br[0:5]

array(['HP', 'Lenovo', 'Huawei', 'Dell', 'Asus'], dtype=object)

In [126]:
tfidfv = TfidfVectorizer()
ds_matrix = tfidfv.fit_transform(ds)
ds_matrix

<205x19 sparse matrix of type '<class 'numpy.float64'>'
	with 690 stored elements in Compressed Sparse Row format>

<h2>Фильтрация на основе disk_space

In [170]:
class SimpleKNNRecommender:
    
    def __init__(self, X_matrix, X_brand, X_gc, X_ds):
        """
        Входные параметры:
        X_matrix - обучающая выборка (матрица объект-признак)
        X_ids - массив идентификаторов объектов
        X_title - массив названий объектов
        X_overview - массив описаний объектов
        """
        #Сохраняем параметры в переменных объекта
        self._X_matrix = X_matrix
        self.df = pd.DataFrame(
            {'brand': pd.Series(X_brand, dtype='str'),
            'graphics_card': pd.Series(X_gc, dtype='str'),
            'disk_space': pd.Series(X_ds, dtype='str'),
            'dist': pd.Series([], dtype='float')})
            
            
    def recommend_for_single_object(self, K: int, \
                X_matrix_object, cos_flag = True, manh_flag = False):
        """
        Метод формирования рекомендаций для одного объекта.
        Входные параметры:
        K - количество рекомендуемых соседей 
        X_matrix_object - строка матрицы объект-признак, соответствующая объекту
        cos_flag - флаг вычисления косинусного расстояния
        manh_flag - флаг вычисления манхэттэнского расстояния
        Возвращаемое значение: K найденных соседей
        """
        
        scale = 1000000
        # Вычисляем косинусную близость
        if cos_flag:
            dist = cosine_similarity(self._X_matrix, X_matrix_object)
            self.df['dist'] = dist * scale
            res = self.df.sort_values(by='dist', ascending=False)
            # Не учитываем рекомендации с единичным расстоянием,
            # так как это искомый объект
            res = res[res['dist'] < scale]
        
        else:
            if manh_flag:
                dist = manhattan_distances(self._X_matrix, X_matrix_object)
            else:
                dist = euclidean_distances(self._X_matrix, X_matrix_object)
            self.df['dist'] = dist * scale
            res = self.df.sort_values(by='dist', ascending=True)
            # Не учитываем рекомендации с единичным расстоянием,
            # так как это искомый объект
            res = res[res['dist'] > 0.0]            
        
        # Оставляем К первых рекомендаций
        res = res.head(K)
        return res

In [171]:
# MSI = 135
# brand[MSI]

Acer = 20
brand[Acer]

'Dell'

In [172]:
Acer_matrix = ds_matrix[Acer]
Acer_matrix

<1x19 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [173]:
skr1 = SimpleKNNRecommender(ds_matrix, brand, gc, ds)

In [174]:
rec1 = skr1.recommend_for_single_object(7, Acer_matrix)
rec1

Unnamed: 0,brand,graphics_card,disk_space,dist
8,Huawei,NVIDIA GeForce MX150 (2 GB),256 GB NVMe M.2 SSD,849354.45137
19,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),816583.297471
104,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),816583.297471
103,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),816583.297471
23,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),816583.297471
22,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),816583.297471
102,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),816583.297471


In [175]:
rec2 = skr1.recommend_for_single_object(7, Acer_matrix, cos_flag = False)
rec2
# Получаем такой же результат

Unnamed: 0,brand,graphics_card,disk_space,dist
8,Huawei,NVIDIA GeForce MX150 (2 GB),256 GB NVMe M.2 SSD,548899.897303
104,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),605667.734867
23,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),605667.734867
22,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),605667.734867
19,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),605667.734867
102,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),605667.734867
103,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),605667.734867


In [176]:
rec3 = skr1.recommend_for_single_object(7, Acer_matrix, 
                                        cos_flag = False, manh_flag = True)
rec3

Unnamed: 0,brand,graphics_card,disk_space,dist
8,Huawei,NVIDIA GeForce MX150 (2 GB),256 GB NVMe M.2 SSD,815854.945563
104,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),930894.490536
23,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),930894.490536
22,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),930894.490536
19,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),930894.490536
102,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),930894.490536
103,Apple,Intel UHD Graphics 617,256 GB (PCIe SSD),930894.490536


<h2>Коллаборативная фильтрация

In [288]:
data = pd.read_csv('archive/ratings.csv')
data.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [181]:
len(data['display_size'].unique())

11

In [182]:
len(data['discount_price'].unique())

124

In [289]:
def create_utility_matrix(data):
    dsField = 'user_id' #item
    dpField = 'movie_id' #user
    opField = 'rating' #value  display_size
    
    dsList = data[dsField].tolist()
    dpList = data[dpField].tolist()
    opList = data[opField].tolist()    
    
    ds = list(set(dsList)) #users
    dp = list(set(dpList)) #items
    
    ds_index = {ds[i]: i for i in range(len(ds))}    
    pd_dict = {d_p: [0.0 for i in range(len(ds))] for d_p in dp}    
    
    for i in range(0,data.shape[0]):
        d_p = dpList[i] #item
        d_s = dsList[i] #user
        o_p = opList[i] #value
        pd_dict[d_p][ds_index[d_s]] = o_p   
    
    X = pd.DataFrame(pd_dict)
    X.index = ds
        
    itemcols = list(X.columns)
    dp_index = {itemcols[i]: i for i in range(len(itemcols))}
    
    return X, ds_index, dp_index

In [290]:
ds_dp_matrix, ds_index, dp_index = create_utility_matrix(data)

In [333]:
ds_dp_matrix

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [334]:
ds_dp_matrix__test = ds_dp_matrix.loc[[943]]
ds_dp_matrix__test

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
943,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [335]:
ds_dp_matrix__train = ds_dp_matrix.loc[:943]
ds_dp_matrix__train

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [336]:
U, S, VT = np.linalg.svd(ds_dp_matrix__train.T)
V = VT.T

In [337]:
U.shape

(1682, 1682)

In [338]:
V.shape

(943, 943)

In [339]:
S.shape

(943,)

In [340]:
Sigma = np.diag(S)
Sigma.shape

(943, 943)

In [341]:
Sigma

array([[640.63362257,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        , 244.83634567,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        , 217.84622472, ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [  0.        ,   0.        ,   0.        , ...,   0.69874606,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.67513184,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.64335756]])

In [342]:
r=3
Ur = U[:, :r]
Sr = Sigma[:r, :r]
Vr = V[:, :r]

In [343]:
test_user = np.mat(ds_dp_matrix__test.values)
test_user.shape, test_user

((1, 1682), matrix([[0., 5., 0., ..., 0., 0., 0.]]))

In [344]:
tmp = test_user * Ur * np.linalg.inv(Sr)
tmp

matrix([[-0.04224209,  0.01092715,  0.05854604]])

In [345]:
test_user_result = np.array([tmp[0,0], tmp[0,1], tmp[0,2]])
test_user_result

array([-0.04224209,  0.01092715,  0.05854604])

In [346]:
cos_sim = cosine_similarity(Vr, test_user_result.reshape(1, -1))
cos_sim[:10]

array([[ 0.63431556],
       [-0.37774684],
       [-0.33091006],
       [-0.1374043 ],
       [ 0.95208367],
       [-0.40027685],
       [ 0.36995494],
       [ 0.98914596],
       [-0.3913836 ],
       [-0.32506436]])

In [347]:
cos_sim_list = cos_sim.reshape(-1, cos_sim.shape[0])[0]
cos_sim_list[:10]

array([ 0.63431556, -0.37774684, -0.33091006, -0.1374043 ,  0.95208367,
       -0.40027685,  0.36995494,  0.98914596, -0.3913836 , -0.32506436])

In [374]:
recommended_user_id = np.argsort(-cos_sim_list)[0]
recommended_user_id

942

In [381]:
# Получение названия фильма
movieId_list = list(ds_dp_matrix.columns)
data_movie = pd.read_csv('archive/m_m.csv')
def film_name_by_movieid(ind):
    try:
        movieId = movieId_list[ind]
        flt_links = df_links[df_links['movie_id'] == movieId]
        tmdbId = int(flt_links['user_id'].values[0])
        md_links = data_movie[data_movie['id'] == tmdbId]
        res = md_links['title'].values[0]
        return res
    except:
        return '' 

In [382]:
i=1
for idx, item in enumerate(np.ndarray.flatten(np.array(test_user))):
    if item > 0:
        film_title = film_name_by_movieid(idx)
        print('{} - {} - {}'.format(idx, film_title, item))
        if i==10:
            break
        else:
            i+=1

1 -  - 5.0
8 -  - 3.0
10 -  - 4.0
11 -  - 5.0
21 -  - 4.0
22 -  - 4.0
23 -  - 4.0
26 -  - 4.0
27 -  - 4.0
30 -  - 4.0
