In [1]:
import pandas as pd
import numpy as np

In [28]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


### Загрузка


In [3]:
data = pd.read_csv('u.data', sep= '\t', names = ['user_id', 'item_id', 'rating', 'timestamp'])
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
data.to_csv('u.data2', index=False)

In [5]:
data.groupby('user_id').mean()['rating'].reset_index()

Unnamed: 0,user_id,rating
0,1,3.610294
1,2,3.709677
2,3,2.796296
3,4,4.333333
4,5,2.874286
...,...,...
938,939,4.265306
939,940,3.457944
940,941,4.045455
941,942,4.265823


In [6]:
data.groupby('item_id').count()['rating'].sort_values(ascending=False).reset_index()

Unnamed: 0,item_id,rating
0,50,583
1,258,509
2,100,508
3,181,507
4,294,485
...,...,...
1677,1452,1
1678,1593,1
1679,1447,1
1680,814,1


In [7]:
users = pd.read_csv('u.user', sep= '|', names =['user_id', 'age', 'gender', 'occupation', 'zip_code'])
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [8]:
df = data.merge(users, on='user_id')

In [9]:
df.sample(5)

Unnamed: 0,user_id,item_id,rating,timestamp,age,gender,occupation,zip_code
11060,138,238,5,879024382,46,M,doctor,53211
28845,280,228,3,891701405,30,F,librarian,22903
61543,543,94,3,877550791,33,M,scientist,95123
31243,190,597,2,891626023,30,M,administrator,95938
80441,748,135,4,879454998,28,M,administrator,94720


In [10]:
df['rating_class'] = np.where(df.rating>=4, 1, 0)

In [11]:
df.sample(5)

Unnamed: 0,user_id,item_id,rating,timestamp,age,gender,occupation,zip_code,rating_class
7883,201,197,4,884113422,27,M,writer,E2A4H,1
813,253,659,5,891628358,26,F,librarian,22903,1
53599,474,343,3,887915082,51,M,executive,93711,0
56428,499,98,4,885599119,42,M,programmer,75006,1
9205,81,591,5,876534124,21,M,student,21218,1


In [12]:
df_class = df[['user_id', 'item_id', 'rating_class']]

### RandomForestRegressor

In [12]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
df_filtered = df[['user_id', 'item_id', 'rating']]

In [14]:
X = df_filtered.drop('rating', axis=1)
y = df_filtered['rating']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [16]:
rfr = RandomForestRegressor()

In [17]:
rfr.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [18]:
y_pred = rfr.predict(X_test)

### 6. Оценка

In [19]:
mean_absolute_error(y_test, y_pred)

0.907315

In [20]:
mean_squared_error(y_test, y_pred)

1.3171385000000002

In [21]:
r2_score(y_test, y_pred)

-0.045406301422758144

### RandomForestClassifier

In [13]:
X = df_class.drop('rating_class', axis=1)
y = df_class['rating_class']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
rfc = RandomForestClassifier()

In [17]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
y_pred = rfc.predict(X_test)

In [24]:
accuracy_score(y_test, y_pred)

0.6231

In [25]:
precision_score(y_test, y_pred)

0.6530166768532262

In [26]:
recall_score(y_test, y_pred)

0.6772616136919315

In [27]:
f1_score(y_test, y_pred)

0.6649182076813656

In [29]:
roc_auc_score(y_test, y_pred)

0.6167931379836236