In [1]:
!pip install pandas scikit-learn joblib



# import library

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import numpy as np

# import dataset

In [59]:
dataset = pd.read_csv('mock_dataset_v3.csv')
dataset["matching_rate"] = dataset.pop("matching_rate")
dataset.head(10)

Unnamed: 0,user_valence,user_arousal,user_genre,movie_valence,movie_arousal,movie_genre,matching_rate
0,0.599343,0.247823,Comedy,0.302381,0.455763,Horror,3.828552
1,0.472347,0.683572,Comedy,0.617867,0.300426,Drama,3.518089
2,0.629538,0.924431,Drama,0.970506,0.15517,Action,1.737513
3,0.804606,0.706493,Drama,0.836355,0.317267,Sci-Fi,4.116567
4,0.453169,0.196126,Romance,0.898297,0.491718,Horror,2.817967
5,0.453173,0.403153,Sci-Fi,0.372196,0.948639,Action,3.220453
6,0.815843,0.753382,Romance,0.252599,0.370597,Action,2.458959
7,0.653487,0.358466,Action,0.468553,0.555947,Drama,3.984703
8,0.406105,0.588764,Action,0.513931,0.495894,Documentary,4.270549
9,0.608512,0.654927,Romance,0.773548,0.797098,Sci-Fi,4.242019


In [60]:
dataset.dropna()
dataset_no_zeros = dataset[~(dataset == 0).any(axis=1)]

## encode text column

In [61]:
from sklearn.preprocessing import OneHotEncoder
genre_cols = ["user_genre", "movie_genre"]
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(dataset[genre_cols])
encoded_cols = encoder.get_feature_names_out(genre_cols)

In [62]:
dataset_encoded = pd.concat(
    [dataset.drop(columns=genre_cols), pd.DataFrame(encoded, columns=encoded_cols)],
    axis=1
)

In [63]:
dataset_encoded.head(10)

Unnamed: 0,user_valence,user_arousal,movie_valence,movie_arousal,matching_rate,user_genre_Action,user_genre_Comedy,user_genre_Documentary,user_genre_Drama,user_genre_Horror,user_genre_Romance,user_genre_Sci-Fi,movie_genre_Action,movie_genre_Comedy,movie_genre_Documentary,movie_genre_Drama,movie_genre_Horror,movie_genre_Romance,movie_genre_Sci-Fi
0,0.599343,0.247823,0.302381,0.455763,3.828552,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.472347,0.683572,0.617867,0.300426,3.518089,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.629538,0.924431,0.970506,0.15517,1.737513,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.804606,0.706493,0.836355,0.317267,4.116567,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.453169,0.196126,0.898297,0.491718,2.817967,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.453173,0.403153,0.372196,0.948639,3.220453,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.815843,0.753382,0.252599,0.370597,2.458959,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.653487,0.358466,0.468553,0.555947,3.984703,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,0.406105,0.588764,0.513931,0.495894,4.270549,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,0.608512,0.654927,0.773548,0.797098,4.242019,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# split feature and target

In [64]:
x = dataset_encoded.drop(columns=["matching_rate"])
y = dataset["matching_rate"]

In [65]:
print(x)

     user_valence  user_arousal  movie_valence  movie_arousal  \
0        0.599343      0.247823       0.302381       0.455763   
1        0.472347      0.683572       0.617867       0.300426   
2        0.629538      0.924431       0.970506       0.155170   
3        0.804606      0.706493       0.836355       0.317267   
4        0.453169      0.196126       0.898297       0.491718   
..            ...           ...            ...            ...   
245      0.433100      0.607782       0.403075       0.591240   
246      0.405011      0.292551       0.571466       1.000000   
247      0.369334      0.461932       0.583614       0.485595   
248      0.853091      0.324876       0.664636       0.550275   
249      0.580996      0.223440       1.000000       0.762664   

     user_genre_Action  user_genre_Comedy  user_genre_Documentary  \
0                  0.0                1.0                     0.0   
1                  0.0                1.0                     0.0   
2           

In [66]:
print(y)

0      3.828552
1      3.518089
2      1.737513
3      4.116567
4      2.817967
         ...   
245    5.000000
246    2.380281
247    4.073250
248    3.829110
249    2.358329
Name: matching_rate, Length: 250, dtype: float64


In [67]:
regressor = RandomForestRegressor(n_estimators=150, random_state=0)
regressor.fit(x,y)

0,1,2
,n_estimators,150
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# test predict

In [70]:
sample = [[0.608512,0.654927,0.773548,0.797098,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]] 

In [73]:
prediction = regressor.predict(sample)
print('matching rate : ', prediction)

matching rate :  [4.19722162]




# save model

In [74]:
joblib.dump(regressor, "cinesense_model.pkl")

['cinesense_model.pkl']

In [75]:
joblib.dump(encoder, 'encoder.pkl')

['encoder.pkl']