In [1]:
from pandas import read_csv, Series, DataFrame, concat
import numpy as np
import pathlib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, accuracy_score, jaccard_score

In [None]:
# Evaluation and Analysis

In [3]:
# Test data, loaded once at module level; evaluate() reads its "user_id" and
# "game" columns to find the games each user has in the test set.
dataTest = read_csv('Content_Training_Testing/content_user_test.csv')
# Number of recommendations per user that evaluate() checks (read by position
# from each recommendation row, starting at position 1).
no_recom = 20

def evaluate(name_algo, location_algo_output_file, eval_output_file,
             test_data=None, n_recommendations=None):
    """Score one recommender's output against the test data and save per-user results.

    For every row of the recommendation file, count how many of the recommended
    games appear among that user's games in the test data and compute
    ``ratio = hits / number of test games`` (0.0 when the user has no test
    games). Summary statistics of the three metrics are printed and the
    per-user values are written to ``eval_output_file``.

    Parameters
    ----------
    name_algo : str
        Label printed above the summary statistics.
    location_algo_output_file : str or path-like
        CSV of recommendations. Must contain a ``user_id`` column; the
        recommendations are read *by position* from the columns at positions
        1..n_recommendations, so the first column is expected to hold the
        user id. If fewer recommendation columns exist, only those present
        are scored.
    eval_output_file : str or path-like
        Destination CSV. Its parent directory is created when missing.
    test_data : DataFrame, optional
        Frame with ``user_id`` and ``game`` columns. Defaults to the
        module-level ``dataTest``.
    n_recommendations : int, optional
        How many recommendation columns to score. Defaults to the
        module-level ``no_recom``.

    Returns
    -------
    None
    """
    if test_data is None:
        test_data = dataTest
    if n_recommendations is None:
        n_recommendations = no_recom

    output_data = read_csv(location_algo_output_file)

    # Group the test data once instead of re-filtering the whole frame for
    # every user (the per-row filter made the evaluation quadratic).
    games_by_user = test_data.groupby("user_id")["game"].agg(list).to_dict()

    # Explicit positional access: integer keys on a string-labelled Series
    # (the old ``row[j]``) rely on a fallback that pandas has deprecated.
    recommendation_rows = output_data.iloc[:, 1:n_recommendations + 1].values.tolist()
    user_ids = output_data["user_id"].tolist()

    games_in_test = []
    hits_per_user = []
    ratios = []
    for user_id, recommended in zip(user_ids, recommendation_rows):
        user_games = games_by_user.get(user_id, [])
        owned = set(user_games)  # constant-time membership checks
        # Each recommendation slot counts separately, duplicates included.
        hits = sum(1 for game in recommended if game in owned)
        # The list length keeps duplicate test rows in the denominator.
        games_in_test.append(len(user_games))
        hits_per_user.append(hits)
        ratios.append(hits / len(user_games) if user_games else 0.0)

    # Build whole columns at once so "ratio" is float from the start rather
    # than an int column that gets upcast by element-wise float writes.
    output_data["numberGamesUserHasInTest"] = games_in_test
    output_data["numberRecommendationUserHas"] = hits_per_user
    output_data["ratio"] = ratios

    print(name_algo)
    print(output_data["ratio"].describe())
    print(output_data["numberRecommendationUserHas"].describe())
    print(output_data["numberGamesUserHasInTest"].describe())

    # Make sure the destination folder exists before writing; skip this for
    # non-path targets (e.g. file-like buffers accepted by to_csv).
    if isinstance(eval_output_file, (str, pathlib.PurePath)):
        pathlib.Path(eval_output_file).parent.mkdir(parents=True, exist_ok=True)
    output_data.to_csv(eval_output_file,
                       columns=["user_id", "ratio", "numberRecommendationUserHas", "numberGamesUserHasInTest"],
                       index=False)

In [5]:
# Evaluate the recommendations stored in Test_Files/genre.csv.
evaluate(
    name_algo="Content based with genre",
    location_algo_output_file=pathlib.Path('Test_Files/genre.csv'),
    eval_output_file=pathlib.Path('Content_Output/genre_ratio.csv'),
)

Content based with genre
count    11214.000000
mean         0.021224
std          0.115793
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: ratio, dtype: float64
count    11214.00000
mean         0.07125
std          0.29322
min          0.00000
25%          0.00000
50%          0.00000
75%          0.00000
max          4.00000
Name: numberRecommendationUserHas, dtype: float64
count    11214.000000
mean         2.184680
std          7.700844
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max        211.000000
Name: numberGamesUserHasInTest, dtype: float64


  values = values.astype(str)


In [4]:
# Evaluate the recommendations stored in Test_Files/tags.csv.
evaluate(
    name_algo="Content based with popular tags",
    location_algo_output_file=pathlib.Path('Test_Files/tags.csv'),
    eval_output_file=pathlib.Path('Content_Output/tags_ratio.csv'),
)

Content based with popular tags
count    11214.000000
mean         0.018533
std          0.105380
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: ratio, dtype: float64
count    11214.000000
mean         0.066524
std          0.282106
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          4.000000
Name: numberRecommendationUserHas, dtype: float64
count    11214.000000
mean         2.184680
std          7.700844
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max        211.000000
Name: numberGamesUserHasInTest, dtype: float64


  values = values.astype(str)


In [8]:
# Evaluate the recommendations stored in Test_Files/publisher.csv.
evaluate(
    name_algo="Content based with publisher",
    location_algo_output_file=pathlib.Path('Test_Files/publisher.csv'),
    eval_output_file=pathlib.Path('Content_Output/publisher_ratio.csv'),
)

Content based with publisher
count    11214.000000
mean         0.027855
std          0.133903
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: ratio, dtype: float64
count    11214.000000
mean         0.099340
std          0.384222
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          6.000000
Name: numberRecommendationUserHas, dtype: float64
count    11214.000000
mean         2.184680
std          7.700844
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max        211.000000
Name: numberGamesUserHasInTest, dtype: float64


  values = values.astype(str)


In [9]:
# Evaluate the recommendations stored in Test_Files/game_details.csv.
evaluate(
    name_algo="Content based with game_details",
    location_algo_output_file=pathlib.Path('Test_Files/game_details.csv'),
    eval_output_file=pathlib.Path('Content_Output/gamedetails_ratio.csv'),
)

Content based with game_details
count    11214.000000
mean         0.025368
std          0.129171
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: ratio, dtype: float64
count    11214.000000
mean         0.080524
std          0.330970
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          4.000000
Name: numberRecommendationUserHas, dtype: float64
count    11214.000000
mean         2.184680
std          7.700844
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max        211.000000
Name: numberGamesUserHasInTest, dtype: float64


  values = values.astype(str)


In [10]:
# Evaluate the recommendations stored in Test_Files/genre_pub_dev.csv.
evaluate(
    name_algo="Content based with genre, publisher, developer",
    location_algo_output_file=pathlib.Path('Test_Files/genre_pub_dev.csv'),
    eval_output_file=pathlib.Path('Content_Output/genre_pub_dev_ratio.csv'),
)

Content based with genre, publisher, developer
count    11214.000000
mean         0.033220
std          0.149685
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: ratio, dtype: float64
count    11214.000000
mean         0.103264
std          0.384812
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          4.000000
Name: numberRecommendationUserHas, dtype: float64
count    11214.000000
mean         2.184680
std          7.700844
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max        211.000000
Name: numberGamesUserHasInTest, dtype: float64


  values = values.astype(str)


In [11]:
# Evaluate the recommendations stored in Test_Files/genre_tags_dev.csv.
evaluate(
    name_algo="Content based with genre, tags, developer",
    location_algo_output_file=pathlib.Path('Test_Files/genre_tags_dev.csv'),
    eval_output_file=pathlib.Path('Content_Output/genre_tags_dev_ratio.csv'),
)

Content based with genre, tags, developer
count    11214.000000
mean         0.017870
std          0.098974
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: ratio, dtype: float64
count    11214.000000
mean         0.071250
std          0.301024
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          4.000000
Name: numberRecommendationUserHas, dtype: float64
count    11214.000000
mean         2.184680
std          7.700844
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max        211.000000
Name: numberGamesUserHasInTest, dtype: float64


  values = values.astype(str)


In [12]:
# Evaluate the recommendations stored in Test_Files/genre_tags_gamedetail.csv.
evaluate(
    name_algo="Content based with genre, tags, game details",
    location_algo_output_file=pathlib.Path('Test_Files/genre_tags_gamedetail.csv'),
    eval_output_file=pathlib.Path('Content_Output/genre_tags_gamedetail_ratio.csv'),
)

Content based with genre, tags, game details
count    11214.000000
mean         0.027151
std          0.126931
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: ratio, dtype: float64
count    11214.000000
mean         0.092563
std          0.340485
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          4.000000
Name: numberRecommendationUserHas, dtype: float64
count    11214.000000
mean         2.184680
std          7.700844
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max        211.000000
Name: numberGamesUserHasInTest, dtype: float64


  values = values.astype(str)
