### Bring all user interactions together

In [1]:
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 4000)

In [3]:
colnames=['item_id', 'user_id', 'rating_id'] 

cyprian_1 = pd.read_csv("all_sets/cyprian_ratings_1.csv")
cyprian_2 = pd.read_csv("all_sets/cyprian_ratings_2.csv", names=colnames, header=None)
tanush = pd.read_csv("all_sets/tanush_ratings.csv", names=colnames, header=None)
annabelle = pd.read_csv("all_sets/annabelle_ratings.csv", names=colnames, header=None)
arz = pd.read_csv("all_sets/arz_ratings.csv", names=colnames, header=None)

all_events = pd.read_csv("events_and_locations.csv")
tanush.head()

Unnamed: 0,item_id,user_id,rating_id
0,840,0,6
1,685,0,7
2,732,0,10
3,900,0,10
4,242,0,10


In [5]:
full_df = cyprian_1

for df in [cyprian_2, tanush, annabelle, arz]: # add all people's interactions later
    adjustment_id_val = full_df["user_id"].iloc[-1]
    df["user_id"] += adjustment_id_val
    full_df = pd.concat([full_df, df], ignore_index=True)

In [6]:
# Remove duplicate item_id AND user_id pairs
pre_dupl = len(full_df)
full_df = full_df.drop_duplicates(subset=['user_id', 'item_id'])
post_dupl = len(full_df)
print(pre_dupl - post_dupl, "duplicates removed")

144 duplicates removed


In [7]:
# Get data on 
full_df.head()
len(full_df)
full_df["rating_id"].mean()

Unnamed: 0,item_id,user_id,rating_id
0,206,1,9
1,1014,1,3
2,231,1,7
3,725,1,8
4,1148,1,2


3797

5.769818277587569

In [9]:
# Check for general graph connectivity
grouped_counts_bottom = full_df.groupby('item_id').count().sort_values(by="rating_id")
grouped_counts_top = full_df.groupby('item_id').count().sort_values(by="rating_id", ascending=False)

all_values = set(range(len(all_events)))
observed_values = set(full_df['item_id'])
missing_values = all_values - observed_values
missing_values_list = sorted(list(missing_values))

print(len(missing_values_list), "items with no edges!!! This is bad --> in an ideal world all items average deg(3)")

print("Least but connected:")
grouped_counts_bottom.head(20)
print("Most connected:")
grouped_counts_top.head(20)

54 items with no edges!!! This is bad --> in an ideal world all items average deg(3)
Least but connected:


Unnamed: 0_level_0,user_id,rating_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1047,1,1
196,1,1
1118,1,1
862,1,1
1023,1,1
494,1,1
490,1,1
489,1,1
206,1,1
859,1,1


Most connected:


Unnamed: 0_level_0,user_id,rating_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1235,9,9
730,9,9
1058,8,8
770,8,8
1029,7,7
732,7,7
1180,7,7
1216,7,7
762,7,7
204,7,7


In [10]:
print("Expected interactions after data generation: ", len(full_df))

Expected interactions after data generation:  3797


In [11]:
full_df.to_csv("full_interaction.csv")