In [1]:
import pandas as pd

In [2]:
# Names of the user, item and rating columns, referenced by every later cell.
user_id_col, item_id_col, rating_col = 'learner_id', 'media_id', 'like_rating'

In [3]:
# Load only the columns this analysis uses. The id columns are forced to
# object dtype and event_time is parsed into datetimes on read.
df = pd.read_csv(
    '../data/events.csv',
    parse_dates=['event_time'],
    dtype={user_id_col: object, item_id_col: object},
    usecols=[user_id_col, item_id_col, rating_col, 'event_time'],
)
df.shape

(424659, 4)

### Data Type

In [4]:
# Confirm the dtypes of the three key columns after loading.
df.dtypes.loc[[user_id_col, item_id_col, rating_col]]

learner_id     object
media_id       object
like_rating     int64
dtype: object

### Stats

In [5]:
# Distinct users and items present in the raw events.
no_of_users = len(pd.unique(df[user_id_col]))
no_of_items = len(pd.unique(df[item_id_col]))

# Compare the number of rating events we have with the size of a fully
# populated user x item grid.
given_no_of_ratings = df.shape[0]
ideal_no_of_ratings = no_of_users * no_of_items

no_of_users, no_of_items, given_no_of_ratings, ideal_no_of_ratings

(44048, 8110, 424659, 357229280)

In [6]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,learner_id,media_id,event_time,like_rating
0,-140668,8918,2018-04-10 22:48:19,1
1,-140665,4,2018-04-10 22:17:43,1
2,-140665,8882,2018-04-10 22:26:35,1
3,-140665,8917,2018-04-10 22:34:38,1
4,-140615,11901,2018-04-12 18:57:23,3


In [7]:
# Summarise the timestamps: the covered time span and how many are unique.
df['event_time'].describe()

count                  424659
unique                 420610
top       2018-02-24 21:58:43
freq                        3
first     2016-06-07 11:56:14
last      2018-04-27 23:57:34
Name: event_time, dtype: object

In [8]:
# Distribution of rating values across all events.
df[rating_col].value_counts()

3    229588
2    109365
1     85703
0         3
Name: like_rating, dtype: int64

In [9]:
# Show the events whose rating is 0.
df.loc[df[rating_col].eq(0)]

Unnamed: 0,learner_id,media_id,event_time,like_rating
182395,975608,8500,2017-06-28 15:33:13,0
199531,979353,32,2017-10-18 17:10:24,0
199532,979353,329,2017-10-18 17:11:23,0


In [10]:
# Drop the events whose rating is 0, then re-check the rating distribution.
# Note: this rebinds `df`, so every later cell sees the filtered frame.
df = df.loc[df[rating_col].ne(0)]
df[rating_col].value_counts()

3    229588
2    109365
1     85703
Name: like_rating, dtype: int64

### Analyse if learners provide multiple ratings

In [11]:
# Count the rating events recorded for each (user, item) pair.
no_of_rating_df = (
    df[[user_id_col, item_id_col, rating_col]]
    .groupby([user_id_col, item_id_col])
    .count()
    .rename(columns={rating_col: 'no_of_ratings'})
    .reset_index()
)
no_of_rating_df.head()

Unnamed: 0,learner_id,media_id,no_of_ratings
0,-100002,584,1
1,-100002,8501,1
2,-100003,11501,1
3,-100003,11683,1
4,-100003,11737,1


In [12]:
# How many (user, item) pairs have a single rating event versus several.
no_of_rating_df['no_of_ratings'].value_counts()

1    303318
2     60669
Name: no_of_ratings, dtype: int64

In [13]:
# Collect the (user, item) pairs that were rated exactly twice.
# Zipping the two id columns yields the same tuples as iterating with
# iterrows(), without building a Series object for every row.
twice_rated_df = no_of_rating_df[no_of_rating_df['no_of_ratings'] == 2]
dual_rating_dataset = set(zip(twice_rated_df[user_id_col],
                              twice_rated_df[item_id_col]))
len(dual_rating_dataset)

60669

In [14]:
# Inspect the events for one arbitrary pair that was rated twice.
# next(iter(...)) takes the first element in iteration order and raises
# StopIteration on an empty set, instead of silently leaving user_id /
# item_id undefined or holding values from an earlier cell.
user_id, item_id = next(iter(dual_rating_dataset))
df[(df[user_id_col] == user_id) & (df[item_id_col] == item_id)].T

Unnamed: 0,291510,291511
learner_id,1013491,1013491
media_id,17,17
event_time,2018-01-30 19:47:45,2018-01-30 10:45:13
like_rating,3,3


In [15]:
# Count the distinct rating values each (user, item) pair received.
# The built-in 'nunique' aggregation replaces a Python lambda that was
# called once per group. 'nunique' ignores missing values; the rating
# column is int64 here, so the counts are unchanged.
different_ratings_df = (
    df[[user_id_col, item_id_col, rating_col]]
    .groupby([user_id_col, item_id_col])
    .agg({rating_col: 'nunique'})
    .rename(columns={rating_col: 'distinct_no_of_ratings'})
    .reset_index()
)
different_ratings_df.head()

Unnamed: 0,learner_id,media_id,distinct_no_of_ratings
0,-100002,584,1
1,-100002,8501,1
2,-100003,11501,1
3,-100003,11683,1
4,-100003,11737,1


In [16]:
# How many pairs received one distinct rating value versus two.
different_ratings_df['distinct_no_of_ratings'].value_counts()

1    358741
2      5246
Name: distinct_no_of_ratings, dtype: int64

In [17]:
# Collect the (user, item) pairs whose two ratings differ from each other.
# Zipping the id columns avoids the per-row overhead of iterrows() while
# producing the same set of tuples.
differing_pairs_df = different_ratings_df[different_ratings_df['distinct_no_of_ratings'] == 2]
distinct_rating_dataset = set(zip(differing_pairs_df[user_id_col],
                                  differing_pairs_df[item_id_col]))
len(distinct_rating_dataset)

5246

In [18]:
# Inspect the events for one arbitrary pair whose ratings differ.
# next(iter(...)) raises StopIteration on an empty set rather than
# silently reusing user_id / item_id left over from an earlier cell.
user_id, item_id = next(iter(distinct_rating_dataset))
df[(df[user_id_col] == user_id) & (df[item_id_col] == item_id)].T

Unnamed: 0,274501,274502
learner_id,1009601,1009601
media_id,13416,13416
event_time,2018-01-20 11:08:17,2018-01-05 13:48:22
like_rating,1,3


In [19]:
# Pairs rated twice with the same value: the twice-rated pairs minus the
# pairs whose ratings differ.
identical_rating_dataset = dual_rating_dataset.difference(distinct_rating_dataset)
len(identical_rating_dataset)

55423

In [20]:
# Inspect the events for one arbitrary pair rated twice with the same value.
# The chosen user_id / item_id are reused by later cells, so fail loudly
# (StopIteration) on an empty set rather than carrying over stale values.
user_id, item_id = next(iter(identical_rating_dataset))
df[(df[user_id_col] == user_id) & (df[item_id_col] == item_id)].T

Unnamed: 0,291510,291511
learner_id,1013491,1013491
media_id,17,17
event_time,2018-01-30 19:47:45,2018-01-30 10:45:13
like_rating,3,3


### Use Latest Rating

In [21]:
# Order events chronologically so the most recent rating of each pair comes
# last. Some timestamps repeat in this data, and the default sort algorithm
# is not stable, so a stable sort (mergesort) is used to keep rows with equal
# timestamps in their original order and make the result reproducible.
sorted_df = df.sort_values(by='event_time', kind='mergesort')

In [22]:
# After sorting, the earliest events appear first.
sorted_df.head()

Unnamed: 0,learner_id,media_id,event_time,like_rating
181705,-10035,8105,2016-06-07 11:56:14,2
181703,-10039,8195,2016-06-07 12:39:25,2
181682,-10041,7709,2016-06-07 13:39:10,3
181681,-10042,6843,2016-06-07 13:49:28,2
181700,-10039,7558,2016-06-08 16:22:13,2


In [23]:
# Show the sample pair's events in chronological order (user_id / item_id
# come from the sample picked in an earlier cell).
sorted_df.loc[sorted_df[user_id_col].eq(user_id) & sorted_df[item_id_col].eq(item_id)].T

Unnamed: 0,291511,291510
learner_id,1013491,1013491
media_id,17,17
event_time,2018-01-30 10:45:13,2018-01-30 19:47:45
like_rating,3,3


In [24]:
# sorted_df is ordered by event_time, so keeping the last occurrence of each
# (user, item) pair retains only its most recent rating.
latest_rating_df = sorted_df.drop_duplicates(
    subset=[user_id_col, item_id_col],
    keep='last',
)
latest_rating_df.shape

(363987, 4)

In [25]:
# Confirm that only one row remains for the sample pair after de-duplication.
latest_rating_df.loc[
    latest_rating_df[user_id_col].eq(user_id) & latest_rating_df[item_id_col].eq(item_id)
].T

Unnamed: 0,291510
learner_id,1013491
media_id,17
event_time,2018-01-30 19:47:45
like_rating,3


In [26]:
# Re-count rating events per (user, item) pair on the de-duplicated frame;
# every pair should now have exactly one.
latest_rating_df_no_of_rating_df = (
    latest_rating_df[[user_id_col, item_id_col, rating_col]]
    .groupby([user_id_col, item_id_col])
    .count()
    .rename(columns={rating_col: 'no_of_ratings'})
    .reset_index()
)
latest_rating_df_no_of_rating_df['no_of_ratings'].value_counts()

1    363987
Name: no_of_ratings, dtype: int64

In [27]:
# Recompute the user / item / rating counts on the de-duplicated ratings.
no_of_users = len(pd.unique(latest_rating_df[user_id_col]))
no_of_items = len(pd.unique(latest_rating_df[item_id_col]))

# Ratings present versus the size of a fully populated user x item grid.
given_no_of_ratings = latest_rating_df.shape[0]
ideal_no_of_ratings = no_of_users * no_of_items

no_of_users, no_of_items, given_no_of_ratings, ideal_no_of_ratings

(44048, 8110, 363987, 357229280)

In [29]:
# Persist the de-duplicated ratings without the DataFrame index column.
latest_rating_df.to_csv(
    path_or_buf='../data/latest_rating.csv',
    index=False,
)