# Recommendation System

At Pratilipi, a single story is called a *pratilipi*. The attached data set contains `user_id`, `pratilipi_id`, and a date; each row records which user read which story on a particular date.

1. Can you build a model from the data set that predicts which pratilipis (at least 5) each user is going to read later?

2. Arrange the dataset in ascending order of time, use the first 75% of the data for training and evaluate your model on the next 25% of the data.

## Data gathering

In [1]:
!gdown --id 1UHJDbnbndi2G9dEgVnNSrUSAXa_QDYBo
!unzip -q ds-assignment.zip

Downloading...
From: https://drive.google.com/uc?id=1UHJDbnbndi2G9dEgVnNSrUSAXa_QDYBo
To: /content/ds-assignment.zip
100% 184M/184M [00:03<00:00, 50.3MB/s]


## Imports

In [2]:
import pickle
import numpy as np
import pandas as pd
from typing import Dict, Set, List
from sklearn.metrics.pairwise import cosine_similarity

## Read the data

In [3]:
# Load the user/story interaction log and preview the first rows.
user_df = pd.read_csv('./ds-assignment/user-interactions.csv')
user_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,pratilipi_id,read_percent,updated_at
0,0,5506791963854965,1377786220672965,100.0,2022-03-23 00:08:26.227
1,1,5506791979071996,1377786219742624,29.0,2022-03-23 00:08:26.220
2,2,5506791980256358,1377786217096334,22.0,2022-03-23 00:08:26.020
3,3,5506791988747277,1377786224767880,100.0,2022-03-23 00:08:25.306
4,4,5506791992372558,1377786218111595,100.0,2022-03-23 00:08:25.250


In [4]:
# (rows, columns) of the interaction log.
print(user_df.shape)

(10000000, 5)


In [5]:
# Load the per-story metadata (author, category, reading time, timestamps) and preview it.
metadata_df = pd.read_csv('./ds-assignment/metadata.csv')
metadata_df.head()

Unnamed: 0,author_id,pratilipi_id,category_name,reading_time,updated_at,published_at
0,-3418949279741297,1025741862639304,translation,0,2020-08-19 15:26:13,2016-09-30 10:37:04
1,-2270332351871840,1377786215601277,translation,171,2021-01-21 16:27:07,2018-06-11 13:17:48
2,-2270332352037261,1377786215601962,translation,92,2020-09-29 12:33:57,2018-06-12 04:19:12
3,-2270332352521845,1377786215640994,translation,0,2019-10-17 09:03:37,2019-09-26 14:58:53
4,-2270332349665658,1377786215931338,translation,47,2020-05-05 11:33:41,2018-11-25 12:28:23


In [6]:
# (rows, columns) of the metadata table.
print(metadata_df.shape)

(954501, 6)


### Check the detailed information of the dataframes

In [7]:
# Column dtypes and memory footprint of the interaction log.
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 5 columns):
 #   Column        Dtype  
---  ------        -----  
 0   Unnamed: 0    int64  
 1   user_id       int64  
 2   pratilipi_id  int64  
 3   read_percent  float64
 4   updated_at    object 
dtypes: float64(1), int64(3), object(1)
memory usage: 381.5+ MB


In [8]:
# Column dtypes and non-null counts of the metadata table.
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954501 entries, 0 to 954500
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   author_id      954501 non-null  int64 
 1   pratilipi_id   954501 non-null  int64 
 2   category_name  954501 non-null  object
 3   reading_time   954501 non-null  int64 
 4   updated_at     954501 non-null  object
 5   published_at   954494 non-null  object
dtypes: int64(3), object(3)
memory usage: 43.7+ MB


### Describe the entire dataset

In [9]:
# Summary statistics of the numeric interaction columns.
user_df.describe()

Unnamed: 0.1,Unnamed: 0,user_id,pratilipi_id,read_percent
count,10000000.0,10000000.0,10000000.0,10000000.0
mean,5000000.0,5489158000000000.0,1369415000000000.0,93.25429
std,2886751.0,160567900000000.0,122188000000000.0,21.63297
min,0.0,3255388000000000.0,-5375941000000000.0,0.0
25%,2500000.0,5506792000000000.0,1377786000000000.0,100.0
50%,5000000.0,5506792000000000.0,1377786000000000.0,100.0
75%,7499999.0,5506792000000000.0,1377786000000000.0,100.0
max,9999999.0,5506792000000000.0,1377786000000000.0,2400.0


In [10]:
# Summary statistics of the numeric metadata columns.
metadata_df.describe()

Unnamed: 0,author_id,pratilipi_id,reading_time
count,954501.0,954501.0,954501.0
mean,-2379597000000000.0,1368571000000000.0,351.979334
std,392202000000000.0,116110500000000.0,513.959547
min,-9070332000000000.0,-873461100000000.0,0.0
25%,-2270332000000000.0,1377786000000000.0,111.0
50%,-2270332000000000.0,1377786000000000.0,256.0
75%,-2270332000000000.0,1377786000000000.0,461.0
max,-2270332000000000.0,1377786000000000.0,78983.0


## Data Cleaning

### Check `null` values

In [11]:
# Number of missing values per column in the interaction log.
user_df.isnull().sum()

Unnamed: 0      0
user_id         0
pratilipi_id    0
read_percent    0
updated_at      0
dtype: int64

In [12]:
# Number of missing values per column in the metadata table.
metadata_df.isnull().sum()

author_id        0
pratilipi_id     0
category_name    0
reading_time     0
updated_at       0
published_at     7
dtype: int64

In [13]:
# Drop every metadata row that has a missing value in any column.
metadata_df = metadata_df.dropna()

In [14]:
# Re-check: every column should now report zero missing values.
metadata_df.isnull().sum()

author_id        0
pratilipi_id     0
category_name    0
reading_time     0
updated_at       0
published_at     0
dtype: int64

### Check for duplicate rows

In [15]:
def get_duplicate_rows(dataFrame):
    """Print the number of rows that repeat an earlier row of `dataFrame`.

    A row counts as a duplicate when `DataFrame.duplicated()` flags it, i.e.
    all of its column values match a row that appeared before it. Nothing is
    returned; the count is written to stdout.
    """
    is_repeat = dataFrame.duplicated()
    repeated = dataFrame.loc[is_repeat]
    print(len(repeated))

In [16]:
# Print the duplicate-row count for the interaction log, then for the metadata table.
get_duplicate_rows(user_df)
get_duplicate_rows(metadata_df)

0
0


## Data EDA

### Getting all the common categories

In [17]:
# Number of stories per category, most frequent first.
metadata_df["category_name"].value_counts()

romance                      193218
shortstories                 102096
social                        73919
suspense                      64041
family                        58515
novels                        53584
life                          51356
moral-inspiring               47421
women                         37167
horror                        28506
entertainment                 23362
action-and-adventure          21589
experiences-and-memories      20275
webseries                     16227
relegion-and-spiritual        15476
comedy                        14893
Pratilipi-kalamkar-samman     13237
mythology                     12323
children                      10196
Indiawale                      9674
Pratilipi-Awards-Hindi         9342
fantasy                        9324
swahindi2                      7853
drama                          7387
crime                          7151
politics                       5810
translation                    4250
short-story-challenge       

### Read percent  

In [18]:
# Distribution of read_percent values, most frequent first.
user_df["read_percent"].value_counts()

100.000000    8702791
99.000000      221116
50.000000       66502
34.000000       49956
25.000000       45628
               ...   
4.127959            1
81.238440           1
1.893793            1
4.924527            1
76.917840           1
Name: read_percent, Length: 24772, dtype: int64

## Data preprocessing

### Convert `float` type to `int`

In [19]:
# Truncate read_percent towards zero and store it as an integer column.
# `astype` does the conversion in one vectorised pass instead of calling
# Python's int() once per row.
user_df['read_percent'] = user_df['read_percent'].astype('int64')

In [20]:
# Keep only interactions whose read_percent is exactly 100: anything above
# 100 or below 100 is dropped from `user_df` in place.
not_fully_read = user_df["read_percent"] != 100
user_df.drop(user_df[not_fully_read].index, inplace=True)

In [21]:
# Sort newest-first (descending) so the head of the frame holds the most recent interactions.
# `updated_at` was read as a plain string column, so rows are ordered by string comparison.
user_df.sort_values('updated_at', ascending=False, inplace=True) # Sort the dataframe by `updated_at`, newest first

## Split the data

In [22]:
# `user_df` is sorted newest-first, so the leading 25% of rows are the most
# recent interactions (test set) and the remaining 75% are the older ones
# (training set).
test_split = 0.25
split_position = int(test_split * len(user_df))

# Positional slicing replaces np.split, which is deprecated for DataFrames in
# recent pandas releases; the resulting frames are the same.
test = user_df.iloc[:split_position]
train = user_df.iloc[split_position:]

# The row index is written as an extra leading column in both files.
test.to_csv('./test.csv', sep=',', encoding='utf-8')
train.to_csv('./train.csv', sep=',', encoding='utf-8')

### Read the test and train data

In [23]:
# Reload both splits from the CSV files written by the split cell and report their shapes.
# The saved row index comes back as an additional unnamed column.
train_df = pd.read_csv('./train.csv')
print(train_df.shape)

test_df = pd.read_csv('./test.csv')
print(test_df.shape)

(6527094, 6)
(2175697, 6)


## Get unique users and number of the unique users

In [24]:
# Distinct user ids present in the training split.
unique_users = set(train['user_id'])
# Count the distinct users, not the training rows: this value is used as the
# number of rows stored in the pivot/similarity pickles.
unique_no_users = len(unique_users)

## Get unique pratilipi and number of the unique pratilipi

In [25]:
# Distinct pratilipi ids present in the training split.
unique_pratilipi = set(train['pratilipi_id'])
# Count the distinct pratilipis, not the training rows: this value is the
# width of every pivot row.
unique_no_pratilipi = len(unique_pratilipi)

In [26]:
def create_dict(unique_features: Set, dict_one: Dict, dict_two: Dict):
    """Fill two lookup tables that translate between features and positions.

    Each element of `unique_features` is numbered 0, 1, 2, ... in iteration
    order. `dict_one` receives feature -> position and `dict_two` receives
    position -> feature. Both dictionaries are updated in place and returned
    as a `(dict_one, dict_two)` tuple.
    """
    for position, feature in enumerate(unique_features):
        dict_one[feature] = position
        dict_two[position] = feature
    return dict_one, dict_two

In [27]:
# Two-way lookup between user ids and their row position.
user_to_index, index_to_user = create_dict(unique_users, {}, {})

# Two-way lookup between pratilipi ids and their column position.
pratilipi_to_index, index_to_pratilipi = create_dict(unique_pratilipi, {}, {})

## Create pivot table

In [28]:
# For every training user, collect the (pratilipi_id, read_percent) pairs they read.
user_to_read_pratilipis = {user: [] for user in unique_users}

# Walk the three needed columns in lockstep instead of DataFrame.iterrows(),
# which builds a full Series object for each row.
for user_id, pratilipi_id, read_percent in zip(
    train['user_id'], train['pratilipi_id'], train['read_percent']
):
    user_to_read_pratilipis[user_id].append((pratilipi_id, read_percent))

In [1]:
# Write one dense row per user to ./pivot.pickle, in `unique_users` iteration order.
# Each row has one slot per pratilipi column; -1 marks "not read by this user",
# any other value is the stored read_percent.
with open('./pivot.pickle', "wb") as pivot_file:
    for user in unique_users:
        pivot_row = np.full(unique_no_pratilipi, -1)
        # A user without reads simply keeps the all -1 row.
        for pratilipi_id, read_percent in user_to_read_pratilipis[user]:
            pivot_row[pratilipi_to_index[pratilipi_id]] = read_percent
        pickle.dump(pivot_row, pivot_file)

## Create Similarity Matrix

In [None]:
def get_row_from_pickle_file(filename: str, row_index: int):
    """Return the `row_index`-th object stored in a multi-object pickle file.

    The file is expected to contain objects written by consecutive
    `pickle.dump` calls. Objects are read from the start of the file, so the
    cost of a lookup grows with `row_index`.

    Args:
        filename: Path of the pickle file to read.
        row_index: Zero-based position of the wanted object.

    Returns:
        The unpickled object, or None when `row_index` is negative or the
        file holds fewer than `row_index + 1` objects.

    Note:
        `pickle.load` can execute arbitrary code; only use this on files
        produced by this notebook.
    """
    with open(filename, "rb") as pivot_file:
        if row_index < 0:
            return None
        try:
            # Skip the rows in front of the requested one.
            for _ in range(row_index):
                pickle.load(pivot_file)
            return pickle.load(pivot_file)
        except EOFError:
            # Ran off the end of the file: the requested row does not exist.
            return None

# Path of the pivot rows written by the pivot-table cell.
pivot_filename = './pivot.pickle'
# Number of pivot rows compared against the current user per batch.
SIMILARITY_CHUNK_SIZE = 100

# For every user, compute the cosine similarity to all users (including
# themselves) and append the resulting vector to ./similarity.pickle.
# NOTE(review): pivot rows use -1 for "not read", so those -1 entries take
# part in the cosine similarity — confirm this is intended.
with open('./similarity.pickle', "wb") as similarity_file:
    for cur_index in range(unique_no_users):

        # Shape (1, n_pratilipis) so it can be compared against a whole chunk.
        cur_feature = get_row_from_pickle_file(pivot_filename, cur_index).reshape(1, -1)

        cur_similarity = []
        with open(pivot_filename, "rb") as pivot_file:
            remaining = unique_no_users
            while remaining > 0:
                chunk = [
                    pickle.load(pivot_file)
                    for _ in range(min(SIMILARITY_CHUNK_SIZE, remaining))
                ]
                remaining -= len(chunk)

                # Only the current user's row against the chunk is needed, so
                # pass the two operands separately instead of building the
                # full (chunk + 1) x (chunk + 1) matrix.
                similarities = cosine_similarity(cur_feature, np.array(chunk))
                cur_similarity.extend(similarities[0])

        pickle.dump(np.array(cur_similarity), similarity_file)

## Store Recommendation

In [None]:
# Path of the pivot rows written by the pivot-table cell.
pivot_filename = './pivot.pickle'
NUM_SIMILAR_USERS = 5        # neighbours consulted per user
NUM_ITEMS_PER_NEIGHBOUR = 5  # candidate stories taken from each neighbour
NUM_RECOMMENDATIONS = 5      # stories stored per user

# For every user (in similarity-file order) store an array with the ids of the
# recommended pratilipis, best first. A user may get fewer than
# NUM_RECOMMENDATIONS entries when the neighbours offer too few unread stories.
with open('./recommendation.pickle', "wb") as recommendation_file:
    with open('./similarity.pickle', "rb") as similarity_file:
        for cur_user_index in range(unique_no_users):
            cur_user_similarity = pickle.load(similarity_file)

            # Rank all users by similarity, remove the user themselves first,
            # and only then cut to the top neighbours; cutting first could
            # leave one neighbour too many when the user is not in the cut.
            ranked_users = cur_user_similarity.argsort()[::-1]
            ranked_users = ranked_users[ranked_users != cur_user_index]
            top_similar_users_index = ranked_users[:NUM_SIMILAR_USERS]

            # Column indexes the current user has already read (-1 == unread).
            cur_user_scores = get_row_from_pickle_file(pivot_filename, cur_user_index)
            already_read = set(np.flatnonzero(cur_user_scores != -1).tolist())

            # pratilipi column index -> accumulated (score * similarity) weight.
            candidate_weights = {}

            for similar_user_index in top_similar_users_index:
                similar_user_scores = get_row_from_pickle_file(pivot_filename, similar_user_index)
                similarity = float(cur_user_similarity[similar_user_index])

                # Consider only stories the neighbour actually read, best score first.
                read_items = np.flatnonzero(similar_user_scores != -1)
                by_score = np.argsort(similar_user_scores[read_items])[::-1]

                taken = 0
                for item_index in read_items[by_score].tolist():
                    if item_index in already_read:
                        continue
                    weight = float(similar_user_scores[item_index]) * similarity
                    # The same story suggested by several neighbours adds up.
                    candidate_weights[item_index] = candidate_weights.get(item_index, 0.0) + weight
                    taken += 1
                    if taken == NUM_ITEMS_PER_NEIGHBOUR:
                        break

            # Rank candidate stories by weight and translate column indexes
            # back to pratilipi ids before storing them.
            top_items = sorted(candidate_weights, key=candidate_weights.get, reverse=True)
            top_pratilipis = [index_to_pratilipi[item] for item in top_items[:NUM_RECOMMENDATIONS]]

            pickle.dump(np.array(top_pratilipis, dtype=np.int64), recommendation_file)

## Evaluate the model

In [None]:
class accuracy_metrics:
    """Per-user evaluation counters filled in by `calculate_accuracy`."""

    def __init__(self, num_recommend_movies: int = 5):
        # Number of stories recommended to the user.
        # NOTE(review): 5 mirrors the top-5 cut-off used when storing
        # recommendations — confirm if that cut-off changes.
        self.num_recommend_movies = num_recommend_movies
        # Test-set interactions seen for this user.
        self.total_test_movies_watched = 0
        # Test-set interactions whose story was among the recommendations.
        self.cover_count = 0
        # Same as cover_count, but each hit is weighted by read_percent / 100.
        self.weighted_cover = 0.0
        self.cover_percentage = 0.0
        self.weighted_cover_percentage = 0.0


def calculate_accuracy(user_name_list: List, recommended_movies_list: List, test_path: str = './test.csv'):
    """Measure how many test-set reads were covered by the recommendations.

    Args:
        user_name_list: User ids to evaluate.
        recommended_movies_list: For each position in `user_name_list`, the
            collection of pratilipi ids recommended to that user.
        test_path: CSV with `user_id`, `pratilipi_id` and `read_percent`
            columns; defaults to the file written by the split cell.

    Returns:
        A list of `accuracy_metrics` objects aligned with `user_name_list`.
        `cover_percentage` is hits / min(test reads, recommendations) * 100
        and `weighted_cover_percentage` is the same with read_percent-weighted
        hits. Users without any test reads keep both percentages at 0.
    """
    test = pd.read_csv(test_path)

    accuracy_metrics_list = [accuracy_metrics() for _ in user_name_list]

    # user id -> first position in user_name_list (same result as list.index,
    # without rescanning the list for every test row).
    user_position = {}
    for position, user in enumerate(user_name_list):
        user_position.setdefault(user, position)

    recommended_sets = [set(recommended) for recommended in recommended_movies_list]

    for user_id, pratilipi_id, read_percent in zip(
        test["user_id"], test["pratilipi_id"], test["read_percent"]
    ):
        match_index = user_position.get(user_id)
        if match_index is None:
            continue  # test row of a user that is not being evaluated
        metrics = accuracy_metrics_list[match_index]
        metrics.total_test_movies_watched += 1
        if pratilipi_id in recommended_sets[match_index]:
            metrics.cover_count += 1
            metrics.weighted_cover += float(read_percent) / 100.0

    for metrics in accuracy_metrics_list:
        # A user cannot be credited for more hits than stories recommended
        # or stories read, whichever is smaller.
        denominator = min(metrics.total_test_movies_watched, metrics.num_recommend_movies)
        if denominator == 0:
            continue  # nothing to evaluate; percentages stay at 0
        metrics.cover_percentage = (metrics.cover_count * 1.0 / denominator) * 100
        metrics.weighted_cover_percentage = (metrics.weighted_cover * 1.0 / denominator) * 100

    return accuracy_metrics_list

In [None]:
# Evaluate the stored recommendations for every user that appears in the test split.
test_unique_users = list(set(test_df["user_id"]))
# NOTE(review): `get_recommendation` is not defined in the visible notebook. It is
# expected to return, for each user in the list, the collection of recommended
# pratilipi ids — confirm where it comes from.
recomended_pratilipis = get_recommendation(test_unique_users)

accuracy_metrics_list = calculate_accuracy(test_unique_users, recomended_pratilipis)

# Average the per-user percentages; dividing by the number of users is what
# makes these averages and not running totals.
num_evaluated_users = len(accuracy_metrics_list)
average_cover_percentage = 0
weighted_cover_percentage = 0
if num_evaluated_users > 0:
    average_cover_percentage = sum(
        accuracy_metrics_obj.cover_percentage for accuracy_metrics_obj in accuracy_metrics_list
    ) / num_evaluated_users
    weighted_cover_percentage = sum(
        accuracy_metrics_obj.weighted_cover_percentage for accuracy_metrics_obj in accuracy_metrics_list
    ) / num_evaluated_users

print(f'Average Cover Percentage: {average_cover_percentage}')
print(f'Weighted Cover Percentage: {weighted_cover_percentage}')