In [1]:
import numpy as np
import pandas as pd
import random
import scipy

from random import shuffle
from scipy import stats

In [2]:
def generator():
    session = stats.geom.rvs(size = 4831, p = 0.3) # dataset has 4831 rows
    return session

user_session = generator()

In [3]:
count = {x: list(user_session).count(x) for x in user_session}
count

{2: 1006,
 3: 721,
 4: 490,
 1: 1464,
 9: 87,
 6: 230,
 13: 17,
 11: 39,
 5: 357,
 8: 132,
 14: 16,
 17: 5,
 7: 174,
 10: 41,
 15: 11,
 19: 2,
 16: 10,
 12: 22,
 25: 1,
 20: 2,
 18: 3,
 22: 1}

# Generating User Id and Session Id

In [4]:
user_id = range(1, 4831)
user_id_session = list(zip(user_id, [10 * i for i in user_session]))

# number of articles served
articles = 0
for i in range(len(user_id_session)):
    articles -= -user_id_session[i][1]

print("Number of articles served:", articles)

Number of articles served: 159570


In [5]:
user_ids = []
for i in range(len(user_id_session)):
    for j in range(user_id_session[i][1]):
        user_ids.append(user_id_session[i][0])


session_list = list(user_session)
session_id = []
for i in session_list:
    for j in range(1, i+1):
        session_id.append([j for i in range(10)])

session_id = np.array(session_id).flatten()
user_session = list(zip(user_ids, session_id))

In [6]:
data = pd.DataFrame(user_session, columns = ['UserId', 'SessionId'])
print(data.shape)
data.head(11)

(159570, 2)


Unnamed: 0,UserId,SessionId
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [7]:
data.tail(11)

Unnamed: 0,UserId,SessionId
159559,4830,4
159560,4830,5
159561,4830,5
159562,4830,5
159563,4830,5
159564,4830,5
159565,4830,5
159566,4830,5
159567,4830,5
159568,4830,5


**UserId, SessionId generated sucessfully**

# ArticleId Served

In [8]:
article_id = []

for a in range(articles):
    article_id.append(np.random.randint(1, 4831))

len(article_id)

159570

**Difference between np.random.randint and random.randint**

* np.random.randint randomly initialises the whole array at once

* random.randint randomly initialises one element at a time

Reference - To serve articles randomly [shuffle in Python](https://pynative.com/python-random-shuffle/)

In [9]:
shuffle(article_id)
data['ArticleID Served'] = article_id
data.head()

Unnamed: 0,UserId,SessionId,ArticleID Served
0,1,1,3793
1,1,1,3324
2,1,1,1035
3,1,1,1551
4,1,1,2568


# Article Rank

In [10]:
article_rank = []
i = 1

for ar in range(articles):
    article_rank.append(i)
    i += 1
    if i > 10:
        i = 1

len(article_rank)

159570

In [11]:
data['Article Rank'] = article_rank
data.head(12)

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank
0,1,1,3793,1
1,1,1,3324,2
2,1,1,1035,3
3,1,1,1551,4
4,1,1,2568,5
5,1,1,1926,6
6,1,1,3210,7
7,1,1,760,8
8,1,1,319,9
9,1,1,565,10


# Click

To generate [Random booleans](https://xspdf.com/help/51744035.html) 

In [12]:
click_session = []

for c in range(articles):
    click_session.append(random.choice([True, False]))
    
len(click_session)

159570

In [13]:
data['Click'] = click_session
data.head(12)

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click
0,1,1,3793,1,False
1,1,1,3324,2,True
2,1,1,1035,3,True
3,1,1,1551,4,False
4,1,1,2568,5,True
5,1,1,1926,6,False
6,1,1,3210,7,False
7,1,1,760,8,False
8,1,1,319,9,True
9,1,1,565,10,True


# Time Spent

In [14]:
time_session = []

for t in range(articles):
    time_session.append(np.random.randint(10, 100)) # Least interesting story for 10 sec and most for 100 sec
    
len(time_session)

159570

In [15]:
for i in range(len(click_session)):
    if click_session[i] is True:
        time_session[i] = time_session[i]
    else:
        time_session[i] = 0

data['Time Spent (seconds)'] = time_session
data.head()

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click,Time Spent (seconds)
0,1,1,3793,1,False,0
1,1,1,3324,2,True,60
2,1,1,1035,3,True,51
3,1,1,1551,4,False,0
4,1,1,2568,5,True,10


In [16]:
data.tail()

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click,Time Spent (seconds)
159565,4830,5,4542,6,False,0
159566,4830,5,1580,7,True,32
159567,4830,5,1953,8,True,35
159568,4830,5,3740,9,False,0
159569,4830,5,845,10,False,0


In [17]:
data.to_csv('user_profile.csv', index = False)

User profile completed

# Ratings
For collaborative filtering only.

In [18]:
i = data[((data['Time Spent (seconds)'] == 0))].index
final_data = data.drop(i)

final_data.head()

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click,Time Spent (seconds)
1,1,1,3324,2,True,60
2,1,1,1035,3,True,51
4,1,1,2568,5,True,10
8,1,1,319,9,True,81
9,1,1,565,10,True,86


In [19]:
final_data.tail()

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click,Time Spent (seconds)
159559,4830,4,236,10,True,93
159560,4830,5,1724,1,True,89
159561,4830,5,2274,2,True,61
159566,4830,5,1580,7,True,32
159567,4830,5,1953,8,True,35


In [20]:
data.to_csv('Final_data.csv')

In [21]:
ratings = []

for r in range(len(final_data['UserId'])):
    ratings.append(np.random.randint(0, 6))
    
len(ratings)

79840

In [22]:
final_data['Ratings (0 - 5)'] = ratings
final_data.head()

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click,Time Spent (seconds),Ratings (0 - 5)
1,1,1,3324,2,True,60,2
2,1,1,1035,3,True,51,4
4,1,1,2568,5,True,10,4
8,1,1,319,9,True,81,1
9,1,1,565,10,True,86,0


In [23]:
final_data.tail()

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click,Time Spent (seconds),Ratings (0 - 5)
159559,4830,4,236,10,True,93,5
159560,4830,5,1724,1,True,89,4
159561,4830,5,2274,2,True,61,3
159566,4830,5,1580,7,True,32,5
159567,4830,5,1953,8,True,35,4


In [24]:
final_data.to_csv('Ratings.csv', index = False)