In [7]:
import pandas as pd
import numpy as np

In [8]:
data = pd.read_csv('behaviors.tsv', sep = '\t', names=["impressionId","userId","timestamp","click_history","impressions"])

In [9]:
data.head()

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


### Pre-Processing

In [10]:
def process_impression(impression_list):
    list_of_strings = impression_list.split()
    click = " ".join(sorted([x.split('-')[0] for x in list_of_strings if x.split('-')[1] == '1']))
    non_click = " ".join(sorted([x.split('-')[0] for x in list_of_strings if x.split('-')[1] == '0']))
    return click,non_click

# We can then indexize these two new columns:
data['click'], data['no_clicks'] = zip(*data['impressions'].map(process_impression))

In [11]:
def process_impression2(impression_list):
    list_of_strings = impression_list.split()
    imp_list = sorted([x.split('-')[0] for x in list_of_strings])
    return imp_list

# We can then indexize these two new columns:
data['impList'] = data['impressions'].map(process_impression2)

In [12]:
def process_view(viewList):
    return viewList.split(" ")

In [13]:
data['clickList'] = data['click'].map(process_impression2)
data['noClicksList'] = data['no_clicks'].map(process_impression2)
data["click_history"].fillna(' ', inplace = True)
data["viewHistoryList"] = data["click_history"].map(process_view)

In [14]:
data.head()

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,click,no_clicks,impList,clickList,noClicksList,viewHistoryList
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,N55689,N35729,"[N35729, N55689]",[N55689],[N35729],"[N55189, N42782, N34694, N45794, N18445, N6330..."
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,N17059,N14592 N20495 N20678 N22407 N33677 N39317 N429...,"[N14592, N17059, N20495, N20678, N22407, N3367...",[N17059],"[N14592, N20495, N20678, N22407, N33677, N3931...","[N31739, N6072, N63045, N23979, N35656, N43353..."
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,N23814,N10960 N11817 N12330 N13131 N16844 N23446 N238...,"[N10960, N11817, N12330, N13131, N16844, N2344...",[N23814],"[N10960, N11817, N12330, N13131, N16844, N2344...","[N10732, N25792, N7563, N21087, N41087, N5445,..."
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,N49685,N27581 N33632 N35729,"[N27581, N33632, N35729, N49685]",[N49685],"[N27581, N33632, N35729]","[N45729, N2203, N871, N53880, N41375, N43142, ..."
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,N8400,N10413 N11094 N11363 N11967 N12995 N13486 N160...,"[N10413, N11094, N11363, N11967, N12995, N1348...",[N8400],"[N10413, N11094, N11363, N11967, N12995, N1348...","[N10078, N56514, N14904, N33740]"


In [15]:
# print(type(data['click'][0]))
# print(type(data['click_history'][0]))
data.drop(['impressions','click','no_clicks'],axis=1)

Unnamed: 0,impressionId,userId,timestamp,click_history,impList,clickList,noClicksList,viewHistoryList
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,"[N35729, N55689]",[N55689],[N35729],"[N55189, N42782, N34694, N45794, N18445, N6330..."
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,"[N14592, N17059, N20495, N20678, N22407, N3367...",[N17059],"[N14592, N20495, N20678, N22407, N33677, N3931...","[N31739, N6072, N63045, N23979, N35656, N43353..."
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,"[N10960, N11817, N12330, N13131, N16844, N2344...",[N23814],"[N10960, N11817, N12330, N13131, N16844, N2344...","[N10732, N25792, N7563, N21087, N41087, N5445,..."
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,"[N27581, N33632, N35729, N49685]",[N49685],"[N27581, N33632, N35729]","[N45729, N2203, N871, N53880, N41375, N43142, ..."
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,"[N10413, N11094, N11363, N11967, N12995, N1348...",[N8400],"[N10413, N11094, N11363, N11967, N12995, N1348...","[N10078, N56514, N14904, N33740]"
...,...,...,...,...,...,...,...,...
156960,156961,U21593,11/14/2019 10:24:05 PM,N7432 N58559 N1954 N43353 N14343 N13008 N28833...,"[N11378, N11930, N12446, N14478, N21484, N2235...",[N50055],"[N11378, N11930, N12446, N14478, N21484, N2235...","[N7432, N58559, N1954, N43353, N14343, N13008,..."
156961,156962,U10123,11/13/2019 6:57:04 AM,N9803 N104 N24462 N57318 N55743 N40526 N31726 ...,"[N13105, N13235, N13540, N13579, N13907, N1398...",[N30212],"[N13105, N13235, N13540, N13579, N13907, N1398...","[N9803, N104, N24462, N57318, N55743, N40526, ..."
156962,156963,U75630,11/14/2019 10:58:13 AM,N29898 N59704 N4408 N9803 N53644 N26103 N812 N...,"[N10960, N1539, N17231, N19061, N20676, N23446...",[N46283],"[N10960, N1539, N17231, N19061, N20676, N23446...","[N29898, N59704, N4408, N9803, N53644, N26103,..."
156963,156964,U44625,11/13/2019 2:57:02 PM,N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...,"[N103, N12029, N12656, N12848, N13486, N1410, ...","[N18573, N20630, N21712, N366, N50007]","[N103, N12029, N12656, N12848, N13486, N1410, ...","[N4118, N47297, N3164, N43295, N6056, N38747, ..."


In [16]:
news = pd.read_csv('news.tsv',sep = '\t',names=["itemId","category","subcategory","title","abstract","url","title_entities","abstract_entities"])
news["category"].fillna(' ', inplace = True)
news["subcategory"].fillna(' ', inplace = True)
news["title"].fillna(' ', inplace = True)
news["abstract"].fillna(' ', inplace = True)
news["combined"] = news["category"] + " " + news["subcategory"] + " " + news["title"] + " " + news["abstract"]

In [17]:
click_size = []
non_click_size = []

for i in range(len(data)):
    click_size.append(data['clickList'][i].__len__())
    non_click_size.append(data['noClicksList'][i].__len__())

print(np.mean(click_size))
print(np.mean(non_click_size))

print(max(click_size))
print(np.max(non_click_size))

1.5057114643391838
35.72197623674067
35
297


In [18]:
impressions_count=0
click_count = 0
for i in range(len(data)):
    impressions_count+=data['impList'][i].__len__()
    click_count+=data['clickList'][i].__len__()
print(impressions_count)
print(click_count)

5843444
236344


#### EDA

In [19]:
news_list = news['itemId'].unique()
print(news_list.__len__())

from collections import defaultdict
d = defaultdict(int)


print(type(data['impList']))
# print(len(data['impList'].unique()))
# mini_count = 50

for i in range(len(data)):
    for j in data['impList'][i]:
        d[j]+=1

d_values = []
for i in d:
    d_values.append(d[i])
print('mean_clicks', np.mean(d_values))
print('length of dict d',len(d))

minimum_click_count = 50              
count_greater_than_mini = 0
for i in d_values:
    if(i>minimum_click_count):
        count_greater_than_mini +=1
print('count_greater_than_mini', count_greater_than_mini)

print(d)

for i in range(len(data)):                                       # Removing news articles which have very less clicks
    for j in data['impList'][i]:
        if(d[j]<minimum_click_count):
            data['impList'][i].remove(j)
    for j in data['clickList'][i]:
        if(d[j]<minimum_click_count):
            data['clickList'][i].remove(j)
    for j in data['noClicksList'][i]:
        if(d[j]<minimum_click_count):
            data['noClicksList'][i].remove(j)    

data.head()

51282
<class 'pandas.core.series.Series'>
mean_clicks 288.0246451104101
length of dict d 20288
count_greater_than_mini 4248
defaultdict(<class 'int'>, {'N35729': 15418, 'N55689': 18315, 'N14592': 13254, 'N17059': 8047, 'N20495': 3991, 'N20678': 7059, 'N22407': 4171, 'N33677': 5628, 'N39317': 6344, 'N42977': 9727, 'N58114': 8332, 'N6890': 2769, 'N7821': 6597, 'N10960': 11908, 'N11817': 13026, 'N12330': 1023, 'N13131': 1094, 'N16844': 4838, 'N23446': 15500, 'N23814': 3905, 'N23877': 11281, 'N29739': 4130, 'N35389': 3380, 'N36226': 12639, 'N37348': 1127, 'N3839': 30, 'N38779': 18101, 'N40109': 11319, 'N45509': 10361, 'N46821': 3999, 'N47098': 10819, 'N47346': 2948, 'N48017': 8436, 'N48225': 643, 'N48722': 65, 'N48875': 1845, 'N49712': 7683, 'N50014': 3981, 'N50592': 3020, 'N50872': 18702, 'N51570': 8404, 'N5364': 4909, 'N55555': 2977, 'N56711': 3098, 'N59495': 803, 'N59685': 1894, 'N60550': 6067, 'N64174': 6853, 'N8015': 4590, 'N27581': 8066, 'N33632': 1953, 'N49685': 7229, 'N10413': 515,

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,click,no_clicks,impList,clickList,noClicksList,viewHistoryList
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,N55689,N35729,"[N35729, N55689]",[N55689],[N35729],"[N55189, N42782, N34694, N45794, N18445, N6330..."
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,N17059,N14592 N20495 N20678 N22407 N33677 N39317 N429...,"[N14592, N17059, N20495, N20678, N22407, N3367...",[N17059],"[N14592, N20495, N20678, N22407, N33677, N3931...","[N31739, N6072, N63045, N23979, N35656, N43353..."
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,N23814,N10960 N11817 N12330 N13131 N16844 N23446 N238...,"[N10960, N11817, N12330, N13131, N16844, N2344...",[N23814],"[N10960, N11817, N12330, N13131, N16844, N2344...","[N10732, N25792, N7563, N21087, N41087, N5445,..."
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,N49685,N27581 N33632 N35729,"[N27581, N33632, N35729, N49685]",[N49685],"[N27581, N33632, N35729]","[N45729, N2203, N871, N53880, N41375, N43142, ..."
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,N8400,N10413 N11094 N11363 N11967 N12995 N13486 N160...,"[N10413, N11363, N12995, N13486, N16096, N1616...",[N8400],"[N10413, N11363, N12995, N13486, N16096, N1616...","[N10078, N56514, N14904, N33740]"


In [20]:
# from datetime import datetime, timedelta

# def convert2(date_time):
#   # The format
#     format = '%m/%d/%Y %I:%M%p'
#     datetime_str = datetime.strptime(date_time, format)
 
#     return datetime_str

# date_time = '12/4/2018 10:07AM'
# print(convert2(date_time))
# print(type(convert2(date_time)))

### Convert the time stamp strings to python datetime data type

In [21]:
from datetime import datetime, timedelta

for i in range(len(data)):
    data['timestamp'][i] = datetime.strptime(data['timestamp'][i],'%m/%d/%Y %H:%M:%S %p')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['timestamp'][i] = datetime.strptime(data['timestamp'][i],'%m/%d/%Y %H:%M:%S %p')


#### Assign publication time for the news articles 

In [22]:
max_date = datetime.strptime('11/11/2033 9:05:58 AM','%m/%d/%Y %H:%M:%S %p')

news_start_date = {}

for i in d:
    news_start_date[i] =max_date

for i in range(len(data)):
    for j in data['impList'][i]:
        news_start_date[j] = min(news_start_date[j],data['timestamp'][i])

In [23]:
data.head()

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,click,no_clicks,impList,clickList,noClicksList,viewHistoryList
0,1,U13740,2019-11-11 09:05:58,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,N55689,N35729,"[N35729, N55689]",[N55689],[N35729],"[N55189, N42782, N34694, N45794, N18445, N6330..."
1,2,U91836,2019-11-12 06:11:30,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,N17059,N14592 N20495 N20678 N22407 N33677 N39317 N429...,"[N14592, N17059, N20495, N20678, N22407, N3367...",[N17059],"[N14592, N20495, N20678, N22407, N33677, N3931...","[N31739, N6072, N63045, N23979, N35656, N43353..."
2,3,U73700,2019-11-14 07:01:48,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,N23814,N10960 N11817 N12330 N13131 N16844 N23446 N238...,"[N10960, N11817, N12330, N13131, N16844, N2344...",[N23814],"[N10960, N11817, N12330, N13131, N16844, N2344...","[N10732, N25792, N7563, N21087, N41087, N5445,..."
3,4,U34670,2019-11-11 05:28:05,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,N49685,N27581 N33632 N35729,"[N27581, N33632, N35729, N49685]",[N49685],"[N27581, N33632, N35729]","[N45729, N2203, N871, N53880, N41375, N43142, ..."
4,5,U8125,2019-11-12 04:11:21,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,N8400,N10413 N11094 N11363 N11967 N12995 N13486 N160...,"[N10413, N11363, N12995, N13486, N16096, N1616...",[N8400],"[N10413, N11363, N12995, N13486, N16096, N1616...","[N10078, N56514, N14904, N33740]"


### Trending

In [24]:
date_trending = '11/11/2019 9:05:58 AM'
input_date = datetime.strptime(date_trending,'%m/%d/%Y %H:%M:%S %p')
number_of_days = 1
begin_date = input_date - timedelta(days=number_of_days)
number_of_trending_articles = 25
print(input_date)

2019-11-11 09:05:58


In [25]:
clicks_till_date = defaultdict(int)

for i in range(len(data)):
    if(data['timestamp'][i]>begin_date and data['timestamp'][i]<input_date):                          # clicks between this period for a particular news article.
        for j in data['clickList'][i]:
            clicks_till_date[j] += 1

In [26]:
Trending_list = []

for i in news_start_date:
    if(news_start_date[i]>begin_date and news_start_date[i]<input_date):                                        
        Trending_list.append(i)

def sort_fun(num):
    return clicks_till_date[num]

Trending_list.sort(key = sort_fun)                                                                    # sorting the trending list according to their clicks count
Trending_list.reverse()

Trending_list = Trending_list[:number_of_trending_articles]
print(Trending_list)

['N54489', 'N53585', 'N23414', 'N13930', 'N57957', 'N46526', 'N57402', 'N30518', 'N39765', 'N19542', 'N46029', 'N41222', 'N59981', 'N6693', 'N36789', 'N42597', 'N8191', 'N50675', 'N11768', 'N14440', 'N62688', 'N53054', 'N58410', 'N6342', 'N37781']


### Trending Recommendations

In [27]:
for i in Trending_list:                                                                                 # display the news articls with top click counts
    print(news[news.itemId == i]['title'])

34634    Fast food cashier and manager fired for refusi...
Name: title, dtype: object
45974    Rip Taylor's Cause of Death Revealed, Memorial...
Name: title, dtype: object
41945    FAA threatened to ground 38 Southwest Airlines...
Name: title, dtype: object
36285    Fans are divided after Drake was revealed as a...
Name: title, dtype: object
45897    Dramatic Footage Shows Plane Skidding Off Runw...
Name: title, dtype: object
47957    Ilhan Omar blasts Pete King as an 'Islamophobe...
Name: title, dtype: object
45394    'Unthinkable': Accused killer of Georgia colle...
Name: title, dtype: object
41508    Anna Faris and Michael Barrett may have just c...
Name: title, dtype: object
50661    SEMA 2019 Photo Highlights From Las Vegas
Name: title, dtype: object
36807    This 'new' strain of HIV is actually a good thing
Name: title, dtype: object
31955    How Russia Meddles Abroad for Profit: Cash, Tr...
Name: title, dtype: object
51262    Evo Morales of Bolivia Accepts Asylum in Mexico
Name: