In [None]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import matplotlib.pylab as plt
import scipy.sparse as sparse

import warnings
warnings.filterwarnings("ignore")

In [None]:
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [None]:
# Load the two tab-separated inputs: one row per auction outcome and one row
# per recorded bid, then preview the first three rows of each.
df_outcomes = pd.read_csv('outcomes.tsv', sep='\t')
df_traces = pd.read_csv('traces.tsv', sep='\t')
print(df_outcomes.head(3))
print(df_traces.head(3))

   auction_id  product_id                                      item  \
0       86827    10009602  sony-ericsson-s500i-unlocked-mysterious-   
1       87964    10009881            psp-slim-lite-sony-piano-black   
2       87965    10009881            psp-slim-lite-sony-piano-black   

                                            desc  retail  price  finalprice  \
0  Sony Ericsson S500i Unlocked Mysterious Green  499.99  13.35       13.35   
1               PSP Slim & Lite Sony Piano Black  169.99  74.70       74.70   
2               PSP Slim & Lite Sony Piano Black  169.99  83.10       83.10   

   bidincrement  bidfee        winner  placedbids  freebids  \
0            15      75       Racer11          26         0   
1            15      75        Cemo23          65         0   
2            15      75  Jacobsonnich          94         0   

            endtime_str  flg_click_only  flg_beginnerauction  flg_fixedprice  \
0  19:52 PDT 09-16-2008               0                    0     

In [None]:
# Column / dtype / non-null summary of both raw frames.
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() only appended a stray "None" line after each summary.
df_outcomes.info()
df_traces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121419 entries, 0 to 121418
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   auction_id           121419 non-null  int64  
 1   product_id           121419 non-null  int64  
 2   item                 121419 non-null  object 
 3   desc                 121419 non-null  object 
 4   retail               121419 non-null  float64
 5   price                121419 non-null  float64
 6   finalprice           121419 non-null  float64
 7   bidincrement         121419 non-null  int64  
 8   bidfee               121419 non-null  int64  
 9   winner               121419 non-null  object 
 10  placedbids           121419 non-null  int64  
 11  freebids             121419 non-null  int64  
 12  endtime_str          121419 non-null  object 
 13  flg_click_only       121419 non-null  int64  
 14  flg_beginnerauction  121419 non-null  int64  
 15  flg_fixedprice   

In [None]:
# Column / dtype / non-null summary of the outcomes frame.
# NOTE(review): this repeats the df_outcomes summary already printed by the
# previous cell — consider removing one of the two.
df_outcomes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121419 entries, 0 to 121418
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   auction_id           121419 non-null  int64  
 1   product_id           121419 non-null  int64  
 2   item                 121419 non-null  object 
 3   desc                 121419 non-null  object 
 4   retail               121419 non-null  float64
 5   price                121419 non-null  float64
 6   finalprice           121419 non-null  float64
 7   bidincrement         121419 non-null  int64  
 8   bidfee               121419 non-null  int64  
 9   winner               121419 non-null  object 
 10  placedbids           121419 non-null  int64  
 11  freebids             121419 non-null  int64  
 12  endtime_str          121419 non-null  object 
 13  flg_click_only       121419 non-null  int64  
 14  flg_beginnerauction  121419 non-null  int64  
 15  flg_fixedprice   

In [None]:
# Attach the outcome columns of each auction to every bid recorded for it
# (inner join on the shared auction_id key), then preview the result.
df_merged = df_outcomes.merge(df_traces, how='inner', on='auction_id')
df_merged.head(3)

Unnamed: 0,auction_id,product_id,item,desc,retail,price,finalprice,bidincrement,bidfee,winner,...,bid_ct,bid_number,bid_user,bid_butler,bid_cp,bid_user_secs_added,bid_butler_secs_added,bid_infered,bid_group,bid_final
0,222387,10012342,guitar-hero-world-tour-guitar-kit-wii-,Guitar Hero World Tour Guitar Kit (Wii),64.12,18.12,18.12,12,60,Clubfan78,...,19913,1,Becker1990,0,12,0,0,0,0,0
1,222387,10012342,guitar-hero-world-tour-guitar-kit-wii-,Guitar Hero World Tour Guitar Kit (Wii),64.12,18.12,18.12,12,60,Clubfan78,...,13029,2,Gaynor66,0,24,4,0,0,1,0
2,222387,10012342,guitar-hero-world-tour-guitar-kit-wii-,Guitar Hero World Tour Guitar Kit (Wii),64.12,18.12,18.12,12,60,Clubfan78,...,12079,3,Jostrem16,0,36,4,0,0,2,0


In [None]:
# Column / dtype summary of the merged (outcome x bid) frame.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2049972 entries, 0 to 2049971
Data columns (total 28 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   auction_id             int64  
 1   product_id             int64  
 2   item                   object 
 3   desc                   object 
 4   retail                 float64
 5   price                  float64
 6   finalprice             float64
 7   bidincrement           int64  
 8   bidfee                 int64  
 9   winner                 object 
 10  placedbids             int64  
 11  freebids               int64  
 12  endtime_str            object 
 13  flg_click_only         int64  
 14  flg_beginnerauction    int64  
 15  flg_fixedprice         int64  
 16  flg_endprice           int64  
 17  bid_time               object 
 18  bid_ct                 int64  
 19  bid_number             int64  
 20  bid_user               object 
 21  bid_butler             int64  
 22  bid_cp            

In [None]:
# Keep only the identifier, text and bidder columns of the merged frame.
# Selecting the columns to keep (instead of dropping the other 23 with
# inplace=True) makes this cell idempotent: the in-place drop raised KeyError
# when the cell was run a second time because the columns were already gone.
df_merged = df_merged[['auction_id', 'product_id', 'item', 'desc', 'bid_user']]
df_merged.head()

Unnamed: 0,auction_id,product_id,item,desc,bid_user
0,222387,10012342,guitar-hero-world-tour-guitar-kit-wii-,Guitar Hero World Tour Guitar Kit (Wii),Becker1990
1,222387,10012342,guitar-hero-world-tour-guitar-kit-wii-,Guitar Hero World Tour Guitar Kit (Wii),Gaynor66
2,222387,10012342,guitar-hero-world-tour-guitar-kit-wii-,Guitar Hero World Tour Guitar Kit (Wii),Jostrem16
3,222387,10012342,guitar-hero-world-tour-guitar-kit-wii-,Guitar Hero World Tour Guitar Kit (Wii),Clubfan78
4,222387,10012342,guitar-hero-world-tour-guitar-kit-wii-,Guitar Hero World Tour Guitar Kit (Wii),Momstired2


In [None]:
# (rows, columns) of the trimmed merged frame.
df_merged.shape

(2049972, 5)

In [None]:
# Fetch the NLTK data packages this notebook requests; each call is a no-op
# message when the package is already present locally. The cell's displayed
# value is the return value of the last call.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Shared text-processing resources used by normalize_document:
# the English stop-word list and a word/punctuation tokenizer.
stop_words = stopwords.words('english')
wpt = nltk.tokenize.WordPunctTokenizer()

In [None]:
def normalize_document(doc):
  """Normalise one document into a lower-case, stop-word-free string.

  Processing order: remove URL-like tokens, replace every non-word character
  with a space, lower-case, drop single letters that stand alone between
  whitespace (or at the very start), collapse whitespace, tokenize with the
  module-level ``wpt`` tokenizer and discard tokens found in the module-level
  ``stop_words``.

  Args:
    doc: Any value; it is converted with ``str()`` before processing.

  Returns:
    The remaining tokens joined by single spaces.
  """
  text=str(doc)
  # URLs have to be removed BEFORE punctuation is stripped: once ':' '/' '.'
  # have become spaces the URL is already split into separate words and
  # 'http\S+' would only delete the literal "http" token, leaving the host
  # and path fragments in the text.
  text=re.sub(r'http\S+',' ',text)
  text=re.sub(r'\W',' ',text)
  text=text.lower()
  # Single letters surrounded by whitespace, then one at the start of the text.
  text=re.sub(r'\s+[a-z]\s+',' ',text)
  text=re.sub(r'^[a-z]\s+',' ',text)
  text=re.sub(r'\s+',' ',text)
  tokens=wpt.tokenize(text)
  filtered_tokens=[token for token in tokens if token not in stop_words]
  return ' '.join(filtered_tokens)

In [None]:
# Raw text per auction: the item slug followed by the human-readable
# description, separated by a single space.
df_outcomes['clean_text'] = df_outcomes['item'].str.cat(df_outcomes['desc'], sep=" ")
df_outcomes.head(3)

Unnamed: 0,auction_id,product_id,item,desc,retail,price,finalprice,bidincrement,bidfee,winner,placedbids,freebids,endtime_str,flg_click_only,flg_beginnerauction,flg_fixedprice,flg_endprice,clean_text
0,86827,10009602,sony-ericsson-s500i-unlocked-mysterious-,Sony Ericsson S500i Unlocked Mysterious Green,499.99,13.35,13.35,15,75,Racer11,26,0,19:52 PDT 09-16-2008,0,0,0,0,sony-ericsson-s500i-unlocked-mysterious- Sony ...
1,87964,10009881,psp-slim-lite-sony-piano-black,PSP Slim & Lite Sony Piano Black,169.99,74.7,74.7,15,75,Cemo23,65,0,11:17 PDT 08-28-2008,0,0,0,0,psp-slim-lite-sony-piano-black PSP Slim & Lite...
2,87965,10009881,psp-slim-lite-sony-piano-black,PSP Slim & Lite Sony Piano Black,169.99,83.1,83.1,15,75,Jacobsonnich,94,0,22:52 PDT 11-07-2008,0,1,0,0,psp-slim-lite-sony-piano-black PSP Slim & Lite...


In [None]:
# Build the corpus frame: discard every outcome column except clean_text and
# run each text through normalize_document.
cleaned_corpus = df_outcomes.drop(
    columns=[
        'auction_id', 'product_id', 'item', 'desc',
        'retail', 'price', 'finalprice',
        'bidincrement', 'bidfee', 'winner', 'placedbids', 'freebids',
        'endtime_str',
        'flg_click_only', 'flg_beginnerauction', 'flg_fixedprice', 'flg_endprice',
    ]
).assign(clean_text=lambda frame: frame['clean_text'].apply(normalize_document))
cleaned_corpus

Unnamed: 0,clean_text
0,sony ericsson s500i unlocked mysterious sony e...
1,psp slim lite sony piano black psp slim lite s...
2,psp slim lite sony piano black psp slim lite s...
3,sony ericsson s500i unlocked mysterious sony e...
4,sony ericsson s500i unlocked mysterious sony e...
...,...
121414,300 bids voucher 300 bids voucher
121415,300 bids voucher 300 bids voucher
121416,300 bids voucher 300 bids voucher
121417,300 bids voucher 300 bids voucher


In [None]:
# Guard against missing text so the vectorizer only ever sees strings.
cleaned_corpus['clean_text'] = cleaned_corpus['clean_text'].fillna('')

# Learn the vocabulary / IDF weights and vectorize the corpus in one pass;
# scikit-learn's built-in English stop-word list is applied on top of the
# filtering already done during normalisation.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(cleaned_corpus['clean_text'])

# (documents, vocabulary terms)
tfidf_matrix.shape

(121419, 3424)

In [None]:
# Peek at ten entries of the fitted vocabulary (positions 3000-3009).
tfidf.get_feature_names_out()[3000:3010]

array(['tao', 'tas4511uc', 'tassimo', 'tawny', 'taylormade', 'tb', 'tc',
       'tcd648250b', 'tcd658000', 'tea'], dtype=object)

In [None]:
# Pairwise dot products between all TF-IDF rows; with the vectorizer's
# default L2 row normalisation this is presumably the cosine similarity.
# NOTE(review): tfidf_matrix has 121,419 rows (shape printed above), and
# linear_kernel returns a dense array by default — a 121419 x 121419 float64
# matrix is on the order of 100 GB. Confirm this fits in memory, or compute
# similarities on distinct items / one query row at a time instead.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# (rows, columns) of the pairwise similarity matrix.
cosine_sim.shape

In [None]:
# Lookup table: item slug -> row label of its first occurrence in df_outcomes.
# The Series data must be the frame's index (1-D row labels); passing the
# whole DataFrame is not valid 1-D data for pd.Series. Duplicates are removed
# on the item labels rather than on the values, because the row labels are
# already unique while the same item appears in many auctions.
indices = pd.Series(df_outcomes.index, index=df_outcomes['item'])
indices = indices[~indices.index.duplicated(keep='first')]