### Start-up -- **Load saved (and preprocessed) data from pickle files.**

In [59]:
#Import libraries.
import pandas as pd
import numpy as np
import math
import csv
from collections import Counter
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### 1. Read saved (serialized) data

In [2]:
PATH = '../input/csc-575-hw5-winter-2024/'
#Read train and test data.

trainData = pd.read_pickle(f'{PATH}train_x.pkl')
trainTarget = pd.read_pickle(f'{PATH}train_y.pkl')
testData = pd.read_pickle(f'{PATH}test.pkl')

### Train Data - train_x

In [3]:
#One product matches against different searches.
#Product title, description, attribute are originally string, applied stemming and tokenization.
trainData.head(5)

Unnamed: 0,id,product_uid,product_title,search_term,product_description,attributes
0,2,100001,"[simpson, strongti, 12gaug, angl]","[angl, bracket]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
1,3,100001,"[simpson, strongti, 12gaug, angl]","[l, bracket]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
2,9,100002,"[behr, premium, textur, deckov, 1gal, sc141, t...",[deck],"[behr, premium, textur, deckov, innov, solid, ...","[applic, method, brushrollerspray, assembl, de..."
3,16,100005,"[delta, vero, 1handl, shower, faucet, trim, ki...","[rain, shower, head]","[updat, bathroom, delta, vero, singlehandl, sh...","[bath, faucet, type, combo, tub, shower, built..."
4,17,100005,"[delta, vero, 1handl, shower, faucet, trim, ki...","[shower, faucet]","[updat, bathroom, delta, vero, singlehandl, sh...","[bath, faucet, type, combo, tub, shower, built..."


### Train Data - train_y

In [4]:
#Relevance score for each search - item combo.
trainTarget.head(5)

0    3.00
1    2.50
2    3.00
3    2.33
4    2.67
Name: relevance, dtype: float64

### Test Data - test_x

In [5]:
#Same format as train data x.
testData.head(5)

Unnamed: 0,id,product_uid,product_title,search_term,product_description,attributes
0,4,100001,"[simpson, strongti, 12gaug, angl]","[metal, l, bracket]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
1,5,100001,"[simpson, strongti, 12gaug, angl]","[simpson, sku, abl]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
2,6,100001,"[simpson, strongti, 12gaug, angl]","[simpson, strong, tie]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
3,7,100001,"[simpson, strongti, 12gaug, angl]","[simpson, strong, tie, hcc668]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
4,10,100003,"[sterl, ensembl, 3314, x, 60, x, 7514, bath, s...","[bath, shower, kit]","[classic, architectur, meet, contemporari, des...","[builtin, flang, ye, bullet01, slightli, narro..."


## ---Start the homework--- 
### Goal: Create prediction system on relevance score.

# **Train Data**

### Get dictionary word count of the columns.

In [6]:
trainData_dict = trainData

convertColumns = ['product_title', 'search_term', 'product_description', 'attributes']

for column in convertColumns:
    #Convert each cell ito a dictionary of word counts.
    trainData_dict[column + '_wcDict'] = trainData_dict[column].apply(lambda x: dict(Counter(x)))

#Drop original columns.
trainData_dict = trainData_dict.drop(columns=['id', 'product_title', 'search_term', 'product_description', 'attributes'])
trainData_dict.head(5)

Unnamed: 0,product_uid,product_title_wcDict,search_term_wcDict,product_description_wcDict,attributes_wcDict
0,100001,"{'simpson': 1, 'strongti': 1, '12gaug': 1, 'an...","{'angl': 1, 'bracket': 1}","{'angl': 3, 'make': 1, 'joint': 2, 'stronger':...","{'bullet01': 1, 'versatil': 1, 'connector': 1,..."
1,100001,"{'simpson': 1, 'strongti': 1, '12gaug': 1, 'an...","{'l': 1, 'bracket': 1}","{'angl': 3, 'make': 1, 'joint': 2, 'stronger':...","{'bullet01': 1, 'versatil': 1, 'connector': 1,..."
2,100002,"{'behr': 1, 'premium': 1, 'textur': 1, 'deckov...",{'deck': 1},"{'behr': 3, 'premium': 1, 'textur': 2, 'deckov...","{'applic': 1, 'method': 1, 'brushrollerspray':..."
3,100005,"{'delta': 1, 'vero': 1, '1handl': 1, 'shower':...","{'rain': 1, 'shower': 1, 'head': 1}","{'updat': 1, 'bathroom': 1, 'delta': 1, 'vero'...","{'bath': 2, 'faucet': 6, 'type': 4, 'combo': 1..."
4,100005,"{'delta': 1, 'vero': 1, '1handl': 1, 'shower':...","{'shower': 1, 'faucet': 1}","{'updat': 1, 'bathroom': 1, 'delta': 1, 'vero'...","{'bath': 2, 'faucet': 6, 'type': 4, 'combo': 1..."


## PRODUCT DATA
### Check if one product has different title? --> IT DOES.

In [7]:
counter = 0
#Group the dataframe by 'product_uid' and iterate through each group.
for name, group in trainData_dict.groupby('product_uid'):
    #Check if all rows in the group have the same dictionary length and the same keys and values.
    wc_dicts = group['product_title_wcDict'].tolist()
    if all(wc_dict == wc_dicts[0] for wc_dict in wc_dicts):
        counter += 0
    else:
        print(f"Not all {'product_title_wcDict'} are the same for product_uid {name}")

Not all product_title_wcDict are the same for product_uid 100540
Not all product_title_wcDict are the same for product_uid 101052
Not all product_title_wcDict are the same for product_uid 101317
Not all product_title_wcDict are the same for product_uid 101549
Not all product_title_wcDict are the same for product_uid 102458
Not all product_title_wcDict are the same for product_uid 105240
Not all product_title_wcDict are the same for product_uid 122733
Not all product_title_wcDict are the same for product_uid 126511
Not all product_title_wcDict are the same for product_uid 131132
Not all product_title_wcDict are the same for product_uid 136763
Not all product_title_wcDict are the same for product_uid 137331
Not all product_title_wcDict are the same for product_uid 138513


In [9]:
#Original train data.
filteredRows = trainData[trainData['product_uid'] == 101052]
for wc_dict in filteredRows['product_title']:
    print(wc_dict)

['1light', 'oil', 'rub', 'bronz', 'adjust', 'mini', 'pendant']
['1light', 'oilrub', 'bronz', 'adjust', 'mini', 'pendant']
['1light', 'oilrub', 'bronz', 'adjust', 'mini', 'pendant']


In [11]:
#Dictionary train data.
filteredRows2 = trainData_dict[trainData_dict['product_uid'] == 101052]
for wc_dict in filteredRows2['product_title_wcDict']:
    print(wc_dict)

{'1light': 1, 'oil': 1, 'rub': 1, 'bronz': 1, 'adjust': 1, 'mini': 1, 'pendant': 1}
{'1light': 1, 'oilrub': 1, 'bronz': 1, 'adjust': 1, 'mini': 1, 'pendant': 1}
{'1light': 1, 'oilrub': 1, 'bronz': 1, 'adjust': 1, 'mini': 1, 'pendant': 1}


### Convert PRODUCT columns to inverted index dictionaries. 
#### Note: There are same product with multiple rows in the train data. 

In [13]:
def calculateIDF(productFrequency, total_uniqueProduct):
    '''Calculate IDF for for inverted index.'''
    
    #Using log base 10.
    return {term: math.log10(total_uniqueProduct / frequency) for term, frequency in productFrequency.items()}

def generate_invIndex(column, data):
    '''Generate inverted index dictionary for input column in train Data.'''
    
    #N is the total number of unique products in the corpus.
    total_uniqueProduct = len(data['product_uid'].unique())  
    print(f'Total number of unique product: {total_uniqueProduct}')
    
    invertedIndex = {}
    productFrequency = Counter()
       
    for index, row in data.iterrows():
        #Get the TOTAL occurences of each term within the column. 
        terms = set(row[column].keys())  #Use terms of the wc dictionary as keys.
        #A term in multiple rows within the same product ID.
        #Count for that term will be incremented accordingly. 
        productFrequency.update(terms)
    idfTerm = calculateIDF(productFrequency, total_uniqueProduct)
    
    #Dictionary for inverted index {term as key | tuple(idf, list of postings) as value}.
    #idf: of a term in the entire respective column corpus. JUST THE IDF!!! -- 
    #List of postings: element is individual dictionary {'product_uid' as keys | raw term frequency as value}. 
    for term, idf in idfTerm.items():
        invertedIndex[term] = (idf, {})
        
    #Create posting list for each term.
    for index, row in data.iterrows():
        productID = row['product_uid']
        termFrequency = row.get(column, {})
        for term in sorted(termFrequency.keys()):
            if productID in invertedIndex[term][1]:
                invertedIndex[term][1][productID] += termFrequency[term]
            else:
                invertedIndex[term][1][productID] = termFrequency[term]
    return invertedIndex

#Generate inverted index for each product column.
title_invIndex_train = generate_invIndex('product_title_wcDict', trainData_dict)
descr_invIndex_train = generate_invIndex('product_description_wcDict', trainData_dict)
attributes_invIndex_train = generate_invIndex('attributes_wcDict', trainData_dict)

Total number of unique product: 54667
Total number of unique product: 54667
Total number of unique product: 54667


In [16]:
#Get inverted index of the word '12aug' with respect to Title column.
print("Product Title Inverted Index - train data:")
print(title_invIndex_train['12gaug'])

Product Title Inverted Index - train data:
(2.9453335519587927, {100001: 2, 100664: 1, 100995: 6, 101245: 3, 102436: 2, 103250: 1, 103421: 1, 106751: 1, 107640: 2, 109015: 3, 109059: 1, 109196: 1, 112832: 2, 114423: 4, 117545: 1, 122392: 1, 129059: 3, 130474: 2, 130561: 1, 131312: 2, 131566: 1, 133917: 1, 134551: 1, 137719: 1, 137817: 2, 142111: 1, 143504: 1, 144302: 1, 146362: 2, 149416: 2, 152794: 2, 160934: 1, 163592: 1, 173816: 1, 188206: 1, 189166: 1, 192376: 1, 203565: 1})


In [18]:
#Get inverted index of the word '12aug' with respect to Description column.
print("Product Description Inverted Index - train data:")
print(descr_invIndex_train['12gaug'])

Product Description Inverted Index - train data:
(2.5645389730447725, {100001: 2, 100102: 2, 100398: 1, 100664: 1, 100842: 6, 100856: 1, 100976: 1, 100995: 6, 101070: 6, 102083: 3, 103250: 1, 103421: 1, 104919: 1, 105314: 1, 106344: 1, 106751: 1, 107078: 2, 107640: 2, 107740: 6, 108564: 2, 108957: 1, 109059: 1, 109760: 2, 110363: 1, 110978: 2, 112582: 2, 112832: 2, 112889: 1, 113278: 1, 114423: 4, 114970: 1, 116356: 1, 117384: 1, 117485: 4, 117522: 2, 117545: 1, 119222: 4, 119637: 2, 119694: 2, 120428: 4, 121229: 2, 123547: 1, 123576: 1, 123879: 2, 125728: 2, 127165: 1, 127729: 2, 127813: 1, 127978: 2, 128127: 1, 129059: 3, 130474: 2, 130561: 1, 131312: 2, 131566: 1, 132504: 1, 133917: 1, 134889: 1, 136031: 1, 137498: 1, 137539: 1, 137719: 1, 139264: 2, 139579: 1, 141160: 2, 141529: 2, 143688: 1, 144302: 1, 145654: 1, 145677: 2, 146362: 2, 146823: 2, 148201: 1, 148411: 1, 148651: 1, 149416: 2, 151534: 2, 152794: 6, 153932: 1, 154149: 1, 154417: 1, 156529: 1, 156701: 1, 163301: 1, 16677

In [22]:
#Get inverted index of the word 'bullet01' with respect to Attributes column.
#Notice:'bullet01' has negative idf - tf term occurance > N document number. 
print("Product Attributes Inverted Index - train data:")
print(attributes_invIndex_train['12gaug'])

Product Attributes Inverted Index - train data:
(2.684646797973627, {100001: 2, 100102: 2, 100664: 1, 100842: 3, 100856: 1, 100995: 6, 101070: 6, 102005: 1, 102083: 2, 103250: 1, 103421: 1, 104919: 1, 106344: 1, 107078: 2, 107517: 1, 107640: 2, 107740: 3, 107766: 2, 108564: 2, 112582: 2, 112832: 2, 112889: 1, 113278: 1, 114423: 4, 114970: 1, 117485: 4, 117522: 2, 119222: 4, 119637: 1, 121229: 2, 123547: 1, 125728: 2, 127165: 1, 127978: 2, 129059: 3, 130474: 2, 130561: 1, 131312: 2, 137498: 1, 139052: 2, 141529: 1, 144415: 2, 145654: 1, 146362: 2, 146799: 1, 146823: 2, 149416: 2, 152478: 2, 152794: 4, 154149: 1, 156405: 1, 156701: 1, 166771: 1, 167100: 2, 167253: 1, 173292: 1, 173775: 1, 181446: 1, 182685: 1, 187058: 1, 187068: 1, 188114: 1, 189166: 1, 192376: 1, 193877: 1, 194150: 1, 197617: 1, 198770: 1, 202836: 1})


### Compute component vector length for each PRODUCT column.

In [23]:
def computeVectorLength_product(invIndex):
    '''Compute the component vector length for a product column.'''
    vectorLengths = {}
    
    for term,(idfValue, productID_frequency) in invIndex.items():
        for productID, tf in productID_frequency.items():
            if productID not in vectorLengths:
                vectorLengths[productID] = 0
                
            #Accumulate square of individual tf-idf.
            vectorLengths[productID] += math.pow((idfValue * tf), 2.0)
    
    #Sqrt of sum value of each productID.
    for productID in vectorLengths:
        vectorLengths[productID] = math.sqrt(vectorLengths[productID])
    
    return vectorLengths

#Get vector length for each product column.
title_vecLength_train = computeVectorLength_product(title_invIndex_train)
descr_vecLength_train = computeVectorLength_product(descr_invIndex_train)
attributes_vecLength_train = computeVectorLength_product(attributes_invIndex_train)

In [24]:
print("Product Title Vector Length - train data:")
print(title_vecLength_train[100001])

Product Title Vector Length - train data:
9.716528834416671


## SEARCH DATA
### Convert SEARCH column to inverted index dictionary.

In [26]:
def tf_idfSearch(product_invertedIndex, data):
    '''Compute the TF-IDF values for a search term dictionary with respect to a specific product column.
    Returns: A dictionary containing TF-IDF values for each term in the search term dictionary.'''
    
    #Dictionary accumulate term frequency across all rows. 
    tfDict = {}  
    
    #Dictionary for tf-idf search {row index as key (start as 0) | dictionary(term, tf-idf) as value}.
    tf_idfSearch = {}
    
    for index, row in data.iterrows():
        #Retrieve the search term dictionary and productID for the current row.
        search_termDict = row['search_term_wcDict']
        productID = row['product_uid']

        #Accumulate term frequency for each term across all rows in the 'search_term_wcDict' column.
        for term, raw_tf in search_termDict.items():
            tfDict[term] = tfDict.get(term, 0) + raw_tf
        
        #TF-IDF dictionary for each search row.
        tf_idfRow = {}

        #Iterate over each term in the search term dictionary.
        for term, raw_tf in search_termDict.items():
            
            #Check if the term exists in the inverted index.
            if term in product_invertedIndex:
                
                #Retrieve the respective IDF and posting list.
                termIDF, postingList = product_invertedIndex[term]

                #Check if the productID is in the posting list.
                if productID in postingList:
                    #Retrieve the count of the productID in the posting list.
                    product_termCount = postingList[productID]

                    #Calculate the weight of the term in the search term.
                    weight = tfDict[term] * termIDF

                    #Compute TF-IDF value for the term and store it in tf_idfRow dictionary.
                    tf_idfRow[term] = weight * termIDF * product_termCount

        #Store the TF-IDF result for the current row.
        tf_idfSearch[index] = tf_idfRow
    
    return tf_idfSearch

#Get search tf-idf with respect to each product column.
title_tf_idfSearch_train = tf_idfSearch(title_invIndex_train, trainData_dict)
descr_tf_idfSearch_train = tf_idfSearch(descr_invIndex_train, trainData_dict)
attributes_tf_idfSearch_train = tf_idfSearch(attributes_invIndex_train, trainData_dict)

### Compute SEARCH score.

In [27]:
def product_searchScore(tf_idfSearch):
    '''Compute search score for each search against each product column.'''
    
    product_searchScore = {}
    for index, term_tf_idf in tf_idfSearch.items():
        #If empty dictionary - no mutual terms. Score is 0.  
        if len(term_tf_idf) == 0:
            product_searchScore[index] = 0
        else:
            score = sum(term_tf_idf.values())
            product_searchScore[index] = score
    
    return product_searchScore                

#Get search score tf-idf with respect to each product column.
title_searchScore_train = product_searchScore(title_tf_idfSearch_train)
descr_searchScore_train = product_searchScore(descr_tf_idfSearch_train)
attributes_searchScore_train = product_searchScore(attributes_tf_idfSearch_train)

In [30]:
print("Product Title Search score - train data:")
title_searchScore_train[74066]

Product Title Search score - train data:


1225.6014493684163

### Compute component vector length of SEARCH column.

In [31]:
def computeVectorLength_search(tf_idfSearch):
    '''Compute the component vector length for a column.'''
    
    vectorLengths = {}
    
    for index, term_tf_idf in tf_idfSearch.items():
        search_tf_idfSq = 0.0
        
        #Iterate over each term in the query and calculate the squared sum
        for term, tf_idfTerm in term_tf_idf.items():
            search_tf_idfSq += math.pow(tf_idfTerm, 2.0)
        #Take the square root of the squared sum
        vectorLengths[index] = math.sqrt(search_tf_idfSq)

    return vectorLengths

#Get vector length of each tf-idf search, respect to each product column.
title_search_vecLength_train = computeVectorLength_search(title_tf_idfSearch_train)
descr_search_vecLength_train = computeVectorLength_search(descr_tf_idfSearch_train)
attributes_search_vecLength_train = computeVectorLength_search(attributes_tf_idfSearch_train)

In [32]:
#Make sure dictionary generated properly, len = total number of rows. 
len(title_search_vecLength_train)

74067

## Similarities.
### Compute Cosine similarity.

In [33]:
def cosineSimilarity(search_vecLength, product_vecLength,tf_idf_searchScore, data):
    '''Compute Cosine similarity between search and a product column.'''
    
    cosineSim_scoreDict = {}
    #Iterate over each query in tf_idfQueries.
    for index, product_search_score in tf_idf_searchScore.items(): 
        
        productID = data.loc[index, 'product_uid']
        productLength = product_vecLength.get(productID, 0.0)
        
        searchLength = search_vecLength[index]
        
        if productLength > 0 and searchLength > 0:
            #Cosine similarity score.
            cosineSim = product_search_score / (productLength * searchLength)
            cosineSim_scoreDict[index] = cosineSim
        else:
            cosineSim_scoreDict[index] = 0.0
    return cosineSim_scoreDict
            
#Get cosine similarity of each tf-idf search with each product column.
title_cosineSim_train = cosineSimilarity(title_search_vecLength_train, title_vecLength_train,title_searchScore_train, trainData_dict)
descr_cosineSim_train = cosineSimilarity(descr_search_vecLength_train, descr_vecLength_train, descr_searchScore_train, trainData_dict)
attributes_cosineSim_train = cosineSimilarity(attributes_search_vecLength_train,attributes_vecLength_train, attributes_searchScore_train, trainData_dict)

### Compute Jaccard similarity.

In [34]:
def jaccardSimilarity(productColumn, data):
    '''Compute Jaccard Similarity between search and a product column.'''
    
    jaccard_scoreDict = {}

    for index, row in data.iterrows():
        productTerms = set(row[productColumn].keys())
        searchTerms = set(row['search_term_wcDict'].keys())

        #Calculate Jaccard Similarity. 
        intersectionSize = len(searchTerms.intersection(productTerms))
        unionSize = len(searchTerms.union(productTerms))

        if unionSize > 0:
            jaccardScore = intersectionSize / unionSize
            jaccard_scoreDict[index] = jaccardScore
        else:
            jaccard_scoreDict[index] = 0.0

    return jaccard_scoreDict


#Get jaccard similarity of each tf-idf search with each product column.
title_jaccardSim_train = jaccardSimilarity('product_title_wcDict', trainData_dict)
descr_jaccardSim_train = jaccardSimilarity('product_description_wcDict', trainData_dict)
attributes_jaccardSim_train = jaccardSimilarity('attributes_wcDict', trainData_dict)

## Newly constructed data frame - train data.
### Mapping from trainData_dict.

In [36]:
#Create a DataFrame using trainData_dict as a base
trainData_Sim = trainData_dict.copy()

#Add the cosine similarity scores as new columns
trainData_Sim['title_cosineSim'] = trainData_Sim.index.map(title_cosineSim_train)
trainData_Sim['descr_cosineSim'] = trainData_Sim.index.map(descr_cosineSim_train)
trainData_Sim['attributes_cosineSim'] = trainData_Sim.index.map(attributes_cosineSim_train)

trainData_Sim['title_jaccardSim'] = trainData_Sim.index.map(title_cosineSim_train)
trainData_Sim['descr_jaccardSim'] = trainData_Sim.index.map(descr_cosineSim_train)
trainData_Sim['attributes_jaccardSim'] = trainData_Sim.index.map(attributes_cosineSim_train)

#Add Target column.
trainData_Sim = pd.concat([trainData_Sim, trainTarget], axis=1)

#Drop oroginal columns.
trainData_Sim = trainData_Sim.drop(columns=['product_uid', 'product_title_wcDict', 'search_term_wcDict', 'product_description_wcDict', 'attributes_wcDict'])

trainData_Sim.head(5)

Unnamed: 0,title_cosineSim,descr_cosineSim,attributes_cosineSim,title_jaccardSim,descr_jaccardSim,attributes_jaccardSim,relevance
0,0.102917,0.027439,0.03946,0.102917,0.027439,0.03946,3.0
1,0.0,0.0,0.0,0.0,0.0,0.0,2.5
2,0.0,0.051113,0.047228,0.0,0.051113,0.047228,3.0
3,0.083094,0.032615,0.037649,0.083094,0.032615,0.037649,2.33
4,0.111781,0.04263,0.037535,0.111781,0.04263,0.037535,2.67


## Train BASIC regression model. 

In [74]:
trainPredictors = trainData_Sim[['title_cosineSim', 'descr_cosineSim', 'attributes_cosineSim',
               'title_jaccardSim', 'descr_jaccardSim', 'attributes_jaccardSim']]

#Extract target variable
targetTrain = trainData_Sim['relevance']

#Initialize and train a linear regression model
model = LinearRegression()
model.fit(trainPredictors, targetTrain)

#Make predictions on the entire dataset
predictionsTrain = model.predict(trainPredictors)

#Evaluate the model
mse = mean_squared_error(targetTrain, predictionsTrain)
rmse = np.sqrt(mse)
r_squared = r2_score(targetTrain, predictionsTrain)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r_squared}")

Mean Squared Error: 0.2730131356531418
Root Mean Squared Error: 0.5225065891002159
R-squared: 0.0425127446865714


### Tune & Enhance Regression Model.

In [75]:
X_train = trainData_Sim.drop('relevance', axis=1)
y_train = trainData_Sim['relevance']

### Linear Regression, Random Forest Regressor, XGBoost, and Support Vector Regressor.

In [76]:
#Commenting out XGBoost regressor model and SVR since they take too long and do not provide a significant increase in RMSE on train data.
#Define models
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest Regressor', RandomForestRegressor()) #,
    #('XGBoost', XGBRegressor()),
    #('Support Vector Regressor', SVR())
]

### Define parameter grids for each model.

In [77]:
param_grids = [
    {},  #No hyperparameter for Linear Regression.
    {'n_estimators': [10, 50, 100], 'max_depth': [None, 5, 10]}   # Random Forest   
    #Running XG Boost takes 2 hours
    # {'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.3]}   # XGBoost
    
    #Running SVR takes 4 hours 
    #{'kernel': ['linear', 'poly', 'rbf'], 'C': [0.1, 1, 10]}# SVR
]

In [78]:
best_model = None
best_mse = float('inf')

#Performing grid search for each model
for (name, tuneModel), param_grid in zip(models, param_grids):
    print(f"Grid search for {name}...")
    grid_search = GridSearchCV(tuneModel, param_grid, cv=3, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    
    #Get best model
    if -grid_search.best_score_ < best_mse:
        best_model = grid_search.best_estimator_
        best_mse = -grid_search.best_score_

    print(f"Best parameters found: {grid_search.best_params_}")
    print(f"Best mean squared error: {-grid_search.best_score_}")
    print()

print(f"Best model: {best_model}")
print(f"Best mean squared error: {best_mse}")

Grid search for Linear Regression...
Best parameters found: {}
Best mean squared error: 0.2870134145475407

Grid search for Random Forest Regressor...
Best parameters found: {'max_depth': 5, 'n_estimators': 50}
Best mean squared error: 0.2780781233810384

Best model: RandomForestRegressor(max_depth=5, n_estimators=50)
Best mean squared error: 0.2780781233810384


# **Test Data**
### ---process Test Set--- EXACTLY SAME as trainData

In [27]:
#112,067 rows
testData.head(5)

Unnamed: 0,id,product_uid,product_title,search_term,product_description,attributes
0,4,100001,"[simpson, strongti, 12gaug, angl]","[metal, l, bracket]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
1,5,100001,"[simpson, strongti, 12gaug, angl]","[simpson, sku, abl]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
2,6,100001,"[simpson, strongti, 12gaug, angl]","[simpson, strong, tie]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
3,7,100001,"[simpson, strongti, 12gaug, angl]","[simpson, strong, tie, hcc668]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
4,10,100003,"[sterl, ensembl, 3314, x, 60, x, 7514, bath, s...","[bath, shower, kit]","[classic, architectur, meet, contemporari, des...","[builtin, flang, ye, bullet01, slightli, narro..."


## PRODUCT DATA
### Convert to term frequency dictionary.

In [38]:
testData_dict = testData

convertColumns = ['product_title', 'search_term', 'product_description', 'attributes']

for column in convertColumns:
    #Convert each cell ito a dictionary of word counts.
    testData_dict[column + '_wcDict'] = testData_dict[column].apply(lambda x: dict(Counter(x)))

#Drop oroginal columns.
testData_dict = testData_dict.drop(columns=['id', 'product_title', 'search_term', 'product_description', 'attributes'])
testData_dict.head(5)

Unnamed: 0,product_uid,product_title_wcDict,search_term_wcDict,product_description_wcDict,attributes_wcDict
0,100001,"{'simpson': 1, 'strongti': 1, '12gaug': 1, 'an...","{'metal': 1, 'l': 1, 'bracket': 1}","{'angl': 3, 'make': 1, 'joint': 2, 'stronger':...","{'bullet01': 1, 'versatil': 1, 'connector': 1,..."
1,100001,"{'simpson': 1, 'strongti': 1, '12gaug': 1, 'an...","{'simpson': 1, 'sku': 1, 'abl': 1}","{'angl': 3, 'make': 1, 'joint': 2, 'stronger':...","{'bullet01': 1, 'versatil': 1, 'connector': 1,..."
2,100001,"{'simpson': 1, 'strongti': 1, '12gaug': 1, 'an...","{'simpson': 1, 'strong': 1, 'tie': 1}","{'angl': 3, 'make': 1, 'joint': 2, 'stronger':...","{'bullet01': 1, 'versatil': 1, 'connector': 1,..."
3,100001,"{'simpson': 1, 'strongti': 1, '12gaug': 1, 'an...","{'simpson': 1, 'strong': 1, 'tie': 1, 'hcc668'...","{'angl': 3, 'make': 1, 'joint': 2, 'stronger':...","{'bullet01': 1, 'versatil': 1, 'connector': 1,..."
4,100003,"{'sterl': 1, 'ensembl': 1, '3314': 1, 'x': 2, ...","{'bath': 1, 'shower': 1, 'kit': 1}","{'classic': 1, 'architectur': 1, 'meet': 1, 'c...","{'builtin': 1, 'flang': 1, 'ye': 1, 'bullet01'..."


### Convert PRODUCT columns to inverted index dictionaries. 

In [39]:
#Generate inverted index for each product column.
title_invIndex_test = generate_invIndex('product_title_wcDict', testData_dict)
descr_invIndex_test = generate_invIndex('product_description_wcDict', testData_dict)
attributes_invIndex_test = generate_invIndex('attributes_wcDict', testData_dict)

Total number of unique product: 74909
Total number of unique product: 74909
Total number of unique product: 74909


### Compute component vector length of each PRODUCT column.

In [40]:
#Get vector length for each product column.
title_vecLength_test = computeVectorLength_product(title_invIndex_test)
descr_vecLength_test = computeVectorLength_product(descr_invIndex_test)
attributes_vecLength_test = computeVectorLength_product(attributes_invIndex_test)

In [41]:
print("Product Title Vector Length:")
print(title_vecLength_test[100001])

Product Title Vector Length:
18.949368613584586


## SEARCH DATA
### Convert SEARCH column to inverted index dictionary.

In [42]:
#Get search tf-idf with respect to each product column.
title_tf_idfSearch_test = tf_idfSearch(title_invIndex_test, testData_dict)
descr_tf_idfSearch_test = tf_idfSearch(descr_invIndex_test, testData_dict)
attributes_tf_idfSearch_test = tf_idfSearch(attributes_invIndex_test, testData_dict)

### Compute SEARCH score.

In [43]:
#Get search score tf-idf with respect to each product column.
title_searchScore_test = product_searchScore(title_tf_idfSearch_test)
descr_searchScore_test = product_searchScore(descr_tf_idfSearch_test)
attributes_searchScore_test = product_searchScore(attributes_tf_idfSearch_test)

In [45]:
print("Product Title Search score - TEST data:")
title_searchScore_test[74066]

Product Title Search score - TEST data:


8036.937356490582

### Compute component vector length of SEARCH column.

In [47]:
#Get vector length of each tf-idf search, respect to each product column.
title_search_vecLength_test = computeVectorLength_search(title_tf_idfSearch_test)
descr_search_vecLength_test = computeVectorLength_search(descr_tf_idfSearch_test)
attributes_search_vecLength_test = computeVectorLength_search(attributes_tf_idfSearch_test)

### Compute Cosine similarity.

In [48]:
#Get cosine similarity of each tf-idf search with each product column.
title_cosineSim_test = cosineSimilarity(title_search_vecLength_test, title_vecLength_test,title_searchScore_test, testData_dict)
descr_cosineSim_test = cosineSimilarity(descr_search_vecLength_test, descr_vecLength_test, descr_searchScore_test, testData_dict)
attributes_cosineSim_test = cosineSimilarity(attributes_search_vecLength_test,attributes_vecLength_test, attributes_searchScore_test, testData_dict)

In [49]:
#Make sure dictionary generated properly, len = total number of rows. 
len(title_cosineSim_test)

112067

### Compute Jaccard similarity.

In [50]:
#Get jaccard similarity of each tf-idf search with each product column.
title_jaccardSim_test = jaccardSimilarity('product_title_wcDict', testData_dict)
descr_jaccardSim_test = jaccardSimilarity('product_description_wcDict', testData_dict)
attributes_jaccardSim_test = jaccardSimilarity('attributes_wcDict', testData_dict)

### Newly constructed data frame - test data.

In [51]:
#Create a DataFrame using trainData_dict as a base
testData_Sim = testData_dict.copy()

# Add the cosine similarity scores as new columns
testData_Sim['title_cosineSim'] = testData_Sim.index.map(title_cosineSim_test)
testData_Sim['descr_cosineSim'] = testData_Sim.index.map(descr_cosineSim_test)
testData_Sim['attributes_cosineSim'] = testData_Sim.index.map(attributes_cosineSim_test)

testData_Sim['title_jaccardSim'] = testData_Sim.index.map(title_cosineSim_test)
testData_Sim['descr_jaccardSim'] = testData_Sim.index.map(descr_cosineSim_test)
testData_Sim['attributes_jaccardSim'] = testData_Sim.index.map(attributes_cosineSim_test)

#Drop oroginal columns.
testData_Sim = testData_Sim.drop(columns=['product_uid', 'product_title_wcDict', 'search_term_wcDict', 'product_description_wcDict', 'attributes_wcDict'])

testData_Sim.head(5)

Unnamed: 0,title_cosineSim,descr_cosineSim,attributes_cosineSim,title_jaccardSim,descr_jaccardSim,attributes_jaccardSim
0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.052772,0.01391,0.020303,0.052772,0.01391,0.020303
2,0.052772,0.01391,0.024464,0.052772,0.01391,0.024464
3,0.052772,0.01391,0.025507,0.052772,0.01391,0.025507
4,0.09172,0.030435,0.03138,0.09172,0.030435,0.03138


In [52]:
testData_Sim.shape

(112067, 6)

## Prediction using test set both BASIC and best models. Write prediction to a csv file.
### Best Model.

In [80]:
#Extract features from the test set,
X_test = testData_Sim

#Make predictions on the test set,
y_pred = best_model.predict(X_test)

In [81]:
bestModel_predictionsTest_df = pd.DataFrame({'id': testData['id'], 'relevance': y_pred.round(2)})
print(bestModel_predictionsTest_df.head(10))
bestModel_predictionsTest_df.to_csv('test_predictions_best_model.csv', index=False)
print(f'Successfully save - Best Model!')

   id  relevance
0   4       2.02
1   5       2.43
2   6       2.40
3   7       2.39
4  10       2.43
5  12       2.31
6  13       2.29
7  14       2.34
8  15       2.32
9  24       2.30
Successfully save - Best Model!


### Basic Model.

In [82]:
#Extract features from the test set
testPredictors = testData_Sim[['title_cosineSim', 'descr_cosineSim', 'attributes_cosineSim',
                               'title_jaccardSim', 'descr_jaccardSim', 'attributes_jaccardSim']]

#Make predictions on the test set
basicModel_predictionsTest = model.predict(testPredictors)
basicModel_predictionsTest_df = pd.DataFrame({'id': testData['id'], 'relevance': predictionsTest.round(2)})
print(basicModel_predictionsTest_df.head(10))
basicModel_predictionsTest_df.to_csv('test_predictions_basic_model.csv', index=False)
print(f'Successfully save - Basic Model!')

   id  relevance
0   4       2.26
1   5       2.30
2   6       2.29
3   7       2.29
4  10       2.34
5  12       2.26
6  13       2.35
7  14       2.40
8  15       2.38
9  24       2.31
Successfully save - Basic Model!
