In [None]:
import os
import openai
import numpy as np 
import pandas as pd
import time 
import pdfkit
import re
import copy
from tqdm import tqdm
from itertools import product
# Read the OpenAI credentials from the environment rather than hard-coding them here.
# Either value is None when the corresponding variable is not set.
openai.organization = os.getenv("OPENAI_ORGANIZATION")
openai.api_key = os.getenv("OPENAI_API_KEY")
# Raise the pandas display limit to 500 rows.
pd.options.display.max_rows = 500
# Fetch the model metadata; as the last expression of the cell its result is displayed.
# NOTE(review): presumably serves as a credential/connectivity check - confirm.
openai.Model.retrieve('gpt-3.5-turbo')

In [None]:
# Load the three exploded feature tables (positives, negatives, wishlist) in that
# order and drop every row that contains a missing value.
dfp, dfn, dfw = (
    pd.read_csv(csv_name).dropna()
    for csv_name in (
        'ExplodedPositives.csv',
        'ExplodedNegatives.csv',
        'ExplodedWishlist.csv',
    )
)

In [None]:
def _condense_chunk(chunk):
    """Send one chunk of features to the chat model and return the cleaned bullet texts."""
    prompt = (
        "Given a list of words in the following list which contain many words with similar semantic "
        "meaning, condense those words into a smaller list of words which represent the same meaning. The words are related to "
        "vehicle features. Make each word or feature into a bullet point. Focus purely on vehicle features like Fuel Efficiency, Spacious Interiors etc. "
        "Try to make each feature as distinct as possible from the others. If similar, try to group them together into a "
        "single feature. Avoid long bullet points. Do not number the bullets. Do not add anything else to the response like "
        "a Note or a warning. List:  "
    )
    reply = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=[
            {"role": "system", "content": "You are a helpful assistant that performs the specific task being asked."},
            # str() keeps the join from failing on a non-string cell (e.g. a number).
            {"role": "user", "content": prompt + ', '.join(map(str, chunk))},
        ],
        temperature=1,
        max_tokens=1000,
        frequency_penalty=0,
        presence_penalty=0.9,
    ).get("choices")[0]['message']['content']

    # One bullet per line: remove only a *leading* bullet marker (the previous
    # replace('- ', '') also removed "- " in the middle of a feature), trim the
    # text and skip blank lines so they do not turn into empty features.
    bullets = (re.sub(r'^\s*[-*•]\s+', '', line).strip() for line in reply.split('\n'))
    return [text for text in bullets if text]


def _explode_features(master_list):
    """Flatten the per-chunk bullet lists into a de-duplicated one-feature-per-row DataFrame."""
    # Same shape as the input of semantic_condenser2: a flat "Features" column and
    # no nested structures. The index is the position of the chunk in master_list.
    exploded = pd.DataFrame([', '.join(chunk) for chunk in master_list], columns=["Features"])
    # A bullet may itself bundle several features with ',' or '/'; split on both.
    for separator in (',', '/'):
        exploded = exploded.assign(
            Features=exploded["Features"].str.split(separator)
        ).explode("Features")
    # Splitting leaves the blank that followed each separator; without stripping it
    # the word count is inflated and " X" / "X" survive drop_duplicates as two rows.
    exploded = exploded.assign(Features=exploded["Features"].str.strip())
    # Positional mask: the index holds repeated chunk numbers after explode.
    exploded = exploded.loc[(exploded["Features"].str.len() > 0).to_numpy()]
    exploded = exploded.assign(**{"Word Count": exploded["Features"].str.split().str.len()})
    return exploded.drop_duplicates()


def semantic_condenser2(df, context):
    """Condense the "Features" column of ``df`` with gpt-3.5-turbo, ``context`` features per request.

    The features are sent to the chat model in consecutive chunks of at most
    ``context`` entries (chunking keeps each request within the model's context
    length). Every reply is parsed as one feature per bullet line, and the
    replies of all chunks are flattened into a single DataFrame.

    Chaining a second compression request inside this function was tried and
    did not work well: the model joined features with "and" instead of keeping
    one feature per bullet. Feed the returned frame back into this function for
    further condensation instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Features" column.
    context : int
        Maximum number of features per request; must be positive.

    Returns
    -------
    tuple
        ``(features_df, index_exceptions)``. ``features_df`` has the columns
        "Features" and "Word Count" with duplicate rows removed.
        ``index_exceptions`` lists the ``(start, stop)`` slice bounds of every
        chunk whose request raised; those chunks are missing from ``features_df``.

    Raises
    ------
    ValueError
        If ``context`` is not positive.
    """
    if context <= 0:
        raise ValueError("context must be a positive number of features per request")

    features = df["Features"].tolist()
    total = len(features)
    master_list = []
    index_exceptions = []

    # The bounds are derived from the loop position, not from counters that only
    # advance on success: previously a failed request left the counters untouched,
    # so the same chunk was re-sent and the last chunk(s) were silently never sent.
    for start in tqdm(range(0, total, context)):
        stop = min(start + context, total)
        try:
            master_list.append(_condense_chunk(features[start:stop]))
        except Exception as e:
            # Best effort: report the failure, remember the range, carry on.
            print(e)
            index_exceptions.append((start, stop))

    return (_explode_features(master_list), index_exceptions)

In [None]:
# Level 1: first condensation pass over the raw exploded tables.
# Each call returns (condensed DataFrame, failed chunk ranges); element 0 is the
# frame that feeds the next level.
level1_masterp = semantic_condenser2(df=dfp, context=500)
level1_masterpos = level1_masterp[0]

level1_mastern = semantic_condenser2(df=dfn, context=250)
level1_masterneg = level1_mastern[0]

level1_masterw = semantic_condenser2(df=dfw, context=250)
level1_masterwish = level1_masterw[0]

In [None]:
# Level 2: condense the level-1 frames again, 300 features per request.
level2_masterp = semantic_condenser2(df=level1_masterpos, context=300)
level2_masterpos = level2_masterp[0]

level2_mastern = semantic_condenser2(df=level1_masterneg, context=300)
level2_masterneg = level2_mastern[0]

level2_masterw = semantic_condenser2(df=level1_masterwish, context=300)
level2_masterwish = level2_masterw[0]

In [None]:
# Level 3: condense the level-2 frames again, 300 features per request.
level3_masterp = semantic_condenser2(df=level2_masterpos, context=300)
level3_masterpos = level3_masterp[0]

level3_mastern = semantic_condenser2(df=level2_masterneg, context=300)
level3_masterneg = level3_mastern[0]

level3_masterw = semantic_condenser2(df=level2_masterwish, context=300)
level3_masterwish = level3_masterw[0]

In [None]:
# Level 4: condense the level-3 frames again, 300 features per request.
level4_masterp = semantic_condenser2(df=level3_masterpos, context=300)
level4_masterpos = level4_masterp[0]

level4_mastern = semantic_condenser2(df=level3_masterneg, context=300)
level4_masterneg = level4_mastern[0]

level4_masterw = semantic_condenser2(df=level3_masterwish, context=300)
level4_masterwish = level4_masterw[0]

In [None]:
# Level 5 is the first level at which the whole condensed list fits into a single
# request, so no overlap between chunks remains. It is therefore the lowest level
# of master list worth considering.
level5_masterp = semantic_condenser2(df=level4_masterpos, context=400)
level5_masterpos = level5_masterp[0]

level5_mastern = semantic_condenser2(df=level4_masterneg, context=400)
level5_masterneg = level5_mastern[0]

level5_masterw = semantic_condenser2(df=level4_masterwish, context=400)
level5_masterwish = level5_masterw[0]

In [None]:
# Level 6: condense the level-5 frames once more, 400 features per request.
level6_masterp = semantic_condenser2(df=level5_masterpos, context=400)
level6_masterpos = level6_masterp[0]

level6_mastern = semantic_condenser2(df=level5_masterneg, context=400)
level6_masterneg = level6_mastern[0]

level6_masterw = semantic_condenser2(df=level5_masterwish, context=400)
level6_masterwish = level6_masterw[0]

In [None]:
#Very minimal change observed in level 7 hence level 6 seems to be the most optimal
# Level 7: condense the level-6 frames once more, 400 features per request.
level7_masterp = semantic_condenser2(df=level6_masterpos, context=400)
level7_masterpos = level7_masterp[0]

level7_mastern = semantic_condenser2(df=level6_masterneg, context=400)
level7_masterneg = level7_mastern[0]

level7_masterw = semantic_condenser2(df=level6_masterwish, context=400)
level7_masterwish = level7_masterw[0]

In [None]:
# Write the level-5 and level-6 condensed lists to CSV, grouped by table.
level5_masterpos.to_csv(path_or_buf='CondensedPositives_L5.csv')
level6_masterpos.to_csv(path_or_buf='CondensedPositives_L6.csv')

level5_masterneg.to_csv(path_or_buf='CondensedNegatives_L5.csv')
level6_masterneg.to_csv(path_or_buf='CondensedNegatives_L6.csv')

level5_masterwish.to_csv(path_or_buf='CondensedWishlist_L5.csv')
level6_masterwish.to_csv(path_or_buf='CondensedWishlist_L6.csv')