In [11]:
from langchain.document_loaders import SeleniumURLLoader
import tiktoken

from langchain.chat_models import ChatOpenAI
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain import PromptTemplate, OpenAI, LLMChain

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

import pandas as pd
import os
from dotenv import load_dotenv
import boto3
import io
import tempfile



In [2]:
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']
aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']
#aws_session_token = os.environ.get('AWS_SESSION_TOKEN')
sts_client = boto3.client('sts')
#brwoserless_api_key = os.getenv("BROWSERLESS_API_KEY")
#serper_api_key = os.getenv("SERP_API_KEY")

In [98]:
#variables

import argparse

# Create an argument parser
parser = argparse.ArgumentParser(description='My Script Description')

# Define command-line arguments
parser.add_argument('--input-folder', required=True, help='Path to the input file')
parser.add_argument('--input-file', required=True, help='Path to the output file')

args = parser.parse_args()


input_folder = args.input_folder
input_file = args.input_file


usage: ipykernel_launcher.py [-h] --input-folder INPUT_FOLDER --input-file INPUT_FILE
ipykernel_launcher.py: error: the following arguments are required: --input-folder, --input-file


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [5]:
def html_loader(urls):
    loader = SeleniumURLLoader(urls=urls)
    data = loader.load()
    return data

def read_excel_s3(bucket_name,key):
    s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
    
    # Specify the S3 bucket name
    s3_bucket_name = bucket_name
    
    
    s3_key = key
    
    # 'https://mini-excels.s3.eu-west-1.amazonaws.com/mini_excels/mini_excelsoutput_1.xlsx'
    
    # Read the Excel file from S3
    s3_response = s3.get_object(Bucket=s3_bucket_name, Key=s3_key)
    excel_data = s3_response['Body'].read()
    
    # You can now parse 'excel_data' using a library like pandas.
    return pd.read_excel(excel_data)

def output_writer(df,nombre):
    data = df
    file_name = nombre
        
    with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as temp_excel_file:
        data.to_excel(temp_excel_file.name, index=False, engine='xlsxwriter')
    
    # Read the temporary file into a binary stream
    with open(temp_excel_file.name, 'rb') as temp_file:
        excel_bytes = io.BytesIO(temp_file.read())
    
    # Specify the S3 key (file path) for the output file
    s3_key = f'output/{file_name}'  # You can customize the path as needed
    
    # Upload the Excel data to S3
    s3.put_object(Bucket=s3_bucket_name, Key=s3_key, Body=excel_bytes)
    
    print(f'Uploaded {file_name} to S3 at s3://{s3_bucket_name}/{s3_key}')

def summarizer(html_texto):
    llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)
    
    # Map
    map_template = """Aqui tienes el texto del home page de una tienda de ecommerce
    {docs}
    Haz un extenso resumen que caracterize el nicho de la tienda y los puntos importantes a destacar
    """
    map_prompt = PromptTemplate.from_template(map_template)
    map_chain = LLMChain(llm=llm, prompt=map_prompt)
    summary = map_chain.run(html_texto)
    return summary
    
def nicho_finder(resumen):
    template = """Pregunta: Cual es el nicho de esta tienda online segun el siguiente resúmen del home page:
    
    {resumen}
    
    La respuesta debe resumir el nicho en menos de 5 palabras. Responde única y exclusivamente con el nicho de la tienda, nada mas.
    """
    
    prompt = PromptTemplate(template=template, input_variables=["resumen"])
    
    llm = ChatOpenAI(model_name='gpt-3.5-turbo',temperature=0)
    
    #llm = OpenAI(openai_api_key="YOUR_API_KEY", openai_organization="YOUR_ORGANIZATION_ID")
    
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    
    return llm_chain.run(resumen)
    
def advisor(query,summary):
    
    chat = ChatOpenAI(model_name="gpt-4", temperature=0)

    # Template to use for the system message prompt

    template = f"""Transformate en Roger de RojantMedia, un consultor experto en escalar la facturación de tiendas online.
    
            El cliente quiere facturar lo antes posible 20.000€ más al mes, cual es el camino más rápido? que recomendaciones le das personalizadas para su nicho y lo más especificas posibles, usando la información que te he pasado de su tienda online.

            Escribe UNICAMENTE las sugerencias más importantes y nada más.

            Humaniza tu lenguaje, de manera directa y divertida, pero sobretodo que él sienta que le estas dando claves de mucho valor para que quiera implementarlas ya.

            Usa solo los 3 puntos más importantes y estructuralos en (que está haciendo mal y porque eso le impide facturar más, y cual es la solución y que logrará, además acaba aportando datos que demuestren tangiblemente que tienes razón)

            Supon que el cliente ha probado antes a hacer publicidad y no le ha funcionado, esta quemado con este tema... explicale también porque le habrá ido mal

            Supon que el cliente quiere mejorar el valor percibido de su producto para mi cliente potencial y no se cómo venderlos logica y emocionalmente cómo haría un copywritter.
            ----
            INFORMACION TIENDA ONLINE: 

            {summary}

            ----
            
            """

    system_message_prompt = SystemMessagePromptTemplate.from_template(template)

    # Human question prompt
    human_template = "Responde la siguiente pregunta: {question}"
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    chat_prompt = ChatPromptTemplate.from_messages(
        [system_message_prompt, human_message_prompt]
    )

    chain = LLMChain(llm=chat, prompt=chat_prompt)

    response_0 = chain.run(question=query,summary=summary)

    return response_0

In [81]:
df = read_excel_s3('mini-excels','mini_excels/mini_excelsoutput_1.xlsx')
df
#urls = list(df['domain'][:100])

  return pd.read_excel(excel_data)


Unnamed: 0,domain,country_code,created,emails,instagram,phones,platform
0,www.visionnocturnaytermica.com,España,2023-05-19,shop@visionnocturnaytermica.com,vision_nocturna_y_termica,+34 658 50 13 65,Shopify
1,www.gepcoformacion.es,España,2023-04-14,info@gestionemergencias.com,gepco_formacion,"+34 916 26 38 18,+34 935 64 63 46",WooCommerce
2,mrribsmadrid.es,España,2023-05-19,info@mrribsmadrid.es,mr_ribs_madrid,+34 633 53 95 00,WooCommerce
3,lewu.es,España,2023-04-07,info@lewu.es,lewulearnandenjoywithus,+34 680 84 97 93,WooCommerce
4,www.powerbuildingoficial.com,España,2023-04-14,soporte@powerbuildingoficial.com,powerbuilding_oficial,+34 661 01 73 17,Shopify
5,electrotodo.fr,España,2023-04-07,boutique@electrotodo.fr,,"+34 925 25 88 72,+34 925 21 69 69",Shopify
6,www.harven.es,España,2023-04-28,harven@harven.es,harvengroup,"+34 928 20 16 36,+34 928 20 35 22",WooCommerce
7,www.falconsdevilafranca.cat,España,2023-03-24,falcons@falconsdevilafranca.cat,falconsvfp,+34 678 44 78 40,Wix
8,www.destockage.es,España,2023-06-02,info@destockage.es,,+34 633 12 20 55,Odoo
9,www.reciclajesdelsurbahia.es,España,2023-04-21,info@reciclajesdelsurbahia.es,reciclajesdelsurbahias.l.1,+34 956 40 62 00,BaseKit


In [4]:
#urls = ["https://bellascositas.es/","https://pikadura.es/es/"]
import time
# Record the start time
start_time = time.time()
##########
#Para hacerlo a traves de un excel y un dataframe
urls = list(df['domain'][:10])
#urls = ["Rebels108.com","www.ergonomika.es","Tiendascaprichos.com","Klix.es","www.hillsgarden.co","Lalaynina.com","Superfantasticas.com"]
new_url = []
for url in urls:
    url_nueva = 'https://'+url+'/'
    new_url.append(url_nueva)
print(new_url)


#url = ['https://www.powerbuildingoficial.com/']

NameError: name 'df' is not defined

In [6]:
!pip freeze

aiohttp==3.8.5
aiosignal==1.3.1
annotated-types==0.5.0
anyio==3.7.1
async-timeout==4.0.3
attrs==23.1.0
beautifulsoup4==4.12.2
blis==0.7.10
boltons @ file:///C:/ci_311/boltons_1677729932371/work
brotlipy==0.7.0
catalogue==2.0.9
certifi @ file:///C:/b/abs_4a0polqwty/croot/certifi_1683875377622/work/certifi
cffi @ file:///C:/ci_311/cffi_1676423759166/work
charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
click==8.1.7
colorama @ file:///C:/ci_311/colorama_1676422310965/work
conda @ file:///C:/b/abs_b5q92oum8h/croot/conda_1689272261421/work
conda-content-trust @ file:///C:/ci_311/conda-content-trust_1676467587370/work
conda-libmamba-solver @ file:///C:/b/abs_5a9xvt2a3k/croot/conda-libmamba-solver_1685032355900/work/src
conda-package-handling @ file:///C:/b/abs_ce4_vcfd0y/croot/conda-package-handling_1685024800103/work
conda_package_streaming @ file:///C:/b/abs_88a7k_wmm1/croot/conda-package-streaming_1685019697115/work
confection==0.1.3
cryptography @ fil

In [88]:
paginas_html = html_loader(new_url)

In [89]:
resumenes = []
for html in paginas_html:
    x = html.page_content.replace('\t','')
    y = x.replace('\n',' ')
    resumenes.append(summarizer(y))

In [90]:
query = 'Quiero facturar lo antes posible 20.000€ más al mes, cual es el camino más rápido?'
respuestas = []
nichos = []
for resumen in resumenes:
    nichos.append(nicho_finder(resumen))
    respuestas.append(advisor(query,resumen))

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Print the result in seconds
print(f"Execution time: {float(elapsed_time)/60} minutes")

Execution time: 12.977583988507588 minutes


In [91]:
df['summary'] = resumenes
df['niche'] = nichos
df['personalized_email'] = respuestas

In [96]:
nom = 'mini_excels/mini_excelsoutput_1.xlsx'
nombre = nom.split('/')[1]
output_writer(df,nombre)

Uploaded mini_excelsoutput_1.xlsx to S3 at s3://mini-excels/output/mini_excelsoutput_1.xlsx


In [49]:
def read_excel_s3(bucket_name,s3_key):
    s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
    
    # Specify the S3 bucket name
    s3_bucket_name = 'mini-excels'
    
    
    s3_key = 'mini_excels/mini_excelsoutput_1.xlsx'
    
    # 'https://mini-excels.s3.eu-west-1.amazonaws.com/mini_excels/mini_excelsoutput_1.xlsx'
    
    # Read the Excel file from S3
    s3_response = s3.get_object(Bucket=s3_bucket_name, Key=s3_key)
    excel_data = s3_response['Body'].read()
    
    # You can now parse 'excel_data' using a library like pandas.
    return pd.read_excel(excel_data)

In [53]:
data1 = pd.read_excel(excel_data)
data2 = pd.read_excel(excel_data)

  data1 = pd.read_excel(excel_data)
  data2 = pd.read_excel(excel_data)


In [77]:
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)

# Specify the S3 bucket name
s3_bucket_name = 'mini-excels'

# List of output Excel data and file names
output_data = [
    {'data': pd.DataFrame(data1), 'file_name': 'output_file1.xlsx'},
    {'data': pd.DataFrame(data2), 'file_name': 'output_file2.xlsx'},
    # Add more data and file names as needed
]

# Upload each output Excel file to S3
for item in output_data:
    data = item['data']
    file_name = item['file_name']
    
    with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as temp_excel_file:
        data.to_excel(temp_excel_file.name, index=False, engine='xlsxwriter')

    # Read the temporary file into a binary stream
    with open(temp_excel_file.name, 'rb') as temp_file:
        excel_bytes = io.BytesIO(temp_file.read())

    # Specify the S3 key (file path) for the output file
    s3_key = f'output/{file_name}'  # You can customize the path as needed

    # Upload the Excel data to S3
    s3.put_object(Bucket=s3_bucket_name, Key=s3_key, Body=excel_bytes)

    print(f'Uploaded {file_name} to S3 at s3://{s3_bucket_name}/{s3_key}')





Uploaded output_file1.xlsx to S3 at s3://mini-excels/output/output_file1.xlsx
Uploaded output_file2.xlsx to S3 at s3://mini-excels/output/output_file2.xlsx


  pd.read_excel(excel_data)


Unnamed: 0,domain,country_code,created,emails,instagram,phones,platform
0,www.visionnocturnaytermica.com,España,2023-05-19,shop@visionnocturnaytermica.com,vision_nocturna_y_termica,+34 658 50 13 65,Shopify
1,www.gepcoformacion.es,España,2023-04-14,info@gestionemergencias.com,gepco_formacion,"+34 916 26 38 18,+34 935 64 63 46",WooCommerce
2,mrribsmadrid.es,España,2023-05-19,info@mrribsmadrid.es,mr_ribs_madrid,+34 633 53 95 00,WooCommerce
3,lewu.es,España,2023-04-07,info@lewu.es,lewulearnandenjoywithus,+34 680 84 97 93,WooCommerce
4,www.powerbuildingoficial.com,España,2023-04-14,soporte@powerbuildingoficial.com,powerbuilding_oficial,+34 661 01 73 17,Shopify
5,electrotodo.fr,España,2023-04-07,boutique@electrotodo.fr,,"+34 925 25 88 72,+34 925 21 69 69",Shopify
6,www.harven.es,España,2023-04-28,harven@harven.es,harvengroup,"+34 928 20 16 36,+34 928 20 35 22",WooCommerce
7,www.falconsdevilafranca.cat,España,2023-03-24,falcons@falconsdevilafranca.cat,falconsvfp,+34 678 44 78 40,Wix
8,www.destockage.es,España,2023-06-02,info@destockage.es,,+34 633 12 20 55,Odoo
9,www.reciclajesdelsurbahia.es,España,2023-04-21,info@reciclajesdelsurbahia.es,reciclajesdelsurbahias.l.1,+34 956 40 62 00,BaseKit


## Pruebas

In [40]:
df = pd.read_excel(r'C:\Users\Usuario\Desktop\FlaskProjects\Shopify_analizer\99 correctos.xlsx')


In [41]:
import time
# Record the start time
start_time = time.time()
##########
#Para hacerlo a traves de un excel y un dataframe
urls = list(df['domain'])
#urls = ["Rebels108.com","www.ergonomika.es","Tiendascaprichos.com","Klix.es","www.hillsgarden.co","Lalaynina.com","Superfantasticas.com"]
new_url = []
for url in urls:
    url_nueva = 'https://'+url
    new_url.append(url_nueva)
print(new_url)


['https://www.visionnocturnaytermica.com', 'https://www.gepcoformacion.es', 'https://mrribsmadrid.es', 'https://lewu.es', 'https://www.powerbuildingoficial.com', 'https://electrotodo.fr', 'https://www.harven.es', 'https://www.falconsdevilafranca.cat', 'https://www.destockage.es', 'https://www.reciclajesdelsurbahia.es', 'https://store.kingsleague.pro', 'https://es.acebattery.com', 'https://es.ridex.eu', 'https://tienda.artser.es', 'https://market.noticiasgourmet.es', 'https://desarrollo.winegogh.es', 'https://shop.fander.es', 'https://tienda.villanueva.edu', 'https://da.roble.store', 'https://es.tripnbike.com', 'https://admin.bibloo.es', 'https://regala.pancracio.com', 'https://store.a-emotionallight.com', 'https://es.twobrothers-store.com', 'https://pt.ysabelmora.com', 'https://es.aeratron.io', 'https://es.ketocharge.com', 'https://es.watchesofamerica.com', 'https://de.boneandwhite.com', 'https://tienda.intech3d.es', 'https://ie.bouchrafilalilahlou.com', 'https://ibiza-blau-music.sumup

In [42]:
from langchain.document_loaders import UnstructuredURLLoader

def url_loader(urls):
    loader = UnstructuredURLLoader(urls=urls)
    data = loader.load()
    return data

htmls = html_loader(new_url)

end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Print the result in seconds
print(f"Execution time: {float(elapsed_time)/60} minutes")

Execution time: 4.646285100777944 minutes


In [44]:
len(htmls)

99

In [55]:
textos = []
for html in htmls:
    x = html.page_content.replace('\t','')
    y = x.replace('\n',' ')
    textos.append(y)

In [57]:
df['paginas']= textos

In [58]:
df.to_excel(r'C:\Users\Usuario\Desktop\FlaskProjects\Shopify_analizer\99 correctos_paginas.xlsx')