In [2]:
%pip install streamlit
%pip install pandas
%pip install requests
%pip install beautifulsoup4
%pip install tqdm
%pip install scikit-learn
%pip install transformers
%pip install libgl1


Collecting streamlit
  Downloading streamlit-1.39.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Downloading cachetools-5.5.0-py3-none-any.whl.metadata (5.3 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<6,>=2.1.5 (from streamlit)
  Downloading watchdog-5.0.3-py3-none-win_amd64.whl.metadata (41 kB)
Downloading streamlit-1.39.0-py2.py3-none-any.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 8.7/8.7 MB 45.1 MB/s eta 0:00:00
Downloading cachetools-5.5.0-py3-none-any.whl (9.5 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
   ----------------------------

ERROR: Could not find a version that satisfies the requirement libgl1 (from versions: none)
ERROR: No matching distribution found for libgl1


In [3]:
from io import BytesIO

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from transformers import pipeline

# Initialise the Hugging Face translator used to translate text into English.
# Streamlit re-executes the whole script on every user interaction, so the
# pipeline is built inside an st.cache_resource function: the model is loaded
# once per server process and reused on later reruns.
@st.cache_resource
def _load_translator():
    """Build the multilingual-to-English translation pipeline."""
    return pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")


translator = _load_translator()

# Clean the CRM input dataset
def clean_data(df):
    """Reduce a CRM contact table to one row per reachable company domain.

    Processing steps:
      1. Drop the ``Contact Name`` and ``Contact Job Title`` columns when
         present, then drop duplicate rows and rows with any missing value.
      2. Derive ``Email Domain`` from ``Contact E-mail`` (the text after the
         last ``@``), trimmed and lower-cased so that e.g. ``Gmail.com`` is
         recognised as ``gmail.com``.
      3. Discard rows whose domain is a public mailbox provider.
      4. Keep only domains for which ``http://<domain>`` answers with HTTP 200
         within 2 seconds. Each distinct domain is requested once.
      5. Keep the first row per main domain name (the label before the first
         dot), per e-mail address and per e-mail domain.

    Parameters
    ----------
    df : pandas.DataFrame
        CRM export; must contain a ``Contact E-mail`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns (minus the two contact columns)
        plus ``Email Domain``. The input frame is not modified and the
        surviving rows keep their original index labels. The result may be
        empty.

    Raises
    ------
    KeyError
        If ``Contact E-mail`` is missing from ``df``.
    """
    public_domains = ['gmail.com', 'hotmail.com', 'yahoo.com', 'outlook.com', 'live.com', 'icloud.com', 'unknown.com']

    def check_domain_exists(domain):
        # Best-effort reachability probe: any request failure counts as "no".
        try:
            response = requests.get(f"http://{domain}", timeout=2)
            return response.status_code == 200
        except requests.RequestException:
            return False

    df = df.drop(columns=["Contact Name", "Contact Job Title"], errors='ignore')
    # .copy() makes the frame independent so the column assignment below does
    # not write into (or warn about) a view of the caller's data.
    df = df.drop_duplicates().dropna().copy()

    # astype(str) keeps non-string cells (e.g. numbers) from raising; such
    # values simply fail the reachability probe later on.
    df['Email Domain'] = (
        df['Contact E-mail']
        .astype(str)
        .str.strip()
        .str.lower()
        .str.rsplit('@', n=1)
        .str[-1]
    )
    df = df[~df['Email Domain'].isin(public_domains)]

    # Probe each distinct domain once. Filtering with isin() always yields a
    # boolean mask, so a frame with zero remaining rows is handled correctly.
    reachable_domains = {
        domain for domain in df['Email Domain'].unique() if check_domain_exists(domain)
    }
    df = df[df['Email Domain'].isin(reachable_domains)]

    # Keep the first row for every main domain name (e.g. "acme" for both
    # acme.com and acme.it) without adding a temporary column.
    main_domain_name = df['Email Domain'].str.split('.').str[0]
    df = df[~main_domain_name.duplicated()]

    df = df.drop_duplicates(subset=["Contact E-mail"]).drop_duplicates(subset=["Email Domain"])
    return df

# Clean the categories (context) dataset
def clean_categories(df):
    """Return the category table without duplicates, incomplete rows or buying columns.

    Duplicate rows are removed first, then every row containing a missing
    value, and finally the two buying-related columns are dropped if they
    exist. The input frame is left untouched.
    """
    unused_columns = ['Can they buy the solution?', 'Can they influence the buying decision?']
    unique_rows = df.drop_duplicates()
    complete_rows = unique_rows.dropna()
    return complete_rows.drop(columns=unused_columns, errors='ignore')

# Extract descriptive metadata and leading paragraph text from a parsed page
def extract_meta_information(soup):
    """Collect descriptive text from a parsed HTML document.

    Parameters
    ----------
    soup : object exposing ``find`` and ``find_all`` (e.g. a BeautifulSoup tree)

    Returns
    -------
    dict
        Contains only the keys for which a matching element was found:
        ``meta_description``, ``og_description`` and ``meta_keywords`` hold the
        ``content`` attribute of the matching ``<meta>`` tag (``''`` when the
        attribute is absent); ``paragraphs`` holds the non-blank stripped text
        of the first seven ``<p>`` elements joined by single spaces.
    """
    meta_lookups = (
        ('meta_description', {'name': 'description'}),
        ('og_description', {'property': 'og:description'}),
        ('meta_keywords', {'name': 'keywords'}),
    )

    info = {}
    for info_key, selector in meta_lookups:
        tag = soup.find('meta', attrs=selector)
        if tag:
            info[info_key] = tag.get('content', '')

    paragraphs = soup.find_all('p')
    if paragraphs:
        # Only the first seven paragraphs are considered; blank ones are skipped.
        stripped_texts = (p.get_text().strip() for p in paragraphs[:7])
        info['paragraphs'] = " ".join(text for text in stripped_texts if text)
    return info

# Fetch a company page, extract its descriptive text and translate it into English
def extract_and_translate_company_info(url):
    """Return an English description of the page at ``url``.

    The page is downloaded (2 second timeout) and parsed; its meta description,
    Open Graph description and leading paragraph text are de-duplicated,
    joined with spaces, cut to 700 characters and passed to the module-level
    ``translator`` pipeline.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    str
        The translated text, or ``"Company information not available."`` when
        the response status is not 200, the page yields no descriptive text,
        or any step fails.
    """
    unavailable = "Company information not available."
    try:
        response = requests.get(url, timeout=2)
        if response.status_code != 200:
            return unavailable

        soup = BeautifulSoup(response.text, 'html.parser')
        meta_info = extract_meta_information(soup)

        # Skip missing/blank parts so they add no stray spaces to the joined
        # text; dict.fromkeys removes repeated parts while keeping their order.
        candidate_parts = (
            (meta_info.get(key) or '').strip()
            for key in ['meta_description', 'og_description', 'paragraphs']
        )
        description_parts = list(dict.fromkeys(part for part in candidate_parts if part))
        if not description_parts:
            # Nothing to translate: do not run the model on an empty string.
            return unavailable

        full_description = " ".join(description_parts)[:700]
        return translator(full_description, max_length=700)[0]['translation_text']
    except Exception:
        # Deliberate best-effort: one unreachable or malformed site must not
        # abort the whole batch. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return unavailable

# Add the translated "Description" column
def add_description_column(df):
    """Return a copy of ``df`` with a ``Description`` column.

    For every value of ``Email Domain`` the page ``http://<domain>`` is passed
    to ``extract_and_translate_company_info``. Results are remembered per
    domain, so a domain that appears on several rows is fetched and translated
    only once.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``Email Domain`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    description_by_domain = {}
    descriptions = []
    for domain in tqdm(df['Email Domain'], desc="Extracting and translating company information", unit="company"):
        if domain not in description_by_domain:
            website_url = f"http://{domain}"
            description_by_domain[domain] = extract_and_translate_company_info(website_url)
        descriptions.append(description_by_domain[domain])
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(Description=descriptions)

# Classify companies by TF-IDF cosine similarity against the category notes
def classify_with_cosine_similarity(df_data, df_context):
    """Return a copy of ``df_data`` with a ``Best Category`` column.

    A single TF-IDF vocabulary is fitted on the company descriptions together
    with the category notes. Each company receives the ``Player`` value of the
    category whose ``Notes`` text is most similar to its ``Description``.

    A company whose description has zero similarity to every category shares
    no vocabulary with any of them; it gets ``None`` rather than being assigned
    to whichever category happens to be listed first.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain a ``Description`` column.
    df_context : pandas.DataFrame
        Must contain ``Notes`` and ``Player`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df_data`` is not modified. When either input has no
        rows, ``Best Category`` is ``None`` for every company.
    """
    if df_data.empty or df_context.empty:
        # Nothing to compare; also avoids fitting/scoring on empty input.
        return df_data.assign(**{'Best Category': None})

    # Missing text is treated as an empty document instead of raising.
    descriptions = df_data['Description'].fillna('').astype(str).tolist()
    notes = df_context['Notes'].fillna('').astype(str).tolist()

    vectorizer = TfidfVectorizer().fit(descriptions + notes)
    company_vectors = vectorizer.transform(descriptions)
    category_vectors = vectorizer.transform(notes)

    # One call yields the full (n_companies, n_categories) similarity matrix.
    similarities = cosine_similarity(company_vectors, category_vectors)
    best_indices = similarities.argmax(axis=1)
    best_scores = similarities.max(axis=1)

    players = df_context['Player'].to_numpy()
    best_matches = [
        players[index] if score > 0 else None
        for index, score in zip(best_indices, best_scores)
    ]
    return df_data.assign(**{'Best Category': best_matches})

# Add the "Is Target" column
def add_target_column(df_data, df_context):
    """Return a copy of ``df_data`` with an ``Is Target`` column.

    Each row's ``Best Category`` is looked up among the ``Player`` values of
    ``df_context`` and receives that category's ``Is Target`` value. Categories
    that are not present in ``df_context`` (including missing values) get
    ``"No"``. If a ``Player`` value occurs more than once, its last row wins.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain a ``Best Category`` column.
    df_context : pandas.DataFrame
        Must contain ``Player`` and ``Is Target`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df_data`` is not modified.
    """
    target_map = dict(zip(df_context['Player'], df_context['Is Target']))
    is_target = df_data['Best Category'].map(lambda category: target_map.get(category, "No"))
    # assign() builds a new frame instead of writing into the caller's object.
    return df_data.assign(**{'Is Target': is_target})

# Streamlit interface
st.title("CRM Classifier")

# File uploads
crm_file = st.file_uploader("Upload CRM Input File", type="xlsx")
context_file = st.file_uploader("Upload Context File", type="xlsx")

# The pipeline runs only once both workbooks have been provided.
if crm_file is not None and context_file is not None:
    df_data = pd.read_excel(crm_file)
    df_context = pd.read_excel(context_file)

    # Data cleaning
    st.write("Processing CRM Input Data...")
    df_data_cleaned = clean_data(df_data)
    st.write("Processing Context Data...")
    df_context_cleaned = clean_categories(df_context)

    # Add the scraped and translated description
    st.write("Extracting and translating company information...")
    df_data_with_description = add_description_column(df_data_cleaned)

    # Classification
    st.write("Classifying data using cosine similarity...")
    df_data_classified = classify_with_cosine_similarity(df_data_with_description, df_context_cleaned)

    # Add the "Is Target" column
    st.write("Adding target column...")
    df_data_final = add_target_column(df_data_classified, df_context_cleaned)

    # Serialize the result for download. DataFrame.to_excel needs a target to
    # write to and returns None, so the workbook is written into an in-memory
    # buffer and the buffer's bytes are handed to the download button.
    excel_buffer = BytesIO()
    df_data_final.to_excel(excel_buffer, index=False)

    # Results download
    st.write("Download the classified data:")
    st.download_button(
        label="Download Classified Data as Excel",
        data=excel_buffer.getvalue(),
        file_name="classified_data.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )


  from .autonotebook import tqdm as notebook_tqdm
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
2024-10-29 22:11:31.832 
  command:

    streamlit run c:\Users\Andrea\text-generation-webui-main\installer_files\env\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
