In [None]:
import numpy as np
import re
import os
import openai
from dotenv import load_dotenv

class Prompt_Generator():
    """
    Builds randomly masked variants of a prompt plus the matching regression data.

    Each variant keeps every word of the original prompt with probability p
    (p drawn from ``pval``) and replaces the other words by the blank ``'__'``.
    ``create_X`` encodes which unique words survived, ``create_y`` asks the LLM
    to classify each variant and records the log-probability of its answer.
    """


    def __init__(self, prompt, LLM_Handler, num_datasets = 5 , pval=np.array([0.2, 0.4, 0.6, 0.8])):
        """
        Parameters:
        prompt : original prompt the masked variants are derived from.
        LLM_Handler : object providing fill_in_blanks(prompt) and get_completion(...).
        num_datasets : number of masked variants (rows of X) built by create_X.
        pval : the values p is drawn from (uniformly). Normally pval=[0.2, 0.4, 0.6, 0.8].

        Attributes set here:
        self.original_prompt, self.pval, self.num_datasets, self.LLM_Handler
        self.sample_prompts : list of masked prompts, filled by create_X.
        """
        self.original_prompt = prompt
        # asarray lets callers pass a plain list; an ndarray is stored unchanged.
        self.pval = np.asarray(pval, dtype=float)
        self.num_datasets = num_datasets
        self.sample_prompts = []
        self.LLM_Handler = LLM_Handler


    def sample_p(self, size=None):
        """Draw p (or an array of `size` values) uniformly at random from self.pval."""
        return np.random.choice(self.pval, size=size)


    def coef_scaling(self):
        """Return the mean of 1 / (p * (1 - p)) over self.pval (used to normalise X)."""
        return (1/(self.pval *(1-self.pval))).mean()

    def build_vocabulary(self, originalprompt):
        """
        parameter : original prompt

        return:
        word_to_index : dict mapping each distinct word to its index in unique_words.
        unique_words : distinct words in order of first appearance.
        N : number of unique words
        """
        words = re.findall(r'\b\w+\b', originalprompt)
        unique_words = []
        word_to_index = {}
        for word in words:
            if word not in word_to_index:
                word_to_index[word] = len(unique_words)
                unique_words.append(word)

        N = len(unique_words)
        return word_to_index, unique_words, N


    def create_baseline_words(self, num_iterations=5):
        """
        Collect, for every unique word, the words the LLM proposes in its place.

        For each iteration the prompt is masked, the LLM fills in the blanks, and
        every filled word is appended to the list of the unique word it replaced.

        Parameters:
        num_iterations : number of mask-and-fill rounds (one LLM call each).

        return:
        baseline_list : list of N lists, indexed like build_vocabulary's unique_words.
        """
        word_to_index, _, N = self.build_vocabulary(self.original_prompt)
        original_words = re.findall(r'\b\w+\b', self.original_prompt)
        baseline_list = [[] for _ in range(N)]

        for _ in range(num_iterations):
            p = self.sample_p()
            modified_prompt = remove_words_with_probability(self.original_prompt, probability=p)
            # Get the LLM to fill in the blanks
            filled_prompt = self.LLM_Handler.fill_in_blanks(modified_prompt)
            filled_words = re.findall(r'\b\w+\b', filled_prompt)
            # '__' consists of word characters, so blanks are matched as words too.
            modified_words = re.findall(r'\b\w+\b', modified_prompt)

            # The LLM does not always return exactly one word per blank. When the
            # word counts differ, positions no longer line up, so skip this round
            # instead of raising IndexError or recording misaligned words.
            if len(filled_words) != len(modified_words):
                continue

            for idx, token in enumerate(modified_words):
                if token == '__' and idx < len(original_words):
                    # idx is a word position; translate it to the unique-word index
                    # so prompts with repeated words stay inside baseline_list.
                    slot = word_to_index[original_words[idx]]
                    baseline_list[slot].append(filled_words[idx])

        return baseline_list



#To Do : might change even before p-featurization
    def create_X(self, method='uncompleted'):
        """
        Build the design matrix and the masked prompts it describes.

        Row m corresponds to one masked prompt generated with keep-probability p_m.
        With nu = coef_scaling(), entry (m, j) is 1 / (p_m * sqrt(nu)) when unique
        word j still appears in that masked prompt and -1 / ((1 - p_m) * sqrt(nu))
        otherwise.

        Side effects: self.ps and self.sample_prompts are rebuilt on every call so
        they always describe the rows of the returned matrix.

        Parameters:
        method : currently unused; kept for interface compatibility.

        return:
        X : array of shape (num_datasets, N), N being the number of unique words.
        """
        self.ps = []
        # Reset together with self.ps; otherwise a second call would leave stale
        # prompts in front of the new ones and break the row alignment with X.
        self.sample_prompts = []
        word_to_index, unique_words, N  = self.build_vocabulary(self.original_prompt)
        X = np.zeros((self.num_datasets, N))
        nu = self.coef_scaling() # variance of each feature
        for m in range(self.num_datasets):
            p = self.sample_p()
            self.ps.append(p)
            modified_prompt = remove_words_with_probability(self.original_prompt, probability= p)
            # Start from the "word removed" value, then overwrite the kept words.
            X[m,:] = -1/((1-p)* np.sqrt(nu))
            modified_promptWords = re.findall(r'\b\w+\b', modified_prompt)
            for modified_word in modified_promptWords:
                if modified_word !="__":
                    X[m,word_to_index[modified_word]] = 1/(p* np.sqrt(nu))

            self.sample_prompts.append(modified_prompt)
        return X

    # To Do sample from baseline for each word for putting typical word in it.
    def fill_prompt_form_baseline(self, prompts):
        """Placeholder: currently returns `prompts` unchanged."""

        return prompts



    # To do: add the baseline method. Here is just uncompleted.
    def create_y(self, prompts, method='uncomplete'):
        """
        Classify every prompt with the LLM and record the top token's log-probability.

        Parameters:
        prompts : sequence of (masked) headlines.
        method : 'uncomplete' (prompt text mentions the blanks) or 'baseline'.
                 'uncompleted', the spelling used by create_X, is accepted as an
                 alias of 'uncomplete'.

        return:
        y : array with the log-probability of the first output token per prompt.
        outputs : list with that first output token per prompt.

        Raises:
        ValueError : if method is not one of the supported names.
        """
        CLASSIFICATION_PROMPT_Base = """You will be given a headline of a news article.
        Classify the article into one of the following categories: Technology, Politics, Sports, Art or others.
        Return only the name of the category, and nothing else.
        MAKE SURE your output is one of the four categories stated.
        Article headline: {prompt}"""

        CLASSIFICATION_PROMPT_Method2 = """You will be given a headline of a news article with some blanks instead of words.
        Classify the article into one of the following categories: Technology, Politics, Sports, Art or others.
        Return only the name of the category, and nothing else.
        MAKE SURE your output is one of the four categories stated.
        Article headline: {prompt}"""

        if method == 'uncompleted':
            method = 'uncomplete'
        templates = {
            'baseline': CLASSIFICATION_PROMPT_Base,
            'uncomplete': CLASSIFICATION_PROMPT_Method2,
        }
        # Validate before any API call; previously an unknown method only failed
        # later with an unbound API_RESPONSE.
        if method not in templates:
            raise ValueError(
                f"Unknown method {method!r}; expected 'baseline' or 'uncomplete'."
            )
        template = templates[method]

        if method == 'baseline':
            # The baseline words are not used yet (see the To Do above).
            baseline_list = self.create_baseline_words()
            self.sample_prompts = self.fill_prompt_form_baseline(prompts) #TO Do

        y = np.zeros(len(prompts))
        outputs = []
        for i, prompt in enumerate(prompts):
            API_RESPONSE = self.LLM_Handler.get_completion(
                [{"role": "user", "content": template.format(prompt=prompt)}],
                model="gpt-4",
                logprobs=True,
                top_logprobs=1,
            )

            # First generated token and its most likely candidate.
            val = API_RESPONSE.choices[0].logprobs.content[0].top_logprobs[0]
            outputs.append(val.token)
            y[i] = val.logprob

        return y, outputs
        
    

          
# Pre-compile regex patterns for efficiency
"""
word_pattern matches complete words composed of alphanumeric characters.
token_pattern splits the prompt into words, whitespace, and punctuation, ensuring that all characters
 (including spaces, punctuation like commas and periods, and newlines) are preserved.
"""
word_pattern = re.compile(r'\b\w+\b')
token_pattern = re.compile(r'\b\w+\b|\s+|[^\w\s]')

def remove_words_with_probability(prompt, probability=0.8):
    """
    Replace words of `prompt` by the blank '__' at random.

    The prompt is split into words, whitespace runs and single punctuation
    characters. Each word is kept when a uniform draw on [0, 1) is at most
    `probability` and blanked otherwise, so `probability` is the chance that a
    word survives. Whitespace and punctuation are always copied through.

    Returns the re-joined string.
    """
    pieces = []
    for piece in token_pattern.findall(prompt):
        is_word = word_pattern.fullmatch(piece) is not None
        # A random number is drawn only for word tokens, in reading order.
        if is_word and np.random.uniform(0, 1) > probability:
            pieces.append('__')
        else:
            pieces.append(piece)
    return ''.join(pieces)


In [7]:
class LLM_Handler():
    """Thin wrapper around the OpenAI chat-completion endpoint used by this notebook."""

    def __init__(self) -> None:
        """Load 'config.env' and set the OpenAI API key from OPENAI_API_KEY."""
        load_dotenv('config.env')
        # os.environ.get returns None when the variable is missing; the key is
        # then left unset and the first API call is where that would surface.
        openai.api_key = os.environ.get("OPENAI_API_KEY")


    def fill_in_blanks(self, prompt, model="gpt-4"):
        """
        Ask the model to replace every blank in `prompt` by exactly one word.

        Parameters:
        prompt : sentence containing blanks.
        model : chat model name (defaults to the previously hard-coded "gpt-4").

        return:
        The model's reply text with surrounding whitespace stripped.
        """
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "Please fill in the blanks in the following sentence with exactly one word for each blank then write the sentence with the filled words completely. Remember just sentence without any other thing and the number of filled words equal to the number of blanks."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500,
            temperature=0,
            n=1,
            stop=None
        )
        # Extract the assistant's reply from the response
        filled_prompt = response['choices'][0]['message']['content'].strip()
        return filled_prompt


    def get_completion(
        self,
        messages: list[dict[str, str]],
        model: str = "gpt-4",
        max_tokens=500,
        temperature=0,
        stop=None,
        seed=123,
        tools=None,
        logprobs=None,  # whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message..
        top_logprobs=None,
    ):
        """
        Send `messages` to the chat-completion endpoint and return the raw response.

        All keyword arguments are forwarded to openai.ChatCompletion.create;
        `tools` is only included when it is truthy.

        return:
        The completion object exactly as returned by the OpenAI client (not a
        string: callers read e.g. `.choices[0].logprobs` from it).
        """
        params = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stop": stop,
            "seed": seed,
            "logprobs": logprobs,
            "top_logprobs": top_logprobs,
        }
        if tools:
            params["tools"] = tools

        completion = openai.ChatCompletion.create(**params)
        return completion

    

In [8]:
 #  suppose this two words Photo-Editing 

headlines = [
    "Tech Giant Unveils Latest Smartphone Model with Advanced Photo-Editing Features.",
    "Local Mayor Launches Initiative to Enhance Urban Public Transport.",
    "Tennis Champion Showcases Hidden Talents in Symphony Orchestra Debut",
]
original_prompt = headlines[1]
prompt_generator = Prompt_Generator(prompt=original_prompt, LLM_Handler=LLM_Handler() ,num_datasets = 50 , pval=np.array([0.2, 0.4, 0.6, 0.8]))

# word_to_index, _, N = prompt_generator.build_vocabulary(original_prompt)
# baseline_list = [[] for _ in range(N)]
# rng = np.random.default_rng()
# num_iterations = 1

# for _ in range(num_iterations):
#     p = np.random.choice(np.array([0.2, 0.4, 0.6, 0.8]))
#     # print(p)

#     modified_prompt = remove_words_with_probability(original_prompt, probability=p)
#     # print(modified_prompt)
#     # Get the LLM to fill in the blanks
#     filled_prompt = prompt_generator.LLM_Handler.fill_in_blanks(modified_prompt)
#     # print("filled prompt: ", filled_prompt)
#     filled_words = re.findall(r'\b\w+\b', filled_prompt)

 
#     modified_words = word_pattern.findall(modified_prompt)
#     for idx, token in enumerate(modified_words):
#         if token == '__':
#             # print("fghj",filled_words[idx])
#             baseline_list[idx].append(filled_words[idx])


# baseline_list = prompt_generator.create_baseline_words()

X = prompt_generator.create_X()
print(prompt_generator.sample_prompts)
y, outputs = prompt_generator.create_y(prompt_generator.sample_prompts)
y = np.exp(y)
print(outputs)

# y

['__ __ Launches __ to Enhance Urban __ __.', 'Local Mayor Launches __ to Enhance __ __ Transport.', 'Local __ Launches Initiative to Enhance Urban Public Transport.', 'Local __ Launches __ to Enhance Urban __ __.', 'Local Mayor Launches __ to Enhance Urban __ Transport.', '__ __ __ __ __ __ Urban __ __.', 'Local Mayor Launches Initiative to __ __ Public __.', 'Local Mayor __ Initiative to Enhance __ Public Transport.', '__ __ __ Initiative to Enhance __ Public __.', '__ __ Launches __ __ __ __ __ __.', 'Local Mayor __ __ __ Enhance Urban Public __.', 'Local __ Launches Initiative to Enhance Urban Public Transport.', 'Local Mayor Launches Initiative __ __ __ __ Transport.', '__ __ Launches __ __ __ __ __ __.', '__ __ Launches Initiative to Enhance Urban Public Transport.', '__ __ __ __ __ Enhance Urban __ __.', '__ Mayor Launches __ __ Enhance __ Public Transport.', '__ __ __ __ __ Enhance __ Public Transport.', '__ __ __ __ __ __ __ __ __.', '__ __ __ __ __ __ __ __ __.', '__ __ Launc

In [9]:
import numpy as np
from sklearn.linear_model import LassoCV
# 50 candidate regularisation strengths, log-spaced between 1e-4 and 1e5.
alpha_values = np.logspace(-4, 5, 50)

# 5-fold cross-validated Lasso on the masked-prompt design matrix.
lasso_cv = LassoCV(alphas=alpha_values, cv=5, random_state=123)
lasso_cv.fit(X, y)

print(original_prompt)
best_lambda = lasso_cv.alpha_
print("Optimal lambda:", best_lambda)

# Multiply the fitted coefficients by sqrt(coef_scaling()), the factor that
# create_X divides the features by.
rescaled_coefficients = lasso_cv.coef_ * np.sqrt(prompt_generator.coef_scaling())
print("Coefficients:", rescaled_coefficients)



Local Mayor Launches Initiative to Enhance Urban Public Transport.
Optimal lambda: 0.01599858719606059
Coefficients: [0.         0.0119888  0.         0.         0.         0.
 0.         0.         0.02903531]


In [None]:
import Solver
# Fit Solver.SparseLassoSolver on the X / y produced by the create_X / create_y
# cell (re-run that cell to regenerate them).
sparse_scaling = prompt_generator.coef_scaling()
lasso_solver = Solver.SparseLassoSolver(coef_scaling=sparse_scaling)
res = lasso_solver.fit(X, y)
print("Coefficients:", res)


In [10]:
import Solver
# Fit Solver.LassoSolver on the X / y produced by the create_X / create_y cell
# (re-run that cell to regenerate them).
dense_scaling = prompt_generator.coef_scaling()
lasso_solver = Solver.LassoSolver(coef_scaling=dense_scaling)
res = lasso_solver.fit(X, y)
print("Coefficients:", res)


Optimal lambda: 2.8411709705692926e-05
Coefficients: [ 0.02560887  0.07667842  0.0062007   0.01459472 -0.00722834  0.05050578
  0.02832886  0.03396336  0.08356675]


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

class SourceCATELearner:
    """
    Per-source treatment-effect estimator based on residual-on-residual regression.

    For one source (column) at a time, the column is treated as a binary
    treatment T and the remaining columns as covariates. A propensity model
    e(X) and an outcome model m(X) are fitted, and the effect is obtained from
    a weighted regression of (y - m) / (T - e) with weights (T - e)^2.

    Note: all nuisance models are fitted and evaluated on the same samples
    (no cross-fitting), and the three model objects are refitted on every call.
    """

    def __init__(self, random_state=42):
        """
        Initialize learner for source-wise CATE estimation

        Parameters:
        random_state (int): seed passed to the logistic propensity model.
        """
        self.random_state = random_state
        self.propensity_model = LogisticRegression(random_state=random_state)
        self.outcome_model = LinearRegression()
        self.final_model = LinearRegression()

    def estimate_single_source_cate(self, X, y, source_idx, eps=1e-6):
        """
        Estimate CATE for a single source

        Parameters:
        X (np.array): M x n binary matrix where M is number of samples, n is number of sources
        y (np.array): Target variable for each sample
        source_idx (int): Index of the source to estimate CATE for
        eps (float): smallest absolute treatment residual |T - e(X)| allowed in
            the division; smaller residuals are pushed out to +/- eps.

        Returns:
        float: CATE estimate for the specified source (mean of the fitted
        effect model's predictions over the samples)

        Raises:
        ValueError: if the selected column does not contain exactly the two
        values 0 and 1.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)

        # Extract treatment for the specific source
        T = X[:, source_idx]
        # T - e(X) and predict_proba[:, 1] below are only meaningful for a 0/1
        # treatment in which both arms are present.
        if not np.array_equal(np.unique(T), [0, 1]):
            raise ValueError(
                f"Column {source_idx} must be binary and contain both 0 and 1."
            )

        # Create features excluding the target source
        X_reduced = np.delete(X, source_idx, axis=1)

        # Step 1: Estimate propensity score e(X) = P(T=1|X)
        self.propensity_model.fit(X_reduced, T)
        e_x = self.propensity_model.predict_proba(X_reduced)[:, 1]

        # Step 2: Estimate outcome model m(X)
        self.outcome_model.fit(X_reduced, y)
        m_x = self.outcome_model.predict(X_reduced)

        # Step 3: Calculate pseudo-outcome. Residuals closer to zero than eps
        # are moved to +/- eps so the division cannot produce inf / nan.
        t_residual = T - e_x
        too_small = np.abs(t_residual) < eps
        t_residual = np.where(
            too_small, np.where(t_residual < 0, -eps, eps), t_residual
        )
        pseudo_outcome = (y - m_x) / t_residual

        # Step 4: Final regression for CATE. Minimising
        #   sum ((y - m) - tau(X) * (T - e))^2
        # is the same as regressing the pseudo-outcome with weights (T - e)^2;
        # without the weights, samples with tiny residuals dominate the fit.
        self.final_model.fit(X_reduced, pseudo_outcome, sample_weight=t_residual ** 2)

        # Return average CATE
        return np.mean(self.final_model.predict(X_reduced))

    def estimate_all_sources_cate(self, X, y):
        """
        Estimate CATE for every source, one source at a time

        Parameters:
        X (np.array): M x n binary matrix
        y (np.array): Target variable for each sample

        Returns:
        np.array: CATE estimates for all sources
        """
        n_sources = X.shape[1]
        cates = np.zeros(n_sources)

        # Sequential estimation: each call refits the three shared models.
        for i in range(n_sources):
            cates[i] = self.estimate_single_source_cate(X, y, i)

        return cates



# Test implementation: compares the learner's estimates with known true effects.
# NOTE(review): generate_test_data is not defined in the visible part of this
# notebook — confirm it exists (and returns X, y, true_cates) before running.
# NOTE(review): this rebinds the names X and y, which other cells of this
# notebook also read; re-run the create_X / create_y cell afterwards if needed.
if __name__ == "__main__":
    # Generate test data
    M, n = 1000, 5  # M samples, n sources
    X, y, true_cates = generate_test_data(M, n)
    
    # Initialize and test learner
    learner = SourceCATELearner()
    
    # Test single source CATE
    print("\nTesting single source CATE estimation:")
    source_idx = 0
    single_cate = learner.estimate_single_source_cate(X, y, source_idx)
    print(f"Estimated CATE for source {source_idx}: {single_cate:.3f}")
    print(f"True CATE for source {source_idx}: {true_cates[source_idx]:.3f}")
    
    # Test all sources CATE
    print("\nTesting all sources CATE estimation:")
    estimated_cates = learner.estimate_all_sources_cate(X, y)
    print("Estimated CATEs:", estimated_cates)
    print("True CATEs:", true_cates)
    # Mean absolute difference between estimated and true effects.
    print("\nMean Absolute Error:", np.mean(np.abs(estimated_cates - true_cates)))

In [47]:
from sklearn.linear_model import LinearRegression
# The metric functions were used without being imported anywhere in the
# notebook, which raises NameError on a fresh kernel.
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Ordinary least squares on the current X / y; both errors are computed on the
# training data itself (no hold-out set).
ols_model = LinearRegression()
ols_model.fit(X, y)
predictions_ols_train = ols_model.predict(X)
mse_ols_train = mean_squared_error(y, predictions_ols_train)
print("OLS Training Mean Squared Error (MSE):", mse_ols_train)

mae_ols_train = mean_absolute_error(y, predictions_ols_train)
print("OLS Training Mean Absolute Error (MAE):", mae_ols_train)


OLS Training Mean Squared Error (MSE): 22.26310886203446
OLS Training Mean Absolute Error (MAE): 3.1960798147776086


In [1]:
import os
from dotenv import load_dotenv
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Read the Hugging Face access token from config.env (variable HF_API_KEY).
load_dotenv('config.env')
hf_auth_token  = os.environ.get("HF_API_KEY")

model_name = "meta-llama/Llama-3.1-8B-Instruct"  # Example: a restricted-access Llama 3 model

# Supply the access token to each from_pretrained call. `token` replaces the
# deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=hf_auth_token
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_auth_token
)

# Switch to inference mode (disables dropout and similar training-only layers).
model.eval()

print("Model and tokenizer loaded successfully!")


  from .autonotebook import tqdm as notebook_tqdm


: 