<a href="https://colab.research.google.com/github/smartcontracts0/genomics/blob/main/FHE/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import random
import pandas as pd
import numpy as np
from faker import Faker

# Initialize Faker
fake = Faker()

# Seed both random sources so a fresh kernel regenerates the same dataset.
# `random` drives the sequences and categorical draws below; Faker keeps its
# own generator (used for the sample ids), which Faker.seed() seeds.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Define DNA nucleotides
nucleotides = ['A', 'T', 'C', 'G']

def generate_dna_sequence(length):
    """Return a random DNA string made of `length` bases.

    Bases are sampled with replacement from the module-level `nucleotides`
    list via `random.choices`, then concatenated into a single string.
    """
    bases = random.choices(nucleotides, k=length)
    return ''.join(bases)

# Create a synthetic dataset
def create_synthetic_dataset(n_samples=1000):
    """Build a DataFrame of `n_samples` randomly generated genomic records.

    Each categorical field is drawn with its own `random.choice` call, so the
    columns are unrelated to one another (and to `phenotype`).

    Columns:
        sample_id: value returned by `fake.uuid4()`.
        sequence: random nucleotide string from `generate_dna_sequence`.
        chromosome: an int in 1-22 or the string 'X'/'Y'. The column therefore
            holds mixed types; cast it to str before encoding it.
        region_type: 'coding' or 'non-coding'.
        sequence_length: length of `sequence`, 50 to 150 inclusive.
        ancestry: 'African', 'Asian', 'European' or 'American'.
        phenotype: 'healthy' or 'diseased'.

    Returns:
        pandas.DataFrame with one row per sample and the columns above.
    """
    columns = ['sample_id', 'sequence', 'chromosome', 'region_type', 'sequence_length', 'ancestry', 'phenotype']

    # The option lists never change between samples, so build them once
    # instead of on every loop iteration.
    chromosomes = list(range(1, 23)) + ['X', 'Y']
    region_types = ['coding', 'non-coding']
    ancestries = ['African', 'Asian', 'European', 'American']
    phenotypes = ['healthy', 'diseased']

    data = []
    for _ in range(n_samples):
        # The order of the random draws is kept as-is so a given seed yields
        # the same rows as before.
        sample_id = fake.uuid4()
        sequence_length = random.randint(50, 150)
        sequence = generate_dna_sequence(sequence_length)
        chromosome = random.choice(chromosomes)
        region_type = random.choice(region_types)
        ancestry = random.choice(ancestries)
        phenotype = random.choice(phenotypes)
        data.append([sample_id, sequence, chromosome, region_type, sequence_length, ancestry, phenotype])
    return pd.DataFrame(data, columns=columns)

# Generate the 1000-sample dataset and preview its first five rows
df = create_synthetic_dataset(n_samples=1000)
print(df.head(5))


                              sample_id  \
0  86bc110e-dfdf-4e17-aaa7-ed6bf7118508   
1  7d53a115-015c-4284-a43d-8e7d81c06625   
2  85449686-7123-4681-b69c-d7f909773d64   
3  2fef72ee-87b8-49b1-892f-5dd7d88d7aed   
4  1df56fa3-4c19-4d0f-a87b-e1c89032c908   

                                            sequence chromosome region_type  \
0  TTCGCTAGCAGAAGGGGTGGCCTACCCTCTTTTTACGGGCAGGTAT...          4  non-coding   
1  CGGCAACTCTGCAGGTAAGGGGCCGGTTTGCTAGGTATTGTACGGT...         10  non-coding   
2  TAACTTACGGTGCAGTCGGGGCTTTACCGTTAGCTCCGAAGCCCCC...         13      coding   
3  AAAGCCGTCGGAGTACTTGATTTAGGTTTGTGTGTAGACGGTTATT...          5      coding   
4  GCAGTATACGCAAGATCAGTACCGGCTGCTAGATAAAATAGTTTTA...         18      coding   

   sequence_length  ancestry phenotype  
0               59  European   healthy  
1               94  European   healthy  
2               92   African   healthy  
3              115  American   healthy  
4              148   African  diseased  


In [29]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Columns holding categorical labels that must become integer codes
categorical_columns = ['chromosome', 'region_type', 'ancestry', 'phenotype']

# Encode into a copy so `df` keeps its raw labels and this cell can be re-run.
# Encoding `df` in place meant a second run would re-encode the integer codes
# (as strings), changing the codes and replacing the stored encoders with ones
# that no longer map back to the original labels.
df_encoded = df.copy()
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    # `chromosome` mixes ints (1-22) with the strings 'X'/'Y'; casting every
    # column to str gives LabelEncoder a uniformly typed input.
    df_encoded[column] = le.fit_transform(df_encoded[column].astype(str))
    # Keep the fitted encoder so codes can be mapped back to labels later
    label_encoders[column] = le

# Prepare features and labels
X = df_encoded[['sequence_length', 'chromosome', 'region_type', 'ancestry']].values
y = df_encoded['phenotype'].values

# Split the data into training and test sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
from concrete.ml.sklearn import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Scale the features to have zero mean and unit variance.
# The scaler is fitted on the training split only and then applied to the test
# split. Note that this overwrites X_train / X_test with their scaled versions.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Now we train in the clear and quantize the weights
model = LogisticRegression(n_bits=8)
model.fit(X_train, y_train)

# We can simulate the predictions in the clear
y_pred_clear = model.predict(X_test)

# We then compile on a representative set
model.compile(X_train)

# Finally we run the inference on encrypted inputs !
y_pred_fhe = model.predict(X_test, fhe="execute")

print("In clear  :", y_pred_clear)
print("In FHE    :", y_pred_fhe)

# Share of test samples where the FHE and clear predictions agree, floored to
# a whole percent. Integer arithmetic is used because int(mean * 100) can land
# one below the true value (e.g. 0.57 * 100 == 56.99999999999999 -> 56).
n_matching = int((y_pred_fhe == y_pred_clear).sum())
similarity_pct = n_matching * 100 // y_pred_clear.size
print(f"Similarity: {similarity_pct}%")

In clear  : [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1]
In FHE    : [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1]
Similarity: 100%
