In [None]:
import pandas as pd
import numpy as np

# Fix the global NumPy seed so every run of this cell produces the same draws.
np.random.seed(42)

# Simulation settings
n = 5000                     # number of simulated applicants
haskell_fluency_rate = 0.02  # share of applicants who are truly fluent (2%)
tpr = 0.85                   # P(test says Pass | applicant is fluent)
fpr = 0.10                   # P(test says Pass | applicant is not fluent)

# Ground truth for each applicant: 1 = fluent, 0 = not fluent.
actual_fluency = np.random.choice(
    [1, 0],
    size=n,
    p=[haskell_fluency_rate, 1 - haskell_fluency_rate],
)

# Pass/Fail probabilities keyed by the true fluency flag.
outcome_probs = {
    1: [tpr, 1 - tpr],
    0: [fpr, 1 - fpr],
}

# One independent draw per applicant, in applicant order, so the seeded
# random stream is consumed exactly one sample at a time.
test_results = [
    np.random.choice(['Pass', 'Fail'], p=outcome_probs[int(flag)])
    for flag in actual_fluency
]

# Assemble the labelled dataset with human-readable categories.
fluency_labels = np.where(actual_fluency == 1, 'Yes', 'No')
df = pd.DataFrame({
    'Fluent_in_Haskell': fluency_labels,
    'Screening_Test_Result': test_results
})

# Quick look: class balance, test outcome balance, and the first rows.
print(df['Fluent_in_Haskell'].value_counts())
print(df['Screening_Test_Result'].value_counts())
print(df.head())


Fluent_in_Haskell
No     4901
Yes      99
Name: count, dtype: int64
Screening_Test_Result
Fail    4420
Pass     580
Name: count, dtype: int64
  Fluent_in_Haskell Screening_Test_Result
0                No                  Fail
1                No                  Fail
2                No                  Fail
3                No                  Fail
4                No                  Fail


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Turn both categorical columns into 0/1 indicators so the screening test can
# be scored like a classifier: 1 = fluent (truth) / 1 = passed (prediction).
y_true = df['Fluent_in_Haskell'].eq('Yes').astype(int)
y_pred = df['Screening_Test_Result'].eq('Pass').astype(int)

# Per-class precision / recall / F1 of the screening test against the truth.
class_names = ['Not Fluent', 'Fluent']
report = classification_report(y_true, y_pred, target_names=class_names)
print(report)


              precision    recall  f1-score   support

  Not Fluent       1.00      0.90      0.94      4901
      Fluent       0.14      0.83      0.24        99

    accuracy                           0.90      5000
   macro avg       0.57      0.86      0.59      5000
weighted avg       0.98      0.90      0.93      5000



In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# 1. Rebuild the base dataset with the same seed and parameters as the first cell
np.random.seed(42)
n = 5000
haskell_fluency_rate = 0.02
tpr = 0.85
fpr = 0.10

# Ground truth per applicant: 1 = fluent, 0 = not fluent.
actual_fluency = np.random.choice(
    [1, 0],
    size=n,
    p=[haskell_fluency_rate, 1 - haskell_fluency_rate],
)

# Screening outcome: one draw per applicant, using the true-positive rate for
# fluent applicants and the false-positive rate for everyone else.
pass_rates = [tpr if is_fluent == 1 else fpr for is_fluent in actual_fluency]
test_results = [
    np.random.choice(['Pass', 'Fail'], p=[rate, 1 - rate])
    for rate in pass_rates
]

df = pd.DataFrame({
    'Fluent_in_Haskell': np.where(actual_fluency == 1, 'Yes', 'No'),
    'Screening_Test_Result': test_results,
})

# 2. Simulate features
# Both candidate arrays are drawn for all n rows and np.where keeps one value
# per row; the draw order (pass scores, fail scores, fluent years, other
# years) determines the seeded results.
passed_screening = df['Screening_Test_Result'] == 'Pass'
scores_if_pass = np.random.normal(80, 5, size=n)
scores_if_fail = np.random.normal(60, 5, size=n)
df['Test_Score'] = np.where(passed_screening, scores_if_pass, scores_if_fail)

truly_fluent = df['Fluent_in_Haskell'] == 'Yes'
years_if_fluent = np.random.poisson(4, size=n)
years_if_not_fluent = np.random.poisson(1, size=n)
df['Years_of_Experience'] = np.where(truly_fluent, years_if_fluent, years_if_not_fluent)

# 3. Encode target: 1 = fluent, 0 = not fluent
df['Label'] = truly_fluent.astype(int)

# 4. Train-test split: hold out 30% of the rows, stratified on the label so
#    both partitions keep the same fluent / not-fluent proportions.
feature_columns = ['Test_Score', 'Years_of_Experience']
X = df[feature_columns]
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# 5. Logistic regression; class_weight='balanced' re-weights the classes to
#    compensate for how few fluent applicants there are.
model = LogisticRegression(class_weight='balanced', solver='liblinear')
model.fit(X_train, y_train)

# 6. Evaluation on the held-out rows
y_pred = model.predict(X_test)
holdout_report = classification_report(
    y_test, y_pred, target_names=['Not Fluent', 'Fluent']
)
print("Classification Report:\n", holdout_report)

Classification Report:
               precision    recall  f1-score   support

  Not Fluent       1.00      0.92      0.96      1470
      Fluent       0.19      0.93      0.32        30

    accuracy                           0.92      1500
   macro avg       0.60      0.93      0.64      1500
weighted avg       0.98      0.92      0.95      1500

