In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    f1_score,
    make_scorer
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Single place to point the notebook at the raw CSV exports. The default is
# the original author's local folder; change it here instead of editing
# every read_csv call.
DATA_DIR = Path(r"C:\Users\Семён\Desktop\test task - data scientist")

enrichment = pd.read_csv(DATA_DIR / "infutor_enrichment_dataset.csv")
zipcode = pd.read_csv(DATA_DIR / "zip_code_dataset.csv")
leads = pd.read_csv(DATA_DIR / "leads_dataset.csv")

# Left joins keep every lead, even when no enrichment or zip-code row matches.
df = leads.merge(enrichment, how='left', on='HASHED_PHONE_NUMBER')
df = df.merge(zipcode, how='left', on='ZIP_CODE')

# Composite key "phone hash;outcome;creation timestamp". Rows that agree on
# all three fields are treated as the same lead and only the first is kept.
df['id'] = df['HASHED_PHONE_NUMBER'].astype(str) + ';' + \
           df['IS_APPOINTMENT_SET'].astype(str) + ';' + \
           df['LEAD_CREATED_AT_UTC'].astype(str)
df = df.drop_duplicates(subset=['id'])

# Untouched snapshot of the merged, de-duplicated frame; later cells modify `df`.
reserve = df.copy()

# Kept as the last expression so the notebook actually displays it
# (a bare expression in the middle of a cell is evaluated and thrown away).
df.shape

In [None]:
# Show every column label of the merged frame on one comma-separated line.
column_listing = ", ".join(df.columns)
print(column_listing)

In [None]:
def analyze_column(df, col):
    """Print a short profile of one column of ``df``.

    The report contains, in order: a 60-character ``=`` rule, the column
    label, its dtype, the number of missing values, the number of distinct
    non-missing values, the array of distinct non-missing values, and the
    five most frequent non-missing values with their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    col : hashable
        Label of the column to profile.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    series = df[col]
    non_missing = series.dropna()

    summary_lines = [
        "=" * 60,
        f"Column: {col}",
        f"Dtype: {series.dtype}",
        f"Missing values: {series.isnull().sum()}",
        f"Unique values: {series.nunique(dropna=True)}",
        f"All unique values: {non_missing.unique()}",
        "Top-5 most frequent values:",
    ]
    print("\n".join(summary_lines))

    # value_counts is already sorted by descending frequency.
    top_five = series.value_counts(dropna=True).head(5)
    for val, count in top_five.items():
        print(f"  {val!r}: {count}")

# Profile every column of the merged frame, one report after another.
for column_label in df.columns:
    analyze_column(df, column_label)

In [None]:
# Mask rare categories: in each listed column, every value whose share among
# the column's NON-MISSING entries is below `threshold` (0.1 = 10%) is
# replaced with NaN. The aim is to keep sparsely populated levels from
# producing perfect separation in the models below.
# NOTE(review): the shares are computed on the full frame, i.e. before the
# train/test split done later in the notebook.

target_columns = ['STATE', 'EMAIL_DOMAIN', 'OPERATINGSYSTEMCLASS','OPERATINGSYSTEMNAME',
                  'AGENTLANGUAGECODE', 'AGENTNAME','AGENTVERSIONMAJOR', 'DEVICEBRAND',
                 'DEVICEFIRMWAREVERSION', 'DEVICENAME' ,'DEVICEVERSION',
'FACEBOOKDEVICECLASS','LAYOUTENGINENAME','NETWORKTYPE',
'OPERATINGSYSTEMVERSIONMAJOR','WEBVIEWAPPNAME','MATCHLEVEL']
threshold = 0.1

for col in target_columns:
    # normalize=True yields each value's share; NaN is excluded by default.
    value_counts = df[col].value_counts(normalize=True)
    frequent_values = value_counts[value_counts >= threshold].index
    # Vectorised membership test: keep frequent values, turn everything else
    # (including entries that were already missing) into NaN.
    df[col] = df[col].where(df[col].isin(frequent_values))

# Logistic Regression

In [None]:
# Categorical predictors used by the logistic-regression experiment.
categorical_cols = [
    'STATE', 'EMAIL_DOMAIN', 'OPERATINGSYSTEMCLASS', 'OPERATINGSYSTEMNAME',
    'AGENTLANGUAGECODE', 'AGENTNAME', 'AGENTVERSIONMAJOR', 'DEVICEBRAND',
    'DEVICEFIRMWAREVERSION', 'DEVICENAME', 'DEVICEVERSION',
    'FACEBOOKDEVICECLASS', 'LAYOUTENGINENAME', 'NETWORKTYPE',
    'OPERATINGSYSTEMVERSIONMAJOR', 'WEBVIEWAPPNAME', 'MATCHLEVEL',
]
# No numeric predictors are used in this experiment.
numerical_cols = []
# Target first, then the predictors.
use_cols = ['IS_APPOINTMENT_SET'] + categorical_cols

# Keep only leads with a recorded outcome and store the outcome as an integer.
df_clean = df.dropna(subset=['IS_APPOINTMENT_SET']).copy()
df_clean['IS_APPOINTMENT_SET'] = df_clean['IS_APPOINTMENT_SET'].astype(int)

# The target has no missing values and is already an integer at this point,
# so no further filtering or casting is needed before splitting X and y.
df_small = df_clean[use_cols].copy()
print(df_small.columns)

y = df_small['IS_APPOINTMENT_SET']
X = df_small.drop(columns=['IS_APPOINTMENT_SET'])

print(categorical_cols)
print(numerical_cols)

# Numeric branch: median imputation followed by standardisation. It is wired
# in for completeness; while `numerical_cols` is empty it receives no columns.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: sparse one-hot encoding with no imputation step in front
# of it. handle_unknown='ignore' keeps transform from failing on categories
# that were not present when the encoder was fitted.
cat_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True, dtype=int))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols)
    ]
)

# `multi_class` is deliberately not passed: the parameter is deprecated since
# scikit-learn 1.5 and scheduled for removal, and the target here appears to
# be binary (0/1), for which the default is an ordinary binary logistic
# regression.
# NOTE(review): on scikit-learn < 1.5 the old multi_class='multinomial'
# setting used a softmax formulation even for two classes, so coefficients may
# differ slightly from earlier runs — confirm this is acceptable.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        class_weight='balanced',
        max_iter=100,
        fit_intercept=True,
        solver='lbfgs',
        random_state=42
    ))
])


# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit on the training part, then score the hold-out part with `predict`.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

default_report = classification_report(y_test, y_pred)
default_confusion = confusion_matrix(y_test, y_pred)

print("Classification Report:\n")
print(default_report)

print("Confusion Matrix:\n")
print(default_confusion)


# Hold-out probability of the positive class (second column of predict_proba).
y_proba = model.predict_proba(X_test)[:, 1]

# Precision / recall for every candidate cut-off and the F1 derived from them.
# The 1e-6 in the denominator keeps 0/0 from producing NaN when precision and
# recall are both zero.
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)

# Cut-off with the highest F1.
# NOTE(review): the cut-off is selected and then evaluated on the same
# hold-out set, so the report below is optimistic; a separate validation
# split would give an unbiased estimate.
best_idx = f1_scores.argmax()
best_threshold, best_f1 = thresholds[best_idx], f1_scores[best_idx]
print(f"Threshold for Best F1-score: {best_threshold:.3f}")
print(f"Best F1-score: {best_f1:.3f}")

# Re-label the hold-out set with the selected cut-off and report again.
y_pred_optimal = (y_proba >= best_threshold).astype(int)
optimal_report = classification_report(y_test, y_pred_optimal)
optimal_confusion = confusion_matrix(y_test, y_pred_optimal)
print("\n📋 Classification Report (с оптимальным порогом):")
print(optimal_report)
print("📊 Confusion Matrix:")
print(optimal_confusion)


# precision_recall_curve returns one more precision/recall value than
# thresholds: element i belongs to thresholds[i], and the final element
# (precision 1, recall 0) has no threshold. Dropping the LAST point therefore
# lines every curve up with its threshold; dropping the first one would shift
# all three curves by one threshold.
plt.figure(figsize=(10, 6))
plt.plot(thresholds, precision[:-1], label='Precision', color='blue')
plt.plot(thresholds, recall[:-1], label='Recall', color='green')
plt.plot(thresholds, f1_scores[:-1], label='F1-score', color='red')
plt.axvline(x=best_threshold, color='black', linestyle='--', label=f'Best threshold = {best_threshold:.2f}')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Logistic regression: precision, recall and F1 by threshold')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:

# Logistic regression behind the shared preprocessing step; its
# hyper-parameters are filled in by the grid below.
# NOTE(review): unlike the first pipeline, no class_weight='balanced' is set
# here — confirm that is intended.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42)),
])

# Full factorial grid: 3 regularisation strengths x 3 solvers x 3 iteration caps.
param_grid = dict(
    classifier__C=[0.1, 1.0, 10.0],
    classifier__solver=['lbfgs', 'newton-cg', 'saga'],
    classifier__max_iter=[100, 300, 1000],
)

# Macro-averaged F1: both classes weigh equally regardless of their size.
scorer = make_scorer(f1_score, average='macro')

search = GridSearchCV(
    model,
    param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

search.fit(X_train, y_train)

print("Best params:")
print(search.best_params_)
print("Best F1-score:")
print(search.best_score_)


In [None]:
# Width of the design matrix after one-hot encoding.
# NOTE(review): `model` (the pipeline from the grid-search cell) is never fit
# directly in this notebook — only `search.fit` is called on it. This
# presumably works because the shared `preprocessor` instance was already
# fitted by the first logistic-regression pipeline; confirm before reordering
# cells.
fitted_preprocessor = model.named_steps['preprocessor']
X_train_transformed = fitted_preprocessor.transform(X_train)
n_encoded_columns = X_train_transformed.shape[1]
print(f"Unique variables: {n_encoded_columns}")

# LightGBM

In [None]:
# Categorical predictors handed to LightGBM.
categorical_cols = [
    'STATE', 'EMAIL_DOMAIN', 'OPERATINGSYSTEMCLASS', 'OPERATINGSYSTEMNAME',
    'AGENTLANGUAGECODE', 'AGENTNAME', 'AGENTVERSIONMAJOR', 'DEVICEBRAND',
    'DEVICEFIRMWAREVERSION', 'DEVICENAME', 'DEVICEVERSION',
    'FACEBOOKDEVICECLASS', 'LAYOUTENGINENAME', 'NETWORKTYPE',
    'OPERATINGSYSTEMVERSIONMAJOR', 'WEBVIEWAPPNAME', 'MATCHLEVEL',
]
# Target first, then the predictors.
use_cols = ['IS_APPOINTMENT_SET', *categorical_cols]

# Leads with a recorded outcome, with the outcome stored as an integer.
df_clean = (
    df.dropna(subset=['IS_APPOINTMENT_SET'])
      .assign(IS_APPOINTMENT_SET=lambda frame: frame['IS_APPOINTMENT_SET'].astype(int))
)
df_small = df_clean[use_cols].copy()

# Split the modelling frame into target and predictors.
y = df_small['IS_APPOINTMENT_SET']
X = df_small.drop(columns=['IS_APPOINTMENT_SET'])


# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42)

# Cast through DataFrame.astype, which returns new frames, instead of
# assigning column by column into the split outputs (that pattern can raise
# pandas' SettingWithCopyWarning).
X_train = X_train.astype({col: 'category' for col in categorical_cols})
# Reuse the training dtypes so every test column carries exactly the training
# categories; a value that never occurs in training becomes NaN rather than
# forming its own, unseen category.
X_test = X_test.astype({col: X_train[col].dtype for col in categorical_cols})

# Gradient-boosted trees with class re-weighting and a fixed seed.
model = LGBMClassifier(
    class_weight='balanced',
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1
)

model.fit(
    X_train,
    y_train,
    categorical_feature=categorical_cols
)

# Hold-out probability of the positive class and the labels from `predict`.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred_default = model.predict(X_test)

default_report = classification_report(y_test, y_pred_default)
default_confusion = confusion_matrix(y_test, y_pred_default)
print("Using default threshold (0.5):")
print(default_report)
print("Confusion Matrix:", default_confusion)

# Precision / recall for every candidate cut-off and the F1 derived from them.
# The 1e-6 in the denominator keeps 0/0 from producing NaN when precision and
# recall are both zero.
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)

# Cut-off with the highest F1.
# NOTE(review): the cut-off is selected and then evaluated on the same
# hold-out set, so the second report is optimistic.
best_idx = f1_scores.argmax()
best_threshold, best_f1 = thresholds[best_idx], f1_scores[best_idx]
print(f"Best threshold for F1-score: {best_threshold:.3f}")
print(f"Best F1-score: {best_f1:.3f}")

# Re-label the hold-out set with the selected cut-off and report again.
y_pred_optimal = (y_proba >= best_threshold).astype(int)
optimal_report = classification_report(y_test, y_pred_optimal)
optimal_confusion = confusion_matrix(y_test, y_pred_optimal)

print("Classification Report (best F1-Score):")
print(optimal_report)
print("Confusion Matrix:")
print(optimal_confusion)

# precision_recall_curve returns one more precision/recall value than
# thresholds: element i belongs to thresholds[i], and the final element
# (precision 1, recall 0) has no threshold. Dropping the LAST point therefore
# lines every curve up with its threshold; dropping the first one would shift
# all three curves by one threshold.
plt.figure(figsize=(10, 6))
plt.plot(thresholds, precision[:-1], label='Precision')
plt.plot(thresholds, recall[:-1], label='Recall')
plt.plot(thresholds, f1_scores[:-1], label='F1-score')
plt.axvline(x=best_threshold, color='black', linestyle='--', label=f'Best threshold = {best_threshold:.2f}')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('LightGBM: precision, recall and F1 by threshold')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# F1 with f1_score's default settings is the selection metric.
f1_scorer = make_scorer(f1_score)

# 2 x 3 x 3 x 3 = 54 candidate settings.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.05, 0.1],
    num_leaves=[31, 50, 100],
    max_depth=[-1, 10, 20],
)

# NOTE(review): both the estimator and GridSearchCV request n_jobs=-1; this
# nested parallelism may oversubscribe CPU cores — consider n_jobs=1 on the
# estimator.
model = LGBMClassifier(class_weight='balanced', random_state=42, n_jobs=-1)

# Shuffled, seeded, stratified 5-fold split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    model,
    param_grid,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=3,
)
# The extra keyword is forwarded to the estimator's fit for every candidate.
grid_search.fit(X_train, y_train, categorical_feature=categorical_cols)

print(f"Best F1-score: {grid_search.best_score_:.4f}")
print("Best params:")
for param_name, param_value in grid_search.best_params_.items():
    print(f"  {param_name}: {param_value}")


In [None]:
# from sklearn.model_selection import cross_val_score

# scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# print("Accuracy per fold:", scores)
# print("Mean accuracy:", scores.mean())

# Analysis column by column

In [None]:
# Columns inspected one by one below, kept in their original order: the
# target with a few lead/device fields, then the core-demographic,
# supplemental-demographic and Connex-segmentation attributes.
use_cols = (
    [
        'IS_APPOINTMENT_SET',
        'STATE',
        'ZIP_CODE',
        'ATTRIBUTES_COREDEMOGRAPHICS_GENDER',
        'ATTRIBUTES_COREDEMOGRAPHICS_HOMEOWNERCD',
        'OPERATINGSYSTEMCLASS',
        'DEVICEBRAND',
        'NETWORKTYPE',
        'EMAIL_DOMAIN',
    ]
    + [
        'ATTRIBUTES_COREDEMOGRAPHICS_DOB',
        'ATTRIBUTES_COREDEMOGRAPHICS_MARRIEDCD',
        'ATTRIBUTES_COREDEMOGRAPHICS_WEALTHSCR',
        'ATTRIBUTES_COREDEMOGRAPHICS_EHI',
    ]
    + [
        'ATTRIBUTES_SUPPLEMENTALDEMOGRAPHICS_FIREPLCD',
        'ATTRIBUTES_SUPPLEMENTALDEMOGRAPHICS_POOL',
        'ATTRIBUTES_SUPPLEMENTALDEMOGRAPHICS_CREDITCARD',
        'ATTRIBUTES_SUPPLEMENTALDEMOGRAPHICS_HHNBRSR',
        'ATTRIBUTES_SUPPLEMENTALDEMOGRAPHICS_YRBLD',
        'ATTRIBUTES_SUPPLEMENTALDEMOGRAPHICS_LOR',
        'ATTRIBUTES_SUPPLEMENTALDEMOGRAPHICS_HHNBR',
        'ATTRIBUTES_SUPPLEMENTALDEMOGRAPHICS_CENS_POP_DENSITY',
    ]
    + [
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_HOMEIMPROVE12_ANY',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_HOMEREMODEL12_ANY',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_SOCIALUSAGE30_FB',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_SOCIALUSAGE30_INSTA',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_SOCIALUSAGE30_LNKIN',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_SOCIALUSAGE30_PINT',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_SOCIALUSAGE30_TWITTER',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_SOCIALUSAGE30_YOUTUBE',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_STRMSUB_HULU',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_STRMSUB_NETFLIX',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_ONLINESHOPSEG_DEALSEEK',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_ONLINESHOPSEG_OFFLINE',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_ONLINESHOPSEG_QUALSEEK',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_ONLINESHOPSEG_STRAITFWD',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_ONLINESHOPSEG_TRAD',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_MEDIA_HEAVYUSAGE_INTERNET',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_MEDIA_HEAVYUSAGE_MAGAZINE',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_MEDIA_HEAVYUSAGE_NEWSPAPER',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_MEDIA_HEAVYUSAGE_RADIO',
        'ATTRIBUTES_CONNEXSEGMENTATION_CT_MEDIA_HEAVYUSAGE_TV',
    ]
)

def show(col):
    """Print the distinct values, NaN count and value frequencies of ``df[col]``.

    Reads the notebook-level ``df``; ``col`` is the column label to inspect.
    Returns nothing — all output goes to standard output.
    """
    column = df[col]
    print(f"Column name: {col}")
    print(column.unique())
    print(f"NaNs: {column.isna().sum()}")
    print(column.value_counts())
# Inspect each selected column in turn.
for column_name in use_cols:
    show(column_name)

In [None]:
# Per-column fill rate: percentage of non-missing cells, rounded to 2 decimals.
fill_percentage = (df.notna().sum() / len(df) * 100).round(2)

# Per-column number of distinct non-missing values.
unique_counts = df.nunique(dropna=True)

# One row per column of `df`.
summary_df = pd.DataFrame({
    'column': fill_percentage.index,
    'fill_%': fill_percentage.to_numpy(),
    'n_unique': unique_counts.to_numpy(),
})
summary_df

# USERS WITH MORE THAN 1 ITERATION

In [None]:
# Leads that have an outcome recorded.
df1 = df[df['IS_APPOINTMENT_SET'].notna()]

# Phone hashes that occur on more than one labelled lead.
duplicate_counts = df1['HASHED_PHONE_NUMBER'].value_counts()
duplicated_numbers = duplicate_counts[duplicate_counts > 1].index

# .copy() makes this an independent frame, so the column assignment below
# writes to it unambiguously instead of to a slice of `df1`
# (avoids pandas' SettingWithCopyWarning).
df_duplicates_only = df1[df1['HASHED_PHONE_NUMBER'].isin(duplicated_numbers)].copy()
df_duplicates_only['IS_APPOINTMENT_SET'] = df_duplicates_only['IS_APPOINTMENT_SET'].astype(int)

# Ten phone hashes with the highest number of appointments set. It is printed
# explicitly because only the last expression of a cell is displayed; as a
# bare mid-cell expression this table was computed and silently discarded.
top_appointment_counts = (
    df_duplicates_only
    .groupby('HASHED_PHONE_NUMBER')['IS_APPOINTMENT_SET']
    .sum()
    .sort_values(ascending=False)
    .reset_index(drop=False)
    .head(10)
)
print(top_appointment_counts)

df_duplicates_only.shape

In [None]:
# Work on a separately named copy so the notebook-wide `df` is not rebound to
# the repeat-lead subset; rebinding it would make every earlier cell behave
# differently when re-run after this one.
df_repeat_leads = df_duplicates_only.copy()
df_repeat_leads['LEAD_CREATED_AT_UTC'] = pd.to_datetime(df_repeat_leads['LEAD_CREATED_AT_UTC'])

# Chronological order within each phone hash.
df_sorted = df_repeat_leads.sort_values(by=['HASHED_PHONE_NUMBER', 'LEAD_CREATED_AT_UTC'])
def assign_status_for_group(series):
    """Summarise an ordered sequence of outcomes as a single status label.

    Parameters
    ----------
    series : pandas.Series
        Outcomes of one group, in the order they should be read.

    Returns
    -------
    str
        ``'1->0'`` or ``'0->1'`` when the first and last outcomes are 1/0 or
        0/1 respectively; ``'all_<value>'`` when every outcome is the same;
        ``'other'`` in every remaining case (first and last agree but the
        values in between do not, or the endpoints are not a 0/1 pair).
    """
    first_val, last_val = series.iloc[0], series.iloc[-1]

    # Endpoints differ: only the two 0/1 transitions get their own label.
    if first_val != last_val:
        endpoint_labels = {(1, 0): '1->0', (0, 1): '0->1'}
        return endpoint_labels.get((first_val, last_val), 'other')

    # Endpoints agree: either the whole sequence is constant or it wandered.
    if len(set(series)) > 1:
        return 'other'
    return f'all_{first_val}'
# One status label per phone hash, written onto every lead row of that hash.
status_per_row = (
    df_sorted
    .groupby('HASHED_PHONE_NUMBER')['IS_APPOINTMENT_SET']
    .transform(assign_status_for_group)
)
df_sorted['STATUS'] = status_per_row

# NOTE(review): these counts are per lead row, not per phone hash.
df_sorted['STATUS'].value_counts()

In [6]:
"""
NEXT STEPS
Option A:
    Markov's equations
Option B:
    Cox model
"""

"\nNEXT STEPS\nOption A:\n    Markov's equations\nOption B:\n    Cox model\n"