In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the lead-scoring dataset. The path is relative, so the CSV must sit in
# the notebook's working directory.
cls_df = pd.read_csv('course_lead_scoring.csv')
# Preview the first rows as the cell's displayed output.
cls_df.head()

In [None]:
# Count the missing values in each column (columns with 0 have no nulls).
cls_df.isna().sum()

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
cls_df.info()

In [None]:
# Build the two feature-name lists used by the rest of the notebook.

# Categorical features: every object/category typed column.
categorical_frame = cls_df.select_dtypes(include=['object', 'category'])
cat_cols = categorical_frame.columns.tolist()

# Numerical features: every numeric column, minus the target 'converted',
# which is handled separately from the features later on.
numeric_frame = cls_df.select_dtypes(include=np.number)
num_cols = numeric_frame.columns.tolist()
num_cols.remove('converted')

print(cat_cols, num_cols, sep='\n\n')

In [None]:
# Boolean view of the same check: True for each column that contains at least
# one missing value.
cls_df.isnull().any()

In [None]:
# Fill missing values: categorical features get the placeholder 'NA',
# numerical features get 0.0.
#
# The filled Series is assigned back to the column rather than calling
# `cls_df[col].fillna(..., inplace=True)`. That form mutates an intermediate
# Series (chained assignment): it raises a FutureWarning on pandas 2.x and
# leaves `cls_df` unchanged once Copy-on-Write is active.

for col in cls_df.columns[cls_df.isna().any()]:

    # for cat cols
    if col in cat_cols:
        print('cat', col)
        cls_df[col] = cls_df[col].fillna('NA')

    # filling num cols
    elif col in num_cols:
        print('num', col)
        cls_df[col] = cls_df[col].fillna(0.0)

    # Columns that appear in neither list are left untouched by this loop.

# info() prints its own report and returns None, so it is called directly
# instead of being wrapped in print() (which would emit a stray "None").
cls_df.info()
print()
print(cls_df.isna().sum())

In [None]:
# 60% / 20% / 20% train / validation / test split, done in two steps.
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all rows as the test set.
df_full_train, df_test = train_test_split(
    cls_df,
    test_size=0.2,
    random_state=1,
)

# Step 2: take 25% of the remaining 80% (i.e. 20% of the total) as validation.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

# Sanity check: row counts first, then each split's share of the full dataset.
split_sizes = [len(df_train), len(df_val), len(df_test)]
print(*split_sizes)
print(*(round(size / len(cls_df), 2) for size in split_sizes))

In [None]:
# Give every split a fresh 0..n-1 index; the shuffled original index is
# discarded (drop=True) rather than kept as a column.
df_full_train, df_train, df_val, df_test = [
    split_df.reset_index(drop=True)
    for split_df in (df_full_train, df_train, df_val, df_test)
]

In [None]:
# Pull the 'converted' target out of each split as a plain array.
y_train, y_val, y_test = [
    split_df['converted'].values
    for split_df in (df_train, df_val, df_test)
]

In [None]:
# Remove the target column from each feature frame (in place) so it cannot be
# used as a feature.
for split_df in (df_train, df_val, df_test):
    del split_df['converted']

In [None]:
# importing roc auc evaluation metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

In [None]:
# PROBLEM 1
# Score each numerical feature on its own: the raw feature values are passed
# directly as the ranking score against the training target.

auc_num_cols_dict = {}

# looping through num features
for num_col in num_cols:
    auc_num_cols_dict[num_col] = roc_auc_score(y_train, df_train[num_col])

# Mapping of feature name -> ROC AUC on the training split.
print(auc_num_cols_dict)

In [None]:
# Collect the features whose AUC falls below 0.5; an empty dict means none do.
below_half = {
    name: score
    for name, score in auc_num_cols_dict.items()
    if score < 0.5
}
print(below_half)

In [None]:
# Rank the numerical features by AUC, highest first, and display the result
# as a dict (insertion order keeps the ranking).
ranked_pairs = sorted(
    auc_num_cols_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
dict(ranked_pairs)

The numerical variable with the highest AUC is `number_of_courses_viewed`, with a score of approximately 0.7635.