In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [24]:
# importing the data
cls_df = pd.read_csv('course_lead_scoring.csv')
cls_df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [25]:
# check if missing values are present
cls_df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [26]:
cls_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [27]:
# separating categorical and numerical columns
cat_cols  = list(cls_df.select_dtypes(include=['object', 'category']).columns)

# all num cols except the target col (since target col did not have any null and will be easier for future operations)
num_cols = list(cls_df.select_dtypes(include=np.number).columns)
num_cols.remove('converted')

print(cat_cols, num_cols, sep='\n\n')

['lead_source', 'industry', 'employment_status', 'location']

['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


In [28]:
cls_df.isnull().any()

lead_source                  True
industry                     True
number_of_courses_viewed    False
annual_income                True
employment_status            True
location                     True
interaction_count           False
lead_score                  False
converted                   False
dtype: bool

In [29]:
# filling nulls of cat and num features

for col in cls_df.columns[cls_df.isna().any()]:
    
    # for cat cols
    if col in cat_cols:
        print('cat', col)
        cls_df[col].fillna('NA', inplace=True)
        
    # filling num cols
    elif col in num_cols:
        print('num', col)
        cls_df[col].fillna(0.0,inplace=True)
        
print(cls_df.info())
print()
print(cls_df.isna().sum())

cat lead_source
cat industry
num annual_income
cat employment_status
cat location
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1462 non-null   object 
 1   industry                  1462 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1462 non-null   float64
 4   employment_status         1462 non-null   object 
 5   location                  1462 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB
None

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status

In [30]:
# splitting data into train 60%, validation 20%, test 20%
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(cls_df, test_size=0.2, random_state=1)

df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# checking the split count and proportion
print(len(df_train), len(df_val), len(df_test))
print(round(len(df_train)/len(cls_df), 2), round(len(df_val)/len(cls_df), 2), round(len(df_test)/len(cls_df), 2))

876 293 293
0.6 0.2 0.2


In [31]:
# reset indices of the train val test split dataframes
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [32]:
# extract target variables from the splits
y_train = df_train['converted'].values
y_val = df_val['converted'].values
y_test = df_test['converted'].values

In [33]:
# drop the target vars from the features
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [34]:
# importing roc auc evaluation metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

In [35]:
# QUESTION 1

auc_num_cols_dict = {}

# looping through num features
for num_col in num_cols:
    auc_num_cols_dict[num_col] = roc_auc_score(y_train, df_train[num_col])

print(auc_num_cols_dict)

{'number_of_courses_viewed': 0.7635680590007088, 'annual_income': 0.5519578313253012, 'interaction_count': 0.738270176293409, 'lead_score': 0.6144993577250176}


In [36]:
# check if any feature's auc is below 0.5
print({feature: auc_score for feature, auc_score in auc_num_cols_dict.items() if auc_score < 0.5})

{}


In [37]:
# since none of the values are below 0.5 auc score,  sorting the features with highest correlation at the top
dict(sorted(auc_num_cols_dict.items(), key=lambda item: item[1], reverse=True))

{'number_of_courses_viewed': 0.7635680590007088,
 'interaction_count': 0.738270176293409,
 'lead_score': 0.6144993577250176,
 'annual_income': 0.5519578313253012}

The numerical variable with highest AUC: number_of_courses_viewed with a score of 0.7635 (approx)

In [38]:
zxcvb = cat_cols + num_cols
cls_df[zxcvb].head()

Unnamed: 0,lead_source,industry,employment_status,location,number_of_courses_viewed,annual_income,interaction_count,lead_score
0,paid_ads,,unemployed,south_america,1,79450.0,4,0.94
1,social_media,retail,employed,south_america,1,46992.0,1,0.8
2,events,healthcare,unemployed,australia,5,78796.0,3,0.69
3,paid_ads,retail,,australia,2,83843.0,1,0.87
4,referral,education,self_employed,europe,3,85012.0,3,0.62


In [39]:
cls_df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [40]:
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,events,manufacturing,2,95543.0,unemployed,europe,3,0.78
1,referral,,1,54924.0,student,south_america,6,0.39
2,organic_search,healthcare,2,77352.0,unemployed,europe,2,0.22
3,paid_ads,other,2,34600.0,employed,south_america,2,0.31
4,paid_ads,education,0,43615.0,unemployed,south_america,2,0.01
...,...,...,...,...,...,...,...,...
871,,other,5,67314.0,,europe,2,0.87
872,events,education,6,63996.0,,australia,4,0.92
873,organic_search,finance,1,73702.0,unemployed,north_america,2,0.55
874,events,technology,1,93341.0,student,middle_east,4,0.99


In [41]:
df_val

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,organic_search,manufacturing,1,0.0,,asia,0,0.73
1,referral,education,2,58777.0,,north_america,0,0.94
2,paid_ads,technology,3,78148.0,employed,middle_east,2,0.80
3,social_media,technology,3,63854.0,employed,africa,1,0.10
4,referral,education,1,69099.0,unemployed,africa,4,0.98
...,...,...,...,...,...,...,...,...
288,paid_ads,finance,2,41947.0,self_employed,europe,4,0.10
289,paid_ads,other,2,0.0,self_employed,north_america,3,0.71
290,referral,,4,92215.0,employed,australia,4,0.57
291,events,other,2,53087.0,employed,europe,2,0.52


In [42]:
# QUESTION 2
# one hot encoding and vectorizing features
from sklearn.feature_extraction import DictVectorizer

# instantiate dict vectorizer
dv = DictVectorizer(sparse=False)

# extracting and vectorizing training features 
X_train = dv.fit_transform(df_train.to_dict(orient='records'))

# extracting and vectorizing validation features
X_val = dv.transform(df_val[cat_cols + num_cols].to_dict(orient='records'))

In [43]:
# fitting the data into logreg model
from sklearn.linear_model import LogisticRegression

# initializing a log reg model with given hyperparams
logreg_q2 = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)

# fitting the training data
logreg_q2.fit(X_train, y_train)

In [44]:
# making predictions on validation set
y_pred_q2 = logreg_q2.predict(X_val)

# auc of the validation set
print('AUC ROC score of logistic regression model: ', round(roc_auc_score(y_val, y_pred_q2), 3))

AUC ROC score of logistic regression model:  0.648


The AUC score of LogisticRegression model with C = 1.0 and max_iter=1000 is: 0.648