In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
# importing the data
cls_df = pd.read_csv('course_lead_scoring.csv')
cls_df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [5]:
# check if missing values are present
cls_df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [6]:
cls_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [7]:
# separating categorical and numerical columns
cat_cols  = list(cls_df.select_dtypes(include=['object', 'category']).columns)

# all num cols except the target col (since target col did not have any null and will be easier for future operations)
num_cols = list(cls_df.select_dtypes(include=np.number).columns)
num_cols.remove('converted')

print(cat_cols, num_cols, sep='\n\n')

['lead_source', 'industry', 'employment_status', 'location']

['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


In [8]:
# filling nulls of cat and num features

for col in cls_df.columns[cls_df.isna().any()]:
    
    # for cat cols
    if col in cat_cols:
        cls_df.fillna('NA', inplace=True)
        
    # filling num cols
    elif col in num_cols:
        cls_df.fillna('0.0',inplace=True)
        
print(cls_df.info())
print()
print(cls_df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1462 non-null   object 
 1   industry                  1462 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1462 non-null   object 
 4   employment_status         1462 non-null   object 
 5   location                  1462 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 102.9+ KB
None

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_scor

In [None]:
# splitting data into train 60%, validation 20%, test 20%
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(cls_df, test_size=0.2, random_state=1)

df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# checking the split count and proportion
print(len(df_train), len(df_val), len(df_test))
print(round(len(df_train)/len(cls_df), 2), round(len(df_val)/len(cls_df), 2), round(len(df_test)/len(cls_df), 2))

876 293 293
0.6 0.2 0.2
