In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [61]:
link = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

In [62]:
df = pd.read_csv(link)

In [63]:
df.shape

(1462, 9)

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [65]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [66]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [67]:
categorical_features = list(df.dtypes[df.dtypes == 'object'].index)
categorical_features

['lead_source', 'industry', 'employment_status', 'location']

In [68]:
for col in categorical_features:
    print(col)
    print(df[col].unique())
    print(df[col].nunique())
    print()

lead_source
['paid_ads' 'social_media' 'events' 'referral' 'organic_search' nan]
5

industry
[nan 'retail' 'healthcare' 'education' 'manufacturing' 'technology'
 'other' 'finance']
7

employment_status
['unemployed' 'employed' nan 'self_employed' 'student']
4

location
['south_america' 'australia' 'europe' 'africa' 'middle_east' nan
 'north_america' 'asia']
7



In [69]:
df[categorical_features]=df[categorical_features].fillna('NA')

In [70]:
df.annual_income = df.annual_income.fillna(0.0)

In [71]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [72]:
df.industry.mode()

0    retail
Name: industry, dtype: object

In [73]:
df.industry.value_counts().sort_values(ascending=False)

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

#### Sol 1: The most frequent observation (mode) for the column industry is RETAIL with 203 observations

In [74]:
list(df.dtypes[df.dtypes != object].index)

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [75]:
numeric_features = ['number_of_courses_viewed','annual_income','interaction_count','lead_score']

In [90]:
Corr_Matrix = round(df[numeric_features].corr(),4)
Corr_Matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.0098,-0.0236,-0.0049
annual_income,0.0098,1.0,0.027,0.0156
interaction_count,-0.0236,0.027,1.0,0.0099
lead_score,-0.0049,0.0156,0.0099,1.0


#### Sol 2: Annual_income and Interaction_count has the biggest correlation with coefficient value as 0.0270	

In [97]:
from sklearn.model_selection import train_test_split


In [98]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [99]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [100]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [101]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [102]:
from sklearn.metrics import mutual_info_score

In [105]:
print(categorical_features)
print()
print(numeric_features)

['lead_source', 'industry', 'employment_status', 'location']

['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


In [None]:
for col in categorical_features:
    print(col)
    score = mutual_info_score(df_full_train.converted, df_full_train[col])
    print(round(score,2))
    print()

lead_source
0.03

industry
0.01

employment_status
0.01

location
0.0

