In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Data

In [2]:
# Source URL of the course lead-scoring CSV (same file name as the local copy read below).
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

In [3]:
#!wget $data

In [4]:
# Prefer the local copy when it has already been downloaded; otherwise read
# straight from the URL in `data`. The wget cell above is commented out, so
# without this fallback a fresh "Restart & Run All" fails with FileNotFoundError.
try:
    df = pd.read_csv('course_lead_scoring.csv')
except FileNotFoundError:
    df = pd.read_csv(data)

In [5]:
# Show the first five rows of the loaded frame.
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


# Data preparation

In [6]:
# Number of missing values in each column.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
# Column dtypes: the object columns are the ones listed as categorical below,
# the int64/float64 feature columns are the ones listed as numerical.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [8]:
# Feature columns grouped by kind; `converted` is the target and is left out.
categorical = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [9]:
# Replace missing values: the string 'NA' for categorical columns and 0.0 for
# numerical ones. Both fills are constants, so nothing is estimated from the data.
df[categorical] = df[categorical].fillna('NA')
df[numerical] = df[numerical].fillna(0.0)

In [10]:
# Re-check after filling: every column should now report zero missing values.
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

# Question 1

In [11]:
# Most frequent value of `industry`. This runs after missing values were
# replaced with 'NA', so 'NA' is counted like any other category.
df['industry'].mode()

0    retail
Name: industry, dtype: object

# Question 2

In [12]:
from IPython.display import display

In [13]:
# For every numerical column, show its absolute correlation (rounded to two
# decimals) with each numerical column, itself included.
numeric_frame = df[numerical]
for column in numerical:
    abs_corr = numeric_frame.corrwith(numeric_frame[column]).round(2).abs()
    display(column, 'with', abs_corr)
    print()

'number_of_courses_viewed'

'with'

number_of_courses_viewed    1.00
annual_income               0.01
interaction_count           0.02
lead_score                  0.00
dtype: float64




'annual_income'

'with'

number_of_courses_viewed    0.01
annual_income               1.00
interaction_count           0.03
lead_score                  0.02
dtype: float64




'interaction_count'

'with'

number_of_courses_viewed    0.02
annual_income               0.03
interaction_count           1.00
lead_score                  0.01
dtype: float64




'lead_score'

'with'

number_of_courses_viewed    0.00
annual_income               0.02
interaction_count           0.01
lead_score                  1.00
dtype: float64




# Split the data

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
# Sanity check: the first two sizes should add up to the third.
len(df_full_train), len(df_test), len(df)

(1169, 293, 1462)

In [17]:
# 25% of the remaining 80% is 20% of the whole dataset, so the final
# train/validation/test proportions are 60/20/20.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [18]:
# Sizes of the three final splits.
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [19]:
# Pull the target column out of each split as a NumPy array.
y_train = df_train['converted'].to_numpy()
y_val = df_val['converted'].to_numpy()
y_test = df_test['converted'].to_numpy()

In [20]:
# Remove the target from the feature frames so it cannot be used as an input.
# (df_full_train keeps its `converted` column; it is a separate frame.)
for frame in (df_train, df_val, df_test):
    del frame['converted']

In [21]:
# Replace the shuffled original row labels with a fresh 0..n-1 index in each split.
df_train, df_val, df_test = [
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
]

# Question 3

In [22]:
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression

In [23]:
def mutual_correlation(series):
    """Return the mutual information score between `converted` and `series`.

    The target is read from the global `df_full_train`, so `series` must be
    aligned with that frame's rows.
    """
    target = df_full_train.converted
    return mutual_info_score(target, series)

# NOTE(review): df_full_train is the train + validation portion; confirm that
# scoring on it (rather than on df_train only) is what is intended here.
mi = pd.Series({name: mutual_correlation(df_full_train[name]) for name in categorical})
mi.sort_values(ascending=False)

lead_source          0.025665
employment_status    0.013258
industry             0.011685
location             0.002253
dtype: float64

# Question 4

In [24]:
from sklearn.feature_extraction import DictVectorizer

In [25]:
# Turn every training row into a {column: value} record and fit the vectorizer
# on those records to obtain a dense feature matrix.
dv = DictVectorizer(sparse=False)
dicts_train = df_train[categorical + numerical].to_dict('records')
X_train = dv.fit_transform(dicts_train)

In [26]:
# Configure the classifier first, then fit it on the training matrix.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model = model.fit(X_train, y_train)

In [27]:
# Encode the validation rows with the vectorizer fitted on the training data,
# then keep the second probability column (the one for converted == 1).
dicts_val = df_val[categorical + numerical].to_dict('records')
X_val = dv.transform(dicts_val)
val_probabilities = model.predict_proba(X_val)
y_pred = val_probabilities[:, 1]

In [28]:
# Hard decisions at a 0.5 probability threshold: True means "predicted to convert".
convert_desition = y_pred >= 0.5

In [29]:
# Validation accuracy: share of decisions that match the labels, rounded to two decimals.
round((convert_desition == y_val).mean(),2)

0.7

# Question 5

In [30]:
def validation_accuracy(columns, C=1.0):
    """Train on `columns` of df_train and return accuracy on df_val.

    A fresh DictVectorizer and LogisticRegression are created on every call,
    so the notebook-level `dv`, `model`, `X_train`, `X_val` and `y_pred` are
    left untouched. Predictions are thresholded at 0.5.

    Parameters
    ----------
    columns : list of str
        Feature columns used for both training and validation.
    C : float, default 1.0
        Inverse regularisation strength passed to LogisticRegression.

    Returns
    -------
    float
        Share of validation rows whose thresholded prediction equals y_val.
    """
    vectorizer = DictVectorizer(sparse=False)
    X_tr = vectorizer.fit_transform(df_train[columns].to_dict(orient='records'))
    # Use the same column subset for validation instead of relying on the
    # vectorizer to silently drop features it has not seen.
    X_va = vectorizer.transform(df_val[columns].to_dict(orient='records'))

    clf = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    clf.fit(X_tr, y_train)

    decisions = clf.predict_proba(X_va)[:, 1] >= 0.5
    return (decisions == y_val).mean()


features = categorical + numerical

# Baseline accuracy with every feature, computed here rather than read from a
# variable left behind by an earlier cell, so re-running this cell (or running
# it after a later one) always gives the same differences.
all_features = validation_accuracy(features)

results = {}
for feature in features:
    # Exact-name comparison: `f not in feature` would be a substring test.
    remaining = [f for f in features if f != feature]
    results[f'without {feature}'] = all_features - validation_accuracy(remaining)

results

{'without lead_source': -0.0034129692832765013,
 'without industry': 0.0,
 'without employment_status': 0.0034129692832763903,
 'without location': -0.010238907849829393,
 'without number_of_courses_viewed': 0.14334470989761094,
 'without annual_income': -0.15358361774744034,
 'without interaction_count': 0.14334470989761094,
 'without lead_score': -0.0068259385665528916}

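#### Answer: 'without industry': 0.0 — removing `industry` leaves the validation accuracy unchanged, the smallest difference of all features.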

# Question 6

In [31]:
C = [0.01, 0.1, 1, 10, 100]

# The feature matrices do not depend on the regularisation strength, so they
# are built once here instead of being rebuilt on every loop iteration.
dicts_train = df_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dicts_train)

dicts_val = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(dicts_val)

# Fit one model per C value and report its validation accuracy (3 decimals).
for c in C:

    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42).fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:,1]

    convert_desition = y_pred >= 0.5

    score = round((convert_desition == y_val).mean(),3)

    print(c, score)
    print()

0.01 0.7

0.1 0.7

1 0.7

10 0.7

100 0.7

