In [54]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [55]:
get_ipython().run_line_magic('matplotlib', 'inline')

In [56]:
df = pd.read_csv('Leads.csv')

In [57]:
len(df)

9240

In [58]:
df.head().T

Unnamed: 0,0,1,2,3,4
Prospect ID,7927b2df-8bba-4d29-b9a2-b6e0beafe620,2a272436-5132-4136-86fa-dcc88c88f482,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,3256f628-e534-4826-9d63-4a8b88782852
Lead Number,660737,660728,660727,660719,660681
Lead Origin,API,API,Landing Page Submission,Landing Page Submission,Landing Page Submission
Lead Source,Olark Chat,Organic Search,Direct Traffic,Direct Traffic,Google
Do Not Email,No,No,No,No,No
Do Not Call,No,No,No,No,No
Converted,0,0,1,0,1
TotalVisits,0.0,5.0,2.0,1.0,2.0
Total Time Spent on Website,0,674,1532,305,1428
Page Views Per Visit,0.0,2.5,2.0,1.0,1.0


In [59]:
df.dtypes

Prospect ID                                       object
Lead Number                                        int64
Lead Origin                                       object
Lead Source                                       object
Do Not Email                                      object
Do Not Call                                       object
Converted                                          int64
TotalVisits                                      float64
Total Time Spent on Website                        int64
Page Views Per Visit                             float64
Last Activity                                     object
Country                                           object
Specialization                                    object
How did you hear about X Education                object
What is your current occupation                   object
What matters most to you in choosing a course     object
Search                                            object
Magazine                       

In [60]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
 
string_columns = list(df.dtypes[df.dtypes == 'object'].index)
 
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [61]:
df.converted = (df.converted == 1).astype(int)

In [62]:
df.converted.head()

0    0
1    0
2    1
3    0
4    1
Name: converted, dtype: int32

In [63]:
(df.converted == 1).head()

0    False
1    False
2     True
3    False
4     True
Name: converted, dtype: bool

In [64]:
from sklearn.model_selection import train_test_split

In [65]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [66]:
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)
 
y_train = df_train.converted.values
y_val = df_val.converted.values
 
del df_train['converted']
del df_val['converted']

In [67]:
df_train_full.isnull().sum()

prospect_id                                         0
lead_number                                         0
lead_origin                                         0
lead_source                                        26
do_not_email                                        0
do_not_call                                         0
converted                                           0
totalvisits                                       108
total_time_spent_on_website                         0
page_views_per_visit                              108
last_activity                                      82
country                                          1971
specialization                                   1136
how_did_you_hear_about_x_education               1748
what_is_your_current_occupation                  2135
what_matters_most_to_you_in_choosing_a_course    2153
search                                              0
magazine                                            0
newspaper_article           

In [68]:
df_train_full.converted.value_counts()

0    4560
1    2832
Name: converted, dtype: int64

In [69]:
global_mean = df_train_full.converted.mean()

In [70]:
round(global_mean, 3)

0.383

In [80]:
categorical = ['prospect_id', 'lead_origin', 'lead_source', 'do_not_email', 'do_not_call', 'last_activity', 'country', 'specialization', 'how_did_you_hear_about_x_education', 'what_is_your_current_occupation', 'what_matters_most_to_you_in_choosing_a_course', 'search', 'magazine', 'newspaper_article', 'x_education_forums', 'newspaper', 'digital_advertisement', 'through_recommendations', 'receive_more_updates_about_our_courses', 'tags', 'lead_quality', 'update_me_on_supply_chain_content', 'get_updates_on_dm_content', 'lead_profile', 'city', 'asymmetrique_activity_index', 'asymmetrique_profile_index', 'i_agree_to_pay_the_amount_through_cheque', 'a_free_copy_of_mastering_the_interview', 'last_notable_activity']
numerical = ['lead_number', 'converted', 'totalvisits', 'total_time_spent_on_website', 'page_views_per_visit', 'asymmetrique_activity_score', 'asymmetrique_profile_score']

In [92]:
df_train_full[categorical].nunique()

prospect_id                                      7392
lead_origin                                         5
lead_source                                        19
do_not_email                                        2
do_not_call                                         2
last_activity                                      16
country                                            36
specialization                                     19
how_did_you_hear_about_x_education                 10
what_is_your_current_occupation                     6
what_matters_most_to_you_in_choosing_a_course       2
search                                              2
magazine                                            1
newspaper_article                                   2
x_education_forums                                  1
newspaper                                           2
digital_advertisement                               2
through_recommendations                             2
receive_more_updates_about_o

In [73]:
df_train_full.what_is_your_current_occupation.value_counts()

unemployed              4503
working_professional     560
student                  167
other                     13
housewife                  8
businessman                6
Name: what_is_your_current_occupation, dtype: int64

In [74]:
occupation_unemployed = df_train_full[df_train_full.what_is_your_current_occupation == 'unemployed'].converted.mean()
occupation_working_professional = df_train_full[df_train_full.what_is_your_current_occupation == 'working_professional'].converted.mean()
occupation_student = df_train_full[df_train_full.what_is_your_current_occupation == 'student'].converted.mean()
occupation_other = df_train_full[df_train_full.what_is_your_current_occupation == 'other'].converted.mean()
occupation_housewife = df_train_full[df_train_full.what_is_your_current_occupation == 'housewife'].converted.mean()
occupation_businessman = df_train_full[df_train_full.what_is_your_current_occupation == 'businessman'].converted.mean()

In [75]:
print('what_is_your_current_occupation == unemployed :', round(occupation_unemployed, 3))
print('what_is_your_current_occupation == working_professional :', round(occupation_working_professional, 3))
print('what_is_your_current_occupation == student :', round(occupation_student, 3))
print('what_is_your_current_occupation == other :', round(occupation_other, 3))
print('what_is_your_current_occupation == housewife :', round(occupation_housewife, 3))
print('what_is_your_current_occupation == businessman :', round(occupation_businessman, 3))

what_is_your_current_occupation == unemployed : 0.429
what_is_your_current_occupation == working_professional : 0.916
what_is_your_current_occupation == student : 0.359
what_is_your_current_occupation == other : 0.615
what_is_your_current_occupation == housewife : 1.0
what_is_your_current_occupation == businessman : 0.5


In [76]:
global_mean = df_train_full.converted.mean()
 
df_group = df_train_full.groupby(by='what_is_your_current_occupation').converted.agg(['mean'])
df_group['diff'] = df_group['mean'] - global_mean
df_group['risk'] = df_group['mean'] / global_mean
 
df_group

Unnamed: 0_level_0,mean,diff,risk
what_is_your_current_occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
businessman,0.5,0.116883,1.305085
housewife,1.0,0.616883,2.610169
other,0.615385,0.232268,1.606258
student,0.359281,-0.023835,0.937785
unemployed,0.429491,0.046375,1.121045
working_professional,0.916071,0.532955,2.391102


In [77]:
from IPython.display import display 
 
for col in categorical:
    df_group = df_train_full.groupby(by=col).converted.agg(['mean'])
    df_group['diff'] = df_group['mean'] - global_mean
    df_group['rate'] = df_group['mean'] / global_mean
    display(df_group)

Unnamed: 0_level_0,mean,diff,rate
prospect_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000104b9-23e4-4ddc-8caa-8629fe8ad7f4,1.0,0.616883,2.610169
0006d10a-eb01-4ba9-92e2-ad78588b2a40,0.0,-0.383117,0.000000
0011be30-fa97-465b-8e44-0ae83dff7eed,0.0,-0.383117,0.000000
0011f23e-9fd9-4256-b316-efc2e2639b0d,0.0,-0.383117,0.000000
001e6e14-2183-47ab-a405-108e44bc2e66,1.0,0.616883,2.610169
...,...,...,...
ffd99338-2e6b-4c3f-8650-68b94ea5e07f,0.0,-0.383117,0.000000
ffec8e24-0c99-4345-89f1-e3ad6689764f,1.0,0.616883,2.610169
fff076a3-fe95-4c79-9401-e15846be8086,0.0,-0.383117,0.000000
fff49ad0-6015-448c-a7cc-f454c39ffdda,0.0,-0.383117,0.000000


Unnamed: 0_level_0,mean,diff,rate
lead_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
api,0.308414,-0.074702,0.805014
landing_page_submission,0.360677,-0.02244,0.941427
lead_add_form,0.919861,0.536744,2.400992
lead_import,0.225,-0.158117,0.587288
quick_add_form,1.0,0.616883,2.610169


Unnamed: 0_level_0,mean,diff,rate
lead_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bing,0.166667,-0.21645,0.435028
blog,0.0,-0.383117,0.0
click2call,0.75,0.366883,1.957627
direct_traffic,0.317624,-0.065493,0.829052
facebook,0.225,-0.158117,0.587288
google,0.4,0.016883,1.044068
live_chat,1.0,0.616883,2.610169
nc_edm,1.0,0.616883,2.610169
olark_chat,0.251768,-0.131349,0.657157
organic_search,0.37432,-0.008797,0.977038


Unnamed: 0_level_0,mean,diff,rate
do_not_email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.401265,0.018148,1.047368
yes,0.174281,-0.208836,0.454903


Unnamed: 0_level_0,mean,diff,rate
do_not_call,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383033,-8.3e-05,0.999782
yes,1.0,0.616883,2.610169


Unnamed: 0_level_0,mean,diff,rate
last_activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
approached_upfront,1.0,0.616883,2.610169
converted_to_lead,0.126437,-0.25668,0.330021
email_bounced,0.092308,-0.290809,0.240939
email_link_clicked,0.239437,-0.14368,0.62497
email_marked_spam,1.0,0.616883,2.610169
email_opened,0.363137,-0.01998,0.947848
email_received,1.0,0.616883,2.610169
form_submitted_on_website,0.255102,-0.128015,0.66586
had_a_phone_conversation,0.727273,0.344156,1.898305
olark_chat_conversation,0.086185,-0.296932,0.224958


Unnamed: 0_level_0,mean,diff,rate
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
asia/pacific_region,0.5,0.116883,1.305085
australia,0.222222,-0.160895,0.580038
bahrain,0.666667,0.28355,1.740113
bangladesh,0.5,0.116883,1.305085
belgium,0.0,-0.383117,0.0
canada,0.0,-0.383117,0.0
china,0.0,-0.383117,0.0
denmark,1.0,0.616883,2.610169
france,0.6,0.216883,1.566102
germany,0.25,-0.133117,0.652542


Unnamed: 0_level_0,mean,diff,rate
specialization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"banking,_investment_and_insurance",0.458333,0.075216,1.196328
business_administration,0.457478,0.074361,1.194095
e-business,0.408163,0.025046,1.065375
e-commerce,0.356322,-0.026795,0.93006
finance_management,0.453865,0.070748,1.184665
healthcare_management,0.504065,0.120948,1.315695
hospitality_management,0.363636,-0.019481,0.949153
human_resource_management,0.455474,0.072358,1.188866
international_business,0.361702,-0.021415,0.944104
it_projects_management,0.375427,-0.00769,0.979927


Unnamed: 0_level_0,mean,diff,rate
how_did_you_hear_about_x_education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
advertisements,0.45098,0.067864,1.177135
email,0.421053,0.037936,1.099019
multiple_sources,0.349593,-0.033523,0.912498
online_search,0.425868,0.042751,1.111586
other,0.412903,0.029786,1.077747
select,0.474021,0.090904,1.237276
sms,0.2,-0.183117,0.522034
social_media,0.428571,0.045455,1.118644
student_of_someschool,0.47541,0.092293,1.2409
word_of_mouth,0.441281,0.058164,1.151819


Unnamed: 0_level_0,mean,diff,rate
what_is_your_current_occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
businessman,0.5,0.116883,1.305085
housewife,1.0,0.616883,2.610169
other,0.615385,0.232268,1.606258
student,0.359281,-0.023835,0.937785
unemployed,0.429491,0.046375,1.121045
working_professional,0.916071,0.532955,2.391102


Unnamed: 0_level_0,mean,diff,rate
what_matters_most_to_you_in_choosing_a_course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
better_career_prospects,0.482146,0.099029,1.258483
flexibility_&_convenience,0.5,0.116883,1.305085


Unnamed: 0_level_0,mean,diff,rate
search,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383146,2.9e-05,1.000076
yes,0.363636,-0.019481,0.949153


Unnamed: 0_level_0,mean,diff,rate
magazine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383117,0.0,1.0


Unnamed: 0_level_0,mean,diff,rate
newspaper_article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383033,-8.3e-05,0.999782
yes,1.0,0.616883,2.610169


Unnamed: 0_level_0,mean,diff,rate
x_education_forums,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383117,0.0,1.0


Unnamed: 0_level_0,mean,diff,rate
newspaper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383169,5.2e-05,1.000135
yes,0.0,-0.383117,0.0


Unnamed: 0_level_0,mean,diff,rate
digital_advertisement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383221,0.000104,1.000271
yes,0.0,-0.383117,0.0


Unnamed: 0_level_0,mean,diff,rate
through_recommendations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.382918,-0.000199,0.999482
yes,0.75,0.366883,1.957627


Unnamed: 0_level_0,mean,diff,rate
receive_more_updates_about_our_courses,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383117,0.0,1.0


Unnamed: 0_level_0,mean,diff,rate
tags,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
already_a_student,0.007752,-0.375365,0.020234
busy,0.567742,0.184625,1.481903
closed_by_horizzon,0.996454,0.613337,2.600914
diploma_holder_(not_eligible),0.020833,-0.362284,0.054379
graduation_in_progress,0.06383,-0.319287,0.166607
in_confusion_whether_part_time_or_dlp,0.25,-0.133117,0.652542
in_touch_with_eins,0.272727,-0.11039,0.711864
interested__in_full_time_mba,0.010989,-0.372128,0.028683
interested_in_next_batch,1.0,0.616883,2.610169
interested_in_other_courses,0.02381,-0.359307,0.062147


Unnamed: 0_level_0,mean,diff,rate
lead_quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
high_in_relevance,0.948718,0.565601,2.476315
low_in_relevance,0.803063,0.419947,2.096132
might_be,0.753392,0.370275,1.96648
not_sure,0.242597,-0.14052,0.633219
worst,0.020408,-0.362709,0.053269


Unnamed: 0_level_0,mean,diff,rate
update_me_on_supply_chain_content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383117,0.0,1.0


Unnamed: 0_level_0,mean,diff,rate
get_updates_on_dm_content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383117,0.0,1.0


Unnamed: 0_level_0,mean,diff,rate
lead_profile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dual_specialization_student,1.0,0.616883,2.610169
lateral_student,0.952381,0.569264,2.485876
other_leads,0.351621,-0.031496,0.91779
potential_lead,0.781591,0.398474,2.040086
select,0.404403,0.021286,1.05556
student_of_someschool,0.024752,-0.358364,0.064608


Unnamed: 0_level_0,mean,diff,rate
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mumbai,0.401318,0.018201,1.047509
other_cities,0.408015,0.024898,1.064987
other_cities_of_maharashtra,0.437326,0.054209,1.141495
other_metro_cities,0.401961,0.018844,1.049186
select,0.485479,0.102363,1.267184
thane_&_outskirts,0.447412,0.064295,1.167822
tier_ii_cities,0.339286,-0.043831,0.885593


Unnamed: 0_level_0,mean,diff,rate
asymmetrique_activity_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01.high,0.313522,-0.069595,0.818344
02.medium,0.428008,0.044891,1.117173
03.low,0.092715,-0.290402,0.242002


Unnamed: 0_level_0,mean,diff,rate
asymmetrique_profile_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01.high,0.482525,0.099408,1.259473
02.medium,0.304191,-0.078926,0.79399
03.low,0.416667,0.03355,1.087571


Unnamed: 0_level_0,mean,diff,rate
i_agree_to_pay_the_amount_through_cheque,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383117,0.0,1.0


Unnamed: 0_level_0,mean,diff,rate
a_free_copy_of_mastering_the_interview,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.39578,0.012663,1.033053
yes,0.35545,-0.027667,0.927785


Unnamed: 0_level_0,mean,diff,rate
last_notable_activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
approached_upfront,1.0,0.616883,2.610169
email_bounced,0.16,-0.223117,0.417627
email_link_clicked,0.222222,-0.160895,0.580038
email_marked_spam,1.0,0.616883,2.610169
email_opened,0.364362,-0.018755,0.951046
email_received,1.0,0.616883,2.610169
form_submitted_on_website,0.0,-0.383117,0.0
had_a_phone_conversation,0.9,0.516883,2.349153
modified,0.23043,-0.152687,0.601461
olark_chat_conversation,0.156863,-0.226254,0.409438


In [81]:
from IPython.display import display 
 
for col in numerical:
    df_group = df_train_full.groupby(by=col).converted.agg(['mean'])
    df_group['diff'] = df_group['mean'] - global_mean
    df_group['rate'] = df_group['mean'] / global_mean
    display(df_group)

Unnamed: 0_level_0,mean,diff,rate
lead_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
579533,1.0,0.616883,2.610169
579538,1.0,0.616883,2.610169
579545,0.0,-0.383117,0.000000
579546,0.0,-0.383117,0.000000
579564,1.0,0.616883,2.610169
...,...,...,...
660681,1.0,0.616883,2.610169
660719,0.0,-0.383117,0.000000
660727,1.0,0.616883,2.610169
660728,0.0,-0.383117,0.000000


Unnamed: 0_level_0,mean,diff,rate
converted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.0,-0.383117,0.0
1,1.0,0.616883,2.610169


Unnamed: 0_level_0,mean,diff,rate
totalvisits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.411966,0.028849,1.075301
1.0,0.164596,-0.218521,0.429624
2.0,0.315946,-0.067171,0.824674
3.0,0.375361,-0.007756,0.979756
4.0,0.3918,0.008683,1.022663
5.0,0.402226,0.019109,1.049877
6.0,0.38992,0.006804,1.017758
7.0,0.388,0.004883,1.012746
8.0,0.47191,0.088793,1.231765
9.0,0.427536,0.044419,1.115942


Unnamed: 0_level_0,mean,diff,rate
total_time_spent_on_website,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.411497,0.028380,1.074077
1,0.200000,-0.183117,0.522034
2,0.090909,-0.292208,0.237288
3,0.222222,-0.160895,0.580038
4,0.000000,-0.383117,0.000000
...,...,...,...
2207,0.000000,-0.383117,0.000000
2217,1.000000,0.616883,2.610169
2226,0.000000,-0.383117,0.000000
2253,1.000000,0.616883,2.610169


Unnamed: 0_level_0,mean,diff,rate
page_views_per_visit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.00,0.411966,0.028849,1.075301
1.00,0.284351,-0.098766,0.742205
1.14,0.500000,0.116883,1.305085
1.20,0.250000,-0.133117,0.652542
1.22,0.500000,0.116883,1.305085
...,...,...,...
14.00,0.833333,0.450216,2.175141
15.00,0.333333,-0.049784,0.870056
16.00,0.000000,-0.383117,0.000000
24.00,1.000000,0.616883,2.610169


Unnamed: 0_level_0,mean,diff,rate
asymmetrique_activity_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7.0,0.0,-0.383117,0.0
8.0,0.0,-0.383117,0.0
9.0,0.0,-0.383117,0.0
10.0,0.069767,-0.313349,0.182105
11.0,0.073171,-0.309946,0.190988
12.0,0.115152,-0.267965,0.300565
13.0,0.139384,-0.243733,0.363816
14.0,0.406272,0.023155,1.060439
15.0,0.632094,0.248977,1.649872
16.0,0.518519,0.135402,1.353421


Unnamed: 0_level_0,mean,diff,rate
asymmetrique_profile_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11.0,0.5,0.116883,1.305085
12.0,0.388889,0.005772,1.015066
13.0,0.22973,-0.153387,0.599634
14.0,0.275862,-0.107255,0.720047
15.0,0.281382,-0.101735,0.734455
16.0,0.40501,0.021894,1.057146
17.0,0.467091,0.083974,1.219187
18.0,0.367442,-0.015675,0.959086
19.0,0.739583,0.356466,1.930438
20.0,0.709163,0.326046,1.851037


In [85]:
df_train_full[numerical].corrwith(df_train_full.converted)

lead_number                    0.021904
converted                      1.000000
totalvisits                    0.033651
total_time_spent_on_website    0.363470
page_views_per_visit           0.001593
asymmetrique_activity_score    0.165739
asymmetrique_profile_score     0.224824
dtype: float64

In [94]:
df_train_full[categorical].corrwith(df_train_full.converted)

Series([], dtype: float64)

In [99]:
train_dict = df_train_full[categorical + numerical].to_dict(orient='records')

In [100]:
from sklearn.feature_extraction import DictVectorizer
 
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

DictVectorizer(sparse=False)

In [101]:
X_train = dv.transform(train_dict)

In [102]:
X_train[0]

array([ 1.,  0., nan, ...,  1.,  0.,  1.])

In [103]:
dv.get_feature_names()



['a_free_copy_of_mastering_the_interview=no',
 'a_free_copy_of_mastering_the_interview=yes',
 'asymmetrique_activity_index',
 'asymmetrique_activity_index=01.high',
 'asymmetrique_activity_index=02.medium',
 'asymmetrique_activity_index=03.low',
 'asymmetrique_activity_score',
 'asymmetrique_profile_index',
 'asymmetrique_profile_index=01.high',
 'asymmetrique_profile_index=02.medium',
 'asymmetrique_profile_index=03.low',
 'asymmetrique_profile_score',
 'city',
 'city=mumbai',
 'city=other_cities',
 'city=other_cities_of_maharashtra',
 'city=other_metro_cities',
 'city=select',
 'city=thane_&_outskirts',
 'city=tier_ii_cities',
 'converted',
 'country',
 'country=asia/pacific_region',
 'country=australia',
 'country=bahrain',
 'country=bangladesh',
 'country=belgium',
 'country=canada',
 'country=china',
 'country=denmark',
 'country=france',
 'country=germany',
 'country=ghana',
 'country=hong_kong',
 'country=india',
 'country=indonesia',
 'country=italy',
 'country=kenya',
 'countr

In [104]:
import math
 
def sigmoid(score):
    return 1 / (1 + math.exp(-score))

In [105]:
def logistic_regression(xi):
    score = bias
    for j in range(n):
        score = score + xi[j] * w[j]
    prob = sigmoid(score)
    return prob

In [109]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [107]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

KeyError: "['converted'] not in index"