## Homework 3: Machine Learning for Classification

In [138]:
import pandas as pd
import numpy as np

# Load the dataset
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

# Make everything lower case and change spaces to underscores (in both headings and data)
df.columns = df.columns.str.lower().str.replace(' ', '_')
strings = list(df.dtypes[df.dtypes == 'object'].index)
for col in strings:
    df[col] = df[col].str.lower().str.replace(' ', '_')

### Data preparation

In [139]:
# Check if the missing values are presented in the features.
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# If there are missing values:
# For categorical features, replace them with 'NA'
df.fillna({'lead_source': 'NA', 'industry': 'NA', 'employment_status': 'NA', 'location': 'NA'}, inplace=True)
# For numerical features, replace with with 0.0
df.fillna({'annual_income': 0.0}, inplace=True)

Missing values in each column:
 lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


### Question 1. What is the most frequent observation (mode) for the column industry?

In [140]:
industry_mode = df['industry'].mode()[0]
print("The most frequent observation (mode) for the column industry is:", industry_mode)

The most frequent observation (mode) for the column industry is: retail


### Question 2.

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.
What are the two features that have the biggest correlation?

In [141]:
numerical_features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
df[numerical_features].corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


What are the two features that have the biggest correlation?

interaction_count and lead_score 0.009888\
number_of_courses_viewed and lead_score -0.004879\
number_of_courses_viewed and interaction_count -0.023565\
annual_income and interaction_count 0.027036

### Split the data.

Split your data in train/val/test sets with 60%/20%/20% distribution.
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
Make sure that the target value y is not in your dataframe.

In [142]:
from sklearn.model_selection import train_test_split

np.random.seed(42)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

### Question 3.

Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only. Round the scores to 2 decimals using round(score, 2).

In [143]:
from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    return mutual_info_score(series, y_train)

categorical_features = ['lead_source', 'industry', 'employment_status', 'location']
df_mi = df_train[categorical_features].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='mutual information')
display(df_mi)

Unnamed: 0,mutual information
lead_source,0.024803
employment_status,0.016345
industry,0.006161
location,0.001453
