# 04 Classification Homework

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

## Data preperation

In [2]:
!curl -O "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80876  100 80876    0     0   318k      0 --:--:-- --:--:-- --:--:--  318k


In [3]:
df = pd.read_csv('course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
categorical = [column for column, ctype in df.dtypes.items() if ctype == "object"]
categorical

['lead_source', 'industry', 'employment_status', 'location']

In [5]:
numerical = [column for column, ctype in df.dtypes.items() if ctype != "object" and column != "converted"]
numerical

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score']

In [6]:
df[categorical] = df[categorical].fillna("NA")

In [7]:
df[numerical] = df[numerical].fillna(0.0)

---

## Question 1
What is the most frequent observation (mode) for the column industry?

Answer - retail

In [8]:
df.industry.mode()

0    retail
Name: industry, dtype: object

---

## Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

Answer - annual_income and interaction_count

In [9]:
df[["interaction_count"]].corrwith(df.lead_score)

interaction_count    0.009888
dtype: float64

In [10]:
df[["number_of_courses_viewed"]].corrwith(df.lead_score)

number_of_courses_viewed   -0.004879
dtype: float64

In [11]:
df[["number_of_courses_viewed"]].corrwith(df.interaction_count)

number_of_courses_viewed   -0.023565
dtype: float64

In [12]:
df[["annual_income"]].corrwith(df.interaction_count)

annual_income    0.027036
dtype: float64

---

## Split the data

    Split your data in train/val/test sets with 60%/20%/20% distribution.
    Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
    Make sure that the target value y is not in your dataframe.


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [15]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [16]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [17]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [18]:
df_full_train.converted.value_counts(normalize=True)

converted
1    0.607357
0    0.392643
Name: proportion, dtype: float64

---

## Question 3

    Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
    Round the scores to 2 decimals using round(score, 2).

Which of these variables has the biggest mutual information score?

Answer - lead_source

In [19]:
from sklearn.metrics import mutual_info_score


def mutual_info_conversion_score(series):
    return mutual_info_score(series, df_full_train.converted)

In [20]:
mi = df_full_train[categorical].apply(mutual_info_conversion_score)
mi.sort_values(ascending=False)

lead_source          0.025665
employment_status    0.013258
industry             0.011685
location             0.002253
dtype: float64

---

## Question 4

    Now let's train a logistic regression.
    Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
    Fit the model on the training dataset.
        To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
        model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

Asnwer - 0.64 (closest to 0.61)

In [21]:
from sklearn.feature_extraction import DictVectorizer

In [22]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical].to_dict(orient='records')
x_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical].to_dict(orient='records')
x_val = dv.transform(val_dict)

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(x_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [25]:
y_pred = model.predict_proba(x_val)[:, 1]
conversion_decision = (y_pred >= 0.5)

In [26]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = conversion_decision.astype(int)
df_pred['actual'] = y_val
df_pred['correct'] = df_pred.prediction == df_pred.actual
round(df_pred.correct.mean(), 2)

np.float64(0.61)

---

## Question 5

    Let's find the least useful feature using the feature elimination technique.
    Train a model using the same features and parameters as in Q4 (without rounding).
    Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
    For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?

Answer - lead_score

In [27]:
def get_score_without_categorical_column(column=None):
    dv = DictVectorizer(sparse=False)

    if column:
        filtered_categorical = list(set(categorical) - {column})
    else:
        filtered_categorical = categorical
    train_dict = df_train[filtered_categorical].to_dict(orient='records')
    x_train = dv.fit_transform(train_dict)
    
    val_dict = df_val[filtered_categorical].to_dict(orient='records')
    x_val = dv.transform(val_dict)
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(x_train, y_train)
    
    y_pred = model.predict_proba(x_val)[:, 1]
    conversion_decision = (y_pred >= 0.5)
    
    df_pred = pd.DataFrame()
    df_pred['probability'] = y_pred
    df_pred['prediction'] = conversion_decision.astype(int)
    df_pred['actual'] = y_val
    df_pred['correct'] = df_pred.prediction == df_pred.actual
    return round(df_pred.correct.mean(), 2)

In [28]:
get_score_without_categorical_column() - get_score_without_categorical_column("industry")

np.float64(0.010000000000000009)

In [29]:
get_score_without_categorical_column() - get_score_without_categorical_column("employment_status")

np.float64(0.05999999999999994)

In [30]:
get_score_without_categorical_column() - get_score_without_categorical_column("lead_score")

np.float64(0.0)

---

## Question 6

    Now let's train a regularized logistic regression.
    Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
    Train models using all the features as in Q4.
    Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these C leads to the best accuracy on the validation set?

Answer - 1

In [31]:
def get_score_with_regularized_parameter(regularized_parameter):
    dv = DictVectorizer(sparse=False)

    train_dict = df_train[categorical].to_dict(orient='records')
    x_train = dv.fit_transform(train_dict)
    
    val_dict = df_val[categorical].to_dict(orient='records')
    x_val = dv.transform(val_dict)
    
    model = LogisticRegression(solver='liblinear', C=regularized_parameter, max_iter=1000, random_state=42)
    model.fit(x_train, y_train)
    
    y_pred = model.predict_proba(x_val)[:, 1]
    conversion_decision = (y_pred >= 0.5)
    
    df_pred = pd.DataFrame()
    df_pred['probability'] = y_pred
    df_pred['prediction'] = conversion_decision.astype(int)
    df_pred['actual'] = y_val
    df_pred['correct'] = df_pred.prediction == df_pred.actual
    return round(df_pred.correct.mean(), 3)

In [32]:
for regularzied_parameter in [0.01, 0.1, 1, 10, 100]:
    print(f"Parameter:{regularzied_parameter} - Score:{get_score_with_regularized_parameter(regularzied_parameter)}")

Parameter:0.01 - Score:0.56
Parameter:0.1 - Score:0.601
Parameter:1 - Score:0.608
Parameter:10 - Score:0.604
Parameter:100 - Score:0.604
