In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# 1. Data Visualization

In [None]:
"""
1. Data Import
2. Basic Statistic Summary
3. Numerical features exploration
4. Categorical features exploration
5. Missing Data Visualization
"""

In [None]:
# Load the dataset from a CSV file; the path is relative, so it is resolved
# against the kernel's current working directory.
data = pd.read_csv('./online_shoppers_intention.csv')
# Show the (number of rows, number of columns) tuple as the cell output.
data.shape

In [None]:
# Summarize numerical features: descriptive statistics of the first nine
# columns (assumes those are the numerical ones -- verify against the CSV header).
data.iloc[:, :9].describe()

In [None]:
# Summarize categorical features: one grouped bar chart per feature, showing
# the number of rows in each category split by the value of 'Revenue'.
cat_features = [
    'SpecialDay', 'Month', 'OperatingSystems', 'Browser',
    'Region', 'TrafficType', 'VisitorType', 'Weekend',
]
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 8))
# axes.flat walks the 2x4 grid row by row, which matches the order of cat_features.
for panel, feature in zip(axes.flat, cat_features):
    grouped = data.groupby(by=[feature, 'Revenue'])
    counts = grouped.agg(count=('Revenue', 'count')).reset_index()
    sns.barplot(x=feature, y='count', hue='Revenue', data=counts, ax=panel)

In [None]:
# Quantitative measures VS target
#
# One strip plot per metric: the target ('Revenue') is on the x-axis and the
# metric is on the y-axis, so the axis labels below follow that orientation.

# (title, column plotted on y, y-axis label, seaborn palette) for each panel
strip_panels = [
    ('Bounce Rates vs Revenue', 'BounceRates', 'Bounce Rates', 'magma'),
    ('ExitRates vs Revenue', 'ExitRates', 'ExitRates', 'autumn'),
    ('PageValues vs Revenue', 'PageValues', 'PageValues', 'rainbow'),
]

# Explicit figure/axes instead of the pyplot state machine, wide enough for
# three panels with large titles.
strip_fig, strip_axes = plt.subplots(1, 3, figsize=(18, 6))
for strip_ax, (title, column, ylabel, palette) in zip(strip_axes, strip_panels):
    # x and y must be passed by keyword: seaborn >= 0.12 rejects them as
    # positional arguments. hue mirrors x so that `palette` is applied per
    # Revenue class without relying on the deprecated palette-without-hue form.
    sns.stripplot(x='Revenue', y=column, hue='Revenue', data=data,
                  palette=palette, ax=strip_ax)
    # The hue legend would only repeat the x-axis categories, so drop it if
    # seaborn created one.
    legend = strip_ax.get_legend()
    if legend is not None:
        legend.remove()
    strip_ax.set_title(title, fontsize=20)
    strip_ax.set_xlabel('Revenue', fontsize=15)
    strip_ax.set_ylabel(ylabel, fontsize=15)
strip_fig.tight_layout()

In [None]:
# Plot the share of missing values per column (a fraction of all rows, not a
# value scaled to 100).
data_missing_perc = data.isna().sum() / len(data)
fit = plt.figure(figsize=(20, 6))
ax = fit.add_subplot(1, 1, 1)
sns.barplot(x=data_missing_perc.index, y=data_missing_perc.values, ax=ax)
# Rotate the column names on the x-axis by 60 degrees.
ax.tick_params(axis='x', rotation=60)

# 2. Data Cleaning

In [1]:
"""
1. Missing Data (Impute / delete)
2. Standardize?
3. Outliers?
4. Data Split
"""

'\n1. Missing Data (Impute / delete)\n2. Standardize?\n3. Outliers?\n4. Data Split\n'

# 3. Model Training

In [None]:
"""
1. Hyperparameter Tuning
2. Model Selection
3. etc.,
"""