In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the training CSV, using its `row_id` column as the DataFrame index.
train_csv_path = "../input/shopee-code-league-20/_DA_Marketing_Analytics/train.csv"
df = pd.read_csv(train_csv_path, index_col='row_id')

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

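According to `df.info()`, the columns are non-null, so there are no missing values to impute. Note, however, that the `last_*_day` columns contain sentinel strings such as 'Never open', which are handled below.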

# Feature Engineering

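Let's convert the sentinel strings 'Never open', 'Never login' and 'Never checkout' into 0 for ease of analysis, so that the `last_*_day` columns become integers. Note that this makes "never" indistinguishable from "0 days ago".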

In [None]:
# Replace each "never happened" sentinel string with 0 and cast the column to int.
# NOTE(review): 0 then also stands for "0 days ago" — confirm this conflation is acceptable.
never_labels = {
    'last_open_day': 'Never open',
    'last_login_day': 'Never login',
    'last_checkout_day': 'Never checkout',
}
for day_col, never_label in never_labels.items():
    df[day_col] = df[day_col].replace(never_label, 0).astype(int)

Dropping the features 'grass_date', 'subject_line_length' and 'user_id' for convenience.

In [None]:
# Remove the columns that are not used in the rest of the analysis.
unused_columns = ['grass_date', 'subject_line_length', 'user_id']
df = df.drop(columns=unused_columns)

In [None]:
# Summary statistics of the remaining columns after the conversions above.
df.describe()

# Data Visualisation

## Open_Flag distribution

In [None]:
# Bar chart of how many rows fall into each open_flag class.
# Pass the column by keyword: recent seaborn releases accept only `data` positionally,
# so a positional column name collides with the `data=` keyword and raises a TypeError.
sns.countplot(x='open_flag', data=df)

The target variable is highly imbalanced: the large majority of rows have `open_flag` = 0.

## Country

In [None]:
# Bar chart of the number of rows per country code.
# Pass the column by keyword: recent seaborn releases accept only `data` positionally,
# so a positional column name collides with the `data=` keyword and raises a TypeError.
sns.countplot(x='country_code', data=df)

## Last Open day

In [None]:
# Left panel: KDE of last_open_day over its full observed range.
# Right panel: the same KDE zoomed to the first 100 days.
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(10,5))

xmax = df['last_open_day'].max()
xmin = df['last_open_day'].min()

sns.kdeplot(df['last_open_day'],ax=ax1)
# Set the limits on ax1 explicitly. plt.xlim() targets the *current* axes, which after
# plt.subplots(1,2) is ax2, and ax2's limits are replaced below anyway — so the
# full-range limits previously never reached the left panel.
ax1.set_xlim(xmin,xmax)

sns.kdeplot(df['last_open_day'],ax=ax2)
ax2.set_xlim(0,100)

ax1.set_title('Last open day')
ax2.set_title('Last open within 100days')
# Run the layout pass last, once the figure already has its final size.
fig.tight_layout()

Most people open within 100 days, and within those 100 days most people open as recently as within 20 days.

In [None]:
# One KDE panel per look-back window; each x-axis is limited to [0, window].
fig, axes = plt.subplots(1, 3)

for ax, window in zip(axes, (10, 30, 60)):
    count_col = f'open_count_last_{window}_days'
    sns.kdeplot(df[count_col], ax=ax)
    ax.set_xlim(0, window)
    ax.set_title(count_col)

fig.set_size_inches(20, 5)

From the graphs one could conclude that most people open roughly every other day. The density peaks most sharply at a value of 1.

## Last Login Day

In [None]:
# Left panel: KDE of last_login_day over its full observed range.
# Right panel: the same KDE zoomed to the first 1500 days.
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(10,5))

xmax = df['last_login_day'].max()
xmin = df['last_login_day'].min()

sns.kdeplot(df['last_login_day'],ax=ax1)
# Set the limits on ax1 explicitly. plt.xlim() targets the *current* axes, which after
# plt.subplots(1,2) is ax2, and ax2's limits are replaced below anyway — so the
# full-range limits previously never reached the left panel.
ax1.set_xlim(xmin,xmax)

sns.kdeplot(df['last_login_day'],ax=ax2)
ax2.set_xlim(0,1500)

ax1.set_title('Last login day')
ax2.set_title('Last login within 1500days')
# Run the layout pass last, once the figure already has its final size.
fig.tight_layout()

Most people log in within 2000 days. And within the 1500-day window, most people log in within 400-600 days.

In [None]:
# One KDE panel per look-back window; each x-axis is limited to [0, window].
fig, axes = plt.subplots(1, 3)

for ax, window in zip(axes, (10, 30, 60)):
    count_col = f'login_count_last_{window}_days'
    sns.kdeplot(df[count_col], ax=ax)
    ax.set_xlim(0, window)
    ax.set_title(count_col)

fig.set_size_inches(20, 5)

The login density is pretty interesting. Within the 10-day window, most people log in about every 3 days. But over a longer range of days, most logins happen every 15 to 30 days.

## Last Checkout Day

In [None]:
# Left panel: KDE of last_checkout_day over its full observed range.
# Right panel: the same KDE zoomed to the first 500 days.
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(10,5))

xmax = df['last_checkout_day'].max()
xmin = df['last_checkout_day'].min()

sns.kdeplot(df['last_checkout_day'],ax=ax1)
# Set the limits on ax1 explicitly. plt.xlim() targets the *current* axes, which after
# plt.subplots(1,2) is ax2, and ax2's limits are replaced below anyway — so the
# full-range limits previously never reached the left panel.
ax1.set_xlim(xmin,xmax)

sns.kdeplot(df['last_checkout_day'],ax=ax2)
ax2.set_xlim(0,500)

ax1.set_title('Last checkout day')
ax2.set_title('Last checkout within 500days')
# Run the layout pass last, once the figure already has its final size.
fig.tight_layout()

The last checkout mostly happened less than 100 days ago, and only rarely 300 to 500 days ago.

In [None]:
# One KDE panel per look-back window; each x-axis is limited to [0, window].
fig, axes = plt.subplots(1, 3)

for ax, window in zip(axes, (10, 30, 60)):
    count_col = f'checkout_count_last_{window}_days'
    sns.kdeplot(df[count_col], ax=ax)
    ax.set_xlim(0, window)
    ax.set_title(count_col)

fig.set_size_inches(20, 5)

The graphs suggest that, within the 10-day window, most checkouts happen every 3-4 days, and within the 60-day window every 10-30 days.

In [None]:
# Annotated heatmap of the pairwise column correlations, widened so the labels fit.
corr_matrix = df.corr()
heatmap_ax = sns.heatmap(corr_matrix, annot=True)
fig = heatmap_ax.figure
fig.set_size_inches(15, 5)

It seems that the open_count features are more strongly correlated with open_flag than any other feature.

# Model Building

Since the count features overlap heavily with one another, they are dropped; we only keep last_open_day, last_login_day and last_checkout_day, together with whatever other columns remain (such as country_code).

Extracting X and Y from the df.

In [None]:
# Target column.
Y = df['open_flag']

# Feature matrix: everything except the target and the nine windowed count columns
# (open/login/checkout counts over the last 10, 30 and 60 days).
count_columns = [
    f'{event}_count_last_{window}_days'
    for event in ('open', 'login', 'checkout')
    for window in (10, 30, 60)
]
X = df.drop(columns=count_columns + ['open_flag'])

## Classification Model 

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with scikit-learn's default hyperparameters.
log_model = LogisticRegression()

In [None]:
# Fit on the complete feature matrix and target; no hold-out split is made at this point.
log_model.fit(X,Y)

In [None]:
# Mean accuracy on the same rows the model was fitted on, i.e. training accuracy.
log_model.score(X,Y)

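The model reaches an accuracy of 84%. Note, however, that this is measured on the same data the model was trained on, and that the target is heavily imbalanced towards 0, so accuracy alone paints an optimistic picture.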

In [None]:
# Table of fitted coefficients, one row per feature.
# coef_ has shape (n_rows, n_features); transposing puts features on the rows. Building the
# frame from the array (rather than zipping names with row-slices) yields plain numeric
# cells instead of 1-element arrays, and labels every row with its feature name.
coeff_df = (
    DataFrame(np.transpose(log_model.coef_), index=X.columns)
    .rename_axis('feature')
    .add_prefix('coef_')
)
coeff_df

It seems that the higher the value of last_open_day, the lower the chance of open_flag being 1.
All other features are linearly related to open_flag.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation.
# - random_state pins the shuffle so the split, and every metric computed from it, is
#   reproducible across kernel restarts.
# - stratify=Y keeps the class ratio of the imbalanced target the same in both partitions.
x_train,x_test,y_train,y_test = train_test_split(X,Y,random_state=42,stratify=Y)

print(f'Splitting happens as {x_train.shape},{x_test.shape},{y_train.shape},{y_test.shape}')

In [None]:
# Logistic regression that re-weights classes inversely to their frequency, to counter the
# imbalanced target. The accepted keyword is 'balanced'; the previous value 'balance' is not
# a valid option and, depending on the scikit-learn version, is either rejected or ignored.
log_model2 = LogisticRegression(class_weight='balanced')

# Train on the training partition only.
log_model2.fit(x_train,y_train)

# Predicted labels for the held-out partition.
y_pred = log_model2.predict(x_test)
y_pred

## Model Analysis

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

So the model predicts the held-out test set with an accuracy score of 84%.

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

# SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Support-vector classifier with scikit-learn's default settings.
model = SVC()

In [None]:
# Train on the training partition. fit() returns the estimator itself, so `clf` and
# `model` refer to the same fitted object.
clf = model.fit(x_train,y_train)

In [None]:
# Score on the held-out partition only. Scoring on the full X, Y would include the rows
# the model was trained on and therefore overstate its accuracy.
model.score(x_test,y_test)

In [None]:
# SVM predictions for the held-out partition; this rebinds the name y_pred.
y_pred = clf.predict(x_test)

## Model Analysis

In [None]:
# Fraction of held-out rows the SVM labels correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Rows are the true classes, columns the SVM's predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

Note that both Logistic Regression and SVM predict with a similar accuracy score. However, the imbalanced data strongly influences the models, and unless more balanced data is acquired, the model predictions are highly unreliable.