### 1. Import Libraries

In [None]:
import sys
# Keep a handle on the current stdout; it is assigned back to sys.stdout
# later in the notebook, once the data files have been read.
stdout = sys.stdout
# Python 2-only idiom: reload(sys) followed by sys.setdefaultencoding switches
# the interpreter's default string encoding to UTF-8.
# NOTE(review): presumably reload(sys) also replaces sys.stdout inside the
# notebook kernel, which is why the handle is saved above -- confirm.
reload(sys)
sys.setdefaultencoding('utf-8')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Star import: supplies ttest_ind and chi2_contingency used further down.
from scipy.stats import *
import seaborn as sns

### 2. Reading Data

In [None]:
# Transaction table: gzip-compressed, tab-separated file.
df_tran = pd.read_table(
    '../data/000_sample1.gz',
    sep='\t',
    quotechar='"',
    compression='gzip',
    error_bad_lines=False,  # skip malformed lines instead of raising
)

In [None]:
# Creative table: pipe-delimited file without a header row, read as plain
# text (the gzip option was left disabled for this file).
df_creative = pd.read_table(
    '../data/creatives000',
    sep='|',
    quotechar='"',
    header=None,
    error_bad_lines=False,  # skip malformed lines instead of raising
)
# The file carries no header, so the column names are assigned by position.
df_creative.columns = [
    'creative_id',
    'created_at',
    'application_id',
    'name',
    'video_id',
    'is_streaming',
    'language',
    'enforce_language_match',
    'device_type_whitelist',
    'delivery_frequency_cap',
    'time_to_show_countdown',
    'time_to_show_close_button',
    'cloudux_template_id',
]
# Put back the stdout handle that was saved in the import cell.
sys.stdout = stdout

### 3. Join Tables

In [None]:
# Target flag: 1 when an install timestamp is present, 0 when it is missing.
df_tran['is_install'] = df_tran['timestamp_at_install'].notnull().astype(int)
# Keep only the columns needed for the join with the creative table.
df_tran_sub = df_tran.loc[
    :,
    ['is_install', 'creative_id', 'time_of_this_impression'],
]

In [None]:
# (rows, columns) of both inputs before they are joined.
print(df_tran_sub.shape)
print(df_creative.shape)

In [None]:
# Number of distinct creative ids; compare with the row count printed above
# to see whether creative_id is unique in the creative table.
len(df_creative['creative_id'].unique())

In [None]:
# df_creative_tag = df_creative_tag.groupby('creative_id').creative_tag.apply(list).reset_index()

In [None]:
# Left join: every transaction row is kept and creative attributes are
# attached where the creative_id matches; the result is sorted by the key.
# All other merge options are left at their defaults.
df_tran_ad = df_tran_sub.merge(
    df_creative,
    how='left',
    on='creative_id',
    sort=True,
)

In [None]:
# Missing-value count per column of the joined table.
pd.isnull(df_tran_ad).sum()

### 4. Add Y & Clean Data

In [None]:
# Lower-case the language code and keep only the part before the first '-'.
# NOTE: values are passed through str() first, so a missing language ends up
# as the literal string 'nan'.
df_tran_ad['language'] = df_tran_ad['language'].map(
    lambda lang: str(lang).lower().partition('-')[0]
)
# Store the target as a categorical column.
df_tran_ad['is_install'] = df_tran_ad['is_install'].astype('category')

### 5. Create new features

#### Ad Age

In [None]:
# Ad age: impression time minus creative creation time, as a timedelta.
df_tran_ad['time_delta'] = (
    pd.to_datetime(df_tran_ad['time_of_this_impression'])
    - pd.to_datetime(df_tran_ad['created_at'])
)

In [None]:
def to_day(x):
    """Return the ``days`` component of a pandas Timedelta, or NaN otherwise.

    Parameters
    ----------
    x : object
        Value to convert. Only ``pd.Timedelta`` instances are converted.

    Returns
    -------
    int or float
        ``x.days`` when ``x`` is a ``pd.Timedelta``; ``np.nan`` for any other
        value (``None``, ``pd.NaT``, strings, ...).
    """
    # pd.Timedelta is the public name of the class; the former pd.tslib path
    # was a private module that was deprecated and later removed from pandas.
    if isinstance(x, pd.Timedelta):
        return x.days
    return np.nan

In [None]:
# Element-wise conversion of the ad age to days via to_day.
df_tran_ad['time_delta_days'] = df_tran_ad['time_delta'].map(to_day)

In [None]:
# Day component taken through the .dt accessor. This assigns the same
# 'time_delta_days' column as the to_day-based cell above and overwrites it.
df_tran_ad['time_delta_days'] = df_tran_ad['time_delta'].dt.days

### 5. Continuous Variables

In [None]:
# continous_EDA function
def continous_EDA(col,df):
    """Compare a continuous feature between the two ``is_install`` groups.

    Splits ``df[col]`` into the rows where ``is_install`` is 0 and where it
    is 1 (missing values dropped), runs a two-sample t-test without assuming
    equal variances, then shows an overlaid histogram of both groups and a
    box-plot of ``col`` per ``is_install`` value.

    Parameters
    ----------
    col : str
        Name of the column in ``df`` to analyse.
    df : pandas.DataFrame
        Frame containing ``col`` and an ``is_install`` column.

    Returns
    -------
    float
        The p-value of the t-test. Callers compare it directly against a
        significance threshold, so the scalar is returned rather than the
        whole (statistic, p-value) result.
    """
    group0 = df[df['is_install']==0][col].dropna()
    group1 = df[df['is_install']==1][col].dropna()

    # T-test
    print('t-test')
    # ttest_ind returns (statistic, p-value); unpack it so only the p-value
    # is handed back to the caller.
    t_stat, p_value = ttest_ind(group0, group1, axis=0, equal_var=False)
    print('statistic = {}, p-value = {}'.format(t_stat, p_value))

    # histogram of two groups
    print('Distribution of two groups')
    fig, ax = plt.subplots()
    # The legend label is the group's is_install value (0 or 1).
    for label, values in enumerate([group0, group1]):
        sns.distplot(values, ax=ax, kde=False, label=str(label))
    ax.legend(loc=2,prop={'size':7})
    plt.show()

    # Box-plot
    print('Box-plot')
    ax = sns.boxplot(x="is_install", y=col, data=df)
    plt.show()
    return p_value

In [None]:
# Run the continuous EDA for each numeric feature and collect what it returns.
p_lst = []
cont_cols = ['time_delta_days']
for col in cont_cols:
    print('{} {} {}'.format('-----------Feature:', col, '----------------'))
    p_lst.append(continous_EDA(col, df_tran_ad))
    print('---------------------------------\n')

In [None]:
# Keep the continuous features whose collected result is below 0.05.
significant_cols = [
    name
    for name, p_value in zip(cont_cols, p_lst)
    if p_value < 0.05
]
'significant columns:', significant_cols

### 6. Categorical Variables

In [None]:
# Creative-table columns analysed as categorical features.
cat_cols = [
    'is_streaming',
    'language',
    'enforce_language_match',
    'device_type_whitelist',
    'delivery_frequency_cap',
    'time_to_show_countdown',
    'time_to_show_close_button',
    'cloudux_template_id',
]

In [None]:
def categotical_EDA(col, df):
    """Test and plot the association between a categorical feature and installs.

    Cross-tabulates ``df['is_install']`` against ``df[col]``, runs a
    chi-square test of independence on the observed counts, and draws a
    stacked horizontal bar chart of the per-``is_install`` proportions of
    each category.

    Parameters
    ----------
    col : str
        Name of the categorical column in ``df``.
    df : pandas.DataFrame
        Frame containing ``col`` and an ``is_install`` column.

    Returns
    -------
    float
        The p-value of the chi-square test.
    """
    counts = pd.crosstab(df['is_install'], df[col])

    # The chi-square test is defined on observed frequencies. Feeding it
    # row-normalised proportions discards the sample size and makes the
    # p-value meaningless, so the raw counts are passed here.
    _, p, _, _ = chi2_contingency(counts.values)

    # Proportions are used for the plot only: each is_install row sums to 1.
    pcts = counts.div(counts.sum(axis=1).astype(float), axis=0)
    ax = pcts.plot(kind='barh',stacked=True, alpha=0.5, figsize=(8,6))
    ax.set_xlabel("Percentage", fontsize=12)
    ax.set_ylabel("Is Install", fontsize=12)
    ax.legend(loc=2,prop={'size':7})

    plt.show()
    return p

In [None]:
# Run the categorical EDA for each feature and collect the returned p-values.
p_lst = []
for col in cat_cols:
    print('{} {} {}'.format('-----------Feature:', col, '----------------'))
    p_lst.append(categotical_EDA(col, df_tran_ad))
    print('---------------------------------\n')

In [None]:
# Keep the categorical features whose p-value is below 0.05.
significant_cols = [
    name
    for name, p_value in zip(cat_cols, p_lst)
    if p_value < 0.05
]
'significant columns:', significant_cols

### 7. Conclusion
Useful Features:
* Continuous Variable: ['time_delta_days']
* Categorical Variable: ['is_streaming']
