In [22]:
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression

# import other functions
from imputer import *
from feature_eng import *
from drop import *

In [2]:
#Download the data
df = pd.read_csv('../data/train_data.zip')
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [3]:
# drop rows missing target variable
df = drop_missing_unacast(df)
# create X and y
X = df.drop('unacast_session_count', axis=1)
y = df.loc[:, 'unacast_session_count']
# split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                    test_size=0.2,
                                                      random_state=2020)
# impute NaN values
result = impute_data(X_train, X_valid)
X_train = result[0]
X_valid = result[1] 
# perform feature eng
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)
# perform dropping
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

# perform OHE (climate, density_class, income_class)
X_train_valid = clean_categorical(X_train, X_valid, ['income_class', 'density_class'])
X_train = X_train_valid[0]
X_valid = X_train_valid[1]

print(X_train.shape)
print(X_valid.shape)

(39592, 630)
(9898, 630)


In [4]:
valid = pd.concat([X_valid, y_valid.reset_index()], axis=1)
train = pd.concat([X_train, y_train.reset_index()], axis=1)

valid.shape

(9898, 632)

In [7]:
def variance_threshold_selector(data, threshold=0.5):
    # https://stackoverflow.com/a/39813304/1956309
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

# min_variance = .9 * (1 - .9)  # You can play here with different values.
min_variance = 0.001
low_variance = variance_threshold_selector(train.drop(columns=['climate']), min_variance) 

In [14]:
train_var = pd.concat([low_variance, train[['climate']]], axis=1)

In [15]:
train_var_col = train_var[['monthly_repeated_sessions', 
                                         'monthly_avg_length_of_session', 
                                         'B12001e17', 
                                         'B23008e21', 'B20004e16', 'B20004e13', 
                                        'B08301e10', 'B11005e18', 'B20004e14',
 'B19055e3']]

In [17]:
train_var_col.describe()

Unnamed: 0,monthly_repeated_sessions,monthly_avg_length_of_session,B12001e17,B23008e21,B20004e16,B20004e13,B08301e10,B11005e18
count,39592.0,39592.0,39592.0,39592.0,39592.0,39592.0,39592.0,39592.0
mean,2.973884,82834.3,55.175035,266.765003,29105.40493,33081.305643,83.112169,276.282734
std,14.500668,294821.5,57.771582,213.519319,10919.688482,10957.658927,198.503534,186.488228
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,18.0,119.0,23654.0,25530.5,0.0,150.0
50%,0.0,0.0,38.0,216.0,29464.0,31436.0,23.0,232.0
75%,0.0,0.0,72.0,357.0,35172.0,39830.0,87.0,358.0
max,737.0,5390389.0,503.0,2158.0,120417.0,80507.0,4206.0,1909.0


In [19]:
train_var.corr()['monthly_avg_length_of_session'].sort_values(ascending=False)[1:]

monthly_avg_light_activity          0.859978
monthly_avg_clear_length            0.844768
monthly_avg_moderate_activity       0.827760
monthly_avg_length_temp_70_to_80    0.662581
monthly_number_of_sessions          0.587751
                                      ...   
temp_max_35_below                  -0.071781
temp_avg_35_45                     -0.075626
weather_snow                       -0.075768
temp_max_35_45                     -0.088080
temp_avg_35_below                  -0.088732
Name: monthly_avg_length_of_session, Length: 621, dtype: float64

#### RFE

In [24]:
lr = LogisticRegression(solver='liblinear')
rfe=RFE(estimator=lr, n_features_to_select=600)

In [None]:
rfe.fit(train_var.drop(columns=['climate']), y_train)

In [None]:
print(rfe.support)

In [None]:
print(rfe.ranking)

#### RFECV

In [None]:
lr = LogisticRegression(solver='liblinear')
rfe_cv=RFECV(estimator=lr, cv=5)

In [None]:
rfe_cv.fit(train_var.drop(columns=['climate']), y_train)

In [None]:
print(rfe_cv.n_features_)

In [None]:
print(rfe_cv.support)