In [1]:
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression

# import other functions
from imputer import *
from feature_eng import *
from drop import *

In [2]:
#Download the data
df = pd.read_csv('../data/train_data.zip')
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [3]:
# drop rows missing target variable
df = drop_missing_unacast(df)
# create X and y
X = df.drop('unacast_session_count', axis=1)
y = df.loc[:, 'unacast_session_count']
# split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                    test_size=0.2,
                                                      random_state=2020)
# impute NaN values
result = impute_data(X_train, X_valid)
X_train = result[0]
X_valid = result[1] 
# perform feature eng
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)
# perform dropping
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

# perform OHE (climate, density_class, income_class)
X_train_valid = clean_categorical(X_train, X_valid, ['income_class', 'density_class'])
X_train = X_train_valid[0]
X_valid = X_train_valid[1]

print(X_train.shape)
print(X_valid.shape)

(39592, 630)
(9898, 630)


In [4]:
valid = pd.concat([X_valid, y_valid.reset_index()], axis=1)
train = pd.concat([X_train, y_train.reset_index()], axis=1)

In [5]:
def variance_threshold_selector(data, threshold=0.5):
    # https://stackoverflow.com/a/39813304/1956309
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

# min_variance = .9 * (1 - .9)  # You can play here with different values.
min_variance = 0.001
low_variance = variance_threshold_selector(train.drop(columns=['climate']), min_variance) 

In [6]:
train_var = pd.concat([low_variance, train[['climate']]], axis=1)

In [9]:
train_var[['monthly_repeated_sessions', 'B23008e21', 'B23008e23', 'B12001e11', 'B23008e26', 'B11005e14', 
           'B11005e17', 'B20004e9', 'B19101e7', 'B19101e4', 'B12001e15', 'B09002e19', 'B23008e27', 'B20004e7', 
           'B11016e10', 'B17012e31', 'B19101e8', 'C18108e5', 'B25012e8', 'B25012e15', 'B17020e4', 
           'B25012e16', 'B25012e10', 'B25012e6', 'B09002e17', 'B25012e12', 'B25012e11', 'B08301e6', 'B17012e6']].var().sort_values()

monthly_repeated_sessions    2.102694e+02
B25012e16                    7.232822e+02
B17012e6                     7.275718e+02
B09002e17                    1.453748e+03
B08301e6                     2.045454e+03
B19101e8                     2.053583e+03
B25012e8                     2.090486e+03
B19101e4                     2.286114e+03
B19101e7                     2.348009e+03
B23008e27                    4.038522e+03
B23008e23                    4.194277e+03
B11005e14                    7.171151e+03
B09002e19                    7.681787e+03
B12001e15                    8.560184e+03
B25012e15                    1.005072e+04
B17020e4                     1.206659e+04
B25012e6                     1.331326e+04
B17012e31                    1.455870e+04
B23008e26                    2.240136e+04
B25012e12                    3.503938e+04
B25012e11                    3.949364e+04
B23008e21                    4.559050e+04
B11016e10                    9.067735e+04
B11005e17                    1.383

In [12]:
train_var.var().sort_values(ascending=False).head(106)

historic_total_session_length    7.224510e+14
distance_to_U                    2.178945e+11
distance_to_O                    2.063218e+11
Democrats_08_Votes               1.786226e+11
Democrats_12_Votes               1.644906e+11
                                     ...     
B19059e3                         8.815859e+05
B13012e1                         8.394686e+05
B25012e2                         8.281461e+05
B13014e15                        8.007081e+05
B19055e3                         7.944908e+05
Length: 106, dtype: float64

#### RFE

In [7]:
lr = LogisticRegression(solver='liblinear')
rfe=RFE(estimator=lr, n_features_to_select=710)

In [None]:
rfe.fit(train_var.drop(columns=['climate']), y_train)

In [None]:
print(rfe.support)

In [None]:
print(rfe.ranking)

#### RFECV

In [None]:
lr = LogisticRegression(solver='liblinear')
rfe_cv=RFECV(estimator=lr, cv=3)

In [None]:
rfe_cv.fit(train_var.drop(columns=['climate']), y_train)

In [None]:
print(rfe_cv.n_features_)

In [None]:
print(rfe_cv.support)