### Preprocessing data

In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
# Importing the dataset
import time
start = time.time()

df = pd.read_csv("./train_sample.csv")
#df = pd.read_csv("./train.csv")

end = time.time()
print('import_time',end - start)


import_time 0.11452484130859375


#### Upsampling

In [20]:
#upsample minority class with replacement
from sklearn.utils import resample

majority_count = df['is_attributed'].value_counts()[0]

# Separate majority and minority classes
df_majority = df[df.is_attributed==0]
df_minority = df[df.is_attributed==1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=majority_count,    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_up = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_up.is_attributed.value_counts()
# 1    576
# 0    576
# Name: balance, dtype: int64


1    99773
0    99773
Name: is_attributed, dtype: int64

#### Test on Logistic Regression

In [21]:
# Separate input features (X) and target variable (y)
y_u = df_up['is_attributed']
X_u = df_up.drop(['click_time', 'attributed_time','is_attributed'], axis=1)


import sklearn.linear_model
from sklearn.metrics import accuracy_score

# Train model
clf_1 = sklearn.linear_model.LogisticRegression().fit(X_u, y_u)
 
# Predict on training set
pred_y_1 = clf_1.predict(X_u)
 
# Is our model still predicting just one class?
print( np.unique( pred_y_1 ) )
# [0 1]
 
# How's our accuracy?
print( accuracy_score(y_u, pred_y_1) )
# 0.513888888889

[0 1]
0.7419642588676295


#### Downsampling

In [22]:
minority_count = df['is_attributed'].value_counts()[1]

# Separate majority and minority classes
df_majority = df[df.is_attributed==0]
df_minority = df[df.is_attributed==1]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=minority_count,     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_down = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
df_down.is_attributed.value_counts()
# 1    49
# 0    49
# Name: balance, dtype: int64

1    227
0    227
Name: is_attributed, dtype: int64

#### Test on Logistic Regression

In [23]:
# Separate input features (X) and target variable (y)
y_d = df_down['is_attributed']
X_d = df_down.drop(['click_time', 'attributed_time','is_attributed'], axis=1)


import sklearn.linear_model

# Train model
clf_1 = sklearn.linear_model.LogisticRegression().fit(X_d, y_d)
 
# Predict on training set
pred_y_1 = clf_1.predict(X_d)
 
# Is our model still predicting just one class?
print( np.unique( pred_y_1 ) )
# [0 1]
 
# How's our accuracy?
print( accuracy_score(y_d, pred_y_1) )
# 0.513888888889

[0 1]
0.7775330396475771


In [30]:
# uncomment for upsampling
X=X_u
y=y_u

# uncomment for downsampling
#X=X_d
#y=y_d

### This is an attempt at implementing LightGBM

In [38]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Feature Scaling
print(x_train)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

           ip  app  device   os  channel
11680    5348   29       1   19      343
73233  204203   19       0    0      213
10013  254101   19      33   29      213
99562    6481    5       1   13      377
40687  247999   10       1   36      113
77870  243534   19      21   24      213
22027  121486    2       1   41      435
48972   79045   15       2   17      245
94265  140691   18       1   19      121
11861   93054   18       1   13      107
17962  272901   72       1    3      101
52481   84661   13       1   18      477
81056   83309   35       1   19      274
36902   65214   19       0   29      347
35982    7909    8       1   13      145
63794   26870    3       1   11      466
78399    5314   10       1    1      113
92322   88914   32       1   19       21
39163  201182   15       1   17      245
80228   42841    1       1   20      124
46733  126254    3       1   19      371
33038  210641    8       1   13      145
4766   202255    5       1   27      113
41993   35308   

In [32]:
"""
import lightgbm as lgb
d_train = lgb.Dataset(x_train, label=y_train)
params = {}
params['learning_rate'] = 0.003
#boosting type: rf, gbdt, dart, goss
params['boosting_type'] = 'rf'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 1024
params['min_data'] = 50
params['max_depth'] = 10
#feature_fraction if using random forest
params['feature_fraction'] = 0.8
#fraction = 1-(1/e)
params['bagging_fraction'] = 0.632
#lambda specifies regularization. Between 0 and 1
params['lambda'] = 0.5
#default:64
params['max_cat_group'] = 64

clf = lgb.train(params, d_train, 100)
"""

import lightgbm as lgb
d_train = lgb.Dataset(x_train, y_train)
params = {}
params['learning_rate'] = 0.003
#boosting type: rf, gbdt, dart, goss
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['feature_fraction'] = 0.8
params['num_leaves'] = 1024
params['min_data'] = 50
params['max_depth'] = 5
#for rf
#params['bagging_freq'] = 1
#params['bagging_fraction'] = 0.632

start = time.time()
clf = lgb.train(params, d_train, 10000)
end = time.time()
print('time',end - start)


time 30.745941162109375


In [37]:
#df_full = pd.read_csv("./train.csv")



array([[-0.84410285, -0.10789885,  0.00486814, -0.03177196, -0.19260207],
       [-0.52293521, -0.68694472, -0.09788207,  0.25300808,  1.45666106],
       [ 0.55072177,  0.6159085 , -0.09788207, -0.09649469, -1.06804031],
       ...,
       [ 0.78670673, -0.68694472, -0.09788207, -0.01882741,  1.04239117],
       [-0.26078156, -0.50599288, -0.09788207, -0.2647738 , -0.72411815],
       [-0.3644112 ,  0.47114703, -0.09788207, -0.14827288,  0.28419912]])

In [33]:
#Prediction
y_pred=clf.predict(x_test)
#convert into binary values
for i in range(0,len(y_pred)):
    if y_pred[i]>=.3431:       # setting threshold to .5
       y_pred[i]=1
    else:  
       y_pred[i]=0

In [34]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
#Accuracy
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_pred,y_test)
print('cm',cm)
print('accuracy',accuracy)

cm [[24615   191]
 [    0 25081]]
accuracy 0.9961713472447732


In [35]:
print( np.unique( y_pred ) )

[0. 1.]
