In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 50)

%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the competition training data from the Kaggle input directory.
train_set = pd.read_csv('../input/train.csv')
# test_set = pd.read_csv('../input/test.csv')
# Preview only the first rows as the cell's last expression: the notebook's
# rich table rendering is easier to read than a plain-text print() dump.
train_set.head()

First, let us explore the dataset to get a basic impression of our data.

In [None]:
# Print pandas' structural summary of the training frame (size, dtypes, memory usage).
train_set.info()

There are 4459 samples in our data and each sample has 4993 columns, of which 4991 columns can be regarded as features; ID has no special meaning and target is the value we are going to predict.
Then, let's see if there are missing values in this dataset.

In [None]:
# Count the null entries in every column of train_set and list the columns
# from most to fewest missing values, as a two-column frame
# ('column' = column name, 'count' = number of nulls).
missing_set = (
    train_set.isnull()
    .sum(axis=0)
    .rename('count')
    .rename_axis('column')
    .reset_index()
    .sort_values(by='count', ascending=False)
)
missing_set

Now we can say there are no missing values in this dataset, so we do not need to do any preprocessing for missing values in the following analysis.
Next, let's dive deeper into our data.

In [None]:
# Summary statistics with one row per described column (transposed describe()).
desc = train_set.describe().transpose()
# Names of the columns whose standard deviation is exactly zero, i.e. constant columns.
constant_index = desc.loc[desc['std'].eq(0)].index
constant_index

We have 256 columns whose standard deviation equals 0, meaning that each of these columns holds a constant value and has nothing to do with our target. So, in the coming analysis, we will remove these columns to reduce the dimensionality of our data.

In [None]:
# Remove the zero-variance columns identified in constant_index.
train_set = train_set.drop(columns=constant_index.tolist())

In [None]:
# 2x2 grid: the top row holds distribution plots, the bottom row box plots;
# the left column uses the raw target, the right column its natural log.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
target_raw = train_set["target"]
target_log = np.log(target_raw)
sns.distplot(target_raw, ax=axes[0, 0])
sns.distplot(target_log, ax=axes[0, 1])
sns.boxplot(data=target_raw, ax=axes[1, 0])
sns.boxplot(data=target_log, ax=axes[1, 1])

Apparently, our target has a right-skewed distribution. If we display it as a histogram and a box plot, we can see that it has a long tail which could even be regarded as outliers, and most of its values are concentrated in a narrow range. But after a log transformation, our target has a much better distribution, very close to a normal distribution.

In [None]:
# Absolute pairwise Pearson correlation between the numeric columns.
# train_set still contains the string "ID" column at this point; selecting the
# numeric (and boolean) columns explicitly keeps .corr() from failing on pandas
# versions that no longer drop non-numeric columns silently, and yields the
# same matrix on versions that did.
corr = train_set.select_dtypes(include=["number", "bool"]).corr().abs()

In [None]:
# Flag every absolute correlation that equals 1 up to floating-point round-off.
# An exact `== 1` comparison can miss perfectly correlated pairs whose computed
# coefficient comes out as e.g. 0.9999999999999999; NaN entries stay False.
is_perfect = pd.DataFrame(
    np.isclose(corr.values, 1.0, rtol=0.0, atol=1e-9),
    index=corr.index,
    columns=corr.columns,
)
# For each column, count how many columns (itself included) it is perfectly
# correlated with, and list the columns from highest to lowest count.
count = (
    is_perfect.sum(axis=0)
    .rename('count')
    .rename_axis('column')
    .reset_index()
    .sort_values(by='count', ascending=False)
)
count

In [None]:
# Boolean mask of columns that have a correlation of 1 with at least one
# column other than themselves (the diagonal accounts for the first hit).
lg_1 = count['count'].gt(1)
# Number of such columns.
lg_1.sum()

We can see that in the correlation matrix, not only the values on the diagonal are equal to 1, but some values in other positions are equal to 1 as well. We can conclude that many columns in our data must be identical or linearly dependent on each other, so we have to remove those duplicated columns. As we have a 4736*4736 correlation coefficient matrix, finding the values equal to 1 and then removing the corresponding columns from our data one by one would be tough work. So PCA can be applied to our data to remove redundant features. Before that, we will look at the 1000 largest correlation coefficients and decide how many principal components to keep in our new data.

In [None]:
# Rows of the correlation matrix for the 1000 columns with the largest
# absolute correlation to 'target'.
n_largest = corr.nlargest(1000, 'target')
n_largest

It shows that more than 3000 features have a correlation coefficient of less than 0.036, which means they have almost nothing to do with our target, so in our PCA analysis, keeping 500 components may be enough.

In [None]:
# Take the 10 rows with the largest absolute correlation to 'target', then
# restrict the columns to those same 10 names to obtain a square sub-matrix.
heat = corr.nlargest(10, 'target')
columns = list(heat.index)
heat = heat[columns]
# Annotated heat map of the sub-matrix (values shown with two decimals).
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    heat,
    ax=ax,
    square=True,
    cbar=True,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 10},
    xticklabels=heat.index,
    yticklabels=heat.columns,
)

We draw a heat map to show the ten features most strongly correlated with our target. And now, we will apply PCA to our data.

In [None]:
from sklearn.decomposition import PCA
# Keep the label aside, then strip both the identifier and the label so that
# train_set holds feature columns only.
target = train_set['target']
train_set = train_set.drop(columns=["ID", "target"])
train_set

In [None]:
# Project the feature matrix onto its first 500 principal components.
# random_state is pinned because, for a matrix this large, scikit-learn may
# select a randomized SVD solver; without a seed the projection (and every
# score computed from it) would differ between runs.
pca = PCA(n_components=500, random_state=42)
new_set = pca.fit_transform(train_set)
new_set

PCA has been applied to our data, and a new data set with only 500 features has been created. Next, we will use linear regression (LR) and LightGBM (LGB) models to predict the target value from this new data set.

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
# Model the natural log of the target rather than the raw value.
new_target = np.log(target)
# 5-fold cross-validation of an ordinary least-squares model on the PCA
# features, reporting training scores alongside the test scores.
linear_model = LinearRegression()
linear_scores = cross_validate(
    estimator=linear_model,
    X=new_set,
    y=new_target,
    cv=5,
    return_train_score=True,
)
linear_scores


In [None]:
import lightgbm as lgb 
from sklearn.model_selection import train_test_split
# Hold out 20% of the PCA features / log-target pairs for validation.
lgb_train, lgb_test, lgb_target_train, lgb_target_test = train_test_split(
    new_set, new_target, test_size=0.20, shuffle=True, random_state=42
)

# The label is a continuous value (log of the target), so this is a regression
# problem. The previous settings ('binary' objective, 'auc' metric and
# 'scale_pos_weight') are classification-only and do not fit such a label.
param = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 32,
    'learning_rate': 0.02,
    'verbose': 0,
    'lambda_l1': 1,
}

lgtrain = lgb.Dataset(lgb_train, lgb_target_train)
# Tie the validation set to the training set so both share the same feature binning.
lgvalid = lgb.Dataset(lgb_test, lgb_target_test, reference=lgtrain)

# Train with early stopping on the validation RMSE, logging every 100 rounds.
# The variable keeps its original name even though the model is now a regressor.
lgb_clf = lgb.train(
    param,
    lgtrain,
    num_boost_round=10000,
    valid_sets=[lgtrain, lgvalid],
    valid_names=['train', 'valid'],
    early_stopping_rounds=100,
    verbose_eval=100
    )
# lgb.cv(param, lgb_train, 5, nfold=5)