In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import gc

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgbm
from lightgbm import *
import time
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib

In [None]:
%%time
# Read the training set from the parquet file; the %%time cell magic reports how long the cell takes.
df_train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')
# Display the first rows as the cell output.
df_train.head()

In [None]:
# Print the frame summary (index range, column dtypes, memory usage).
df_train.info()

In [None]:
# Descriptive statistics for the columns of the training frame.
df_train.describe()

In [None]:
# Per-column dtypes of the training frame.
df_train.dtypes

# EDA

In [None]:
# Total number of missing cells, summed over every column of the training frame.
missing_total = df_train.isnull().sum().sum()
print(f'数据集缺失值：{missing_total}')

没有缺失值，随机抽取部分样本进行分析。

In [None]:
# Draw a 5% random sample of the training rows; the fixed random_state makes the draw repeatable.
df_sample = df_train.sample(frac=0.05,random_state=8)

## Categorical variable

查看time_id 与 target 之间的关系。

In [None]:
# Mean target per time_id on the sample, drawn as a scatter plot.
df_sample.sort_values(by='time_id', inplace=True)
mean_target_by_time = df_sample.groupby('time_id')['target'].mean()

f, ax = plt.subplots(figsize=(20, 6))
fig = sns.scatterplot(
    x=mean_target_by_time.index,
    y=mean_target_by_time.values,
    data=mean_target_by_time,
)
ax.set_ylabel('Mean Target')

在300~500之间 Target有较大波动。

查看investment_id 与 target 之间的关系。

In [None]:

# Mean target per investment_id on the sample, drawn as a scatter plot.
df_sample.sort_values(by='investment_id', inplace=True)
mean_target_by_investment = df_sample.groupby('investment_id')['target'].mean()

f, ax = plt.subplots(figsize=(20, 6))
fig = sns.scatterplot(
    x=mean_target_by_investment.index,
    y=mean_target_by_investment.values,
    data=mean_target_by_investment,
)
ax.set_ylabel('Mean Target')

investment_id 与 Target 相关性很小。

## Target value

查看Target特征直方图。

In [None]:

# Histogram with a KDE overlay of the sampled target.
# sns.distplot is deprecated and scheduled for removal from seaborn; its warning is
# invisible here because warnings are globally silenced. histplot with kde=True and
# stat="density" is the documented replacement and is drawn on the explicit axes.
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
sns.histplot(df_sample['target'], color="b", kde=True, stat="density", ax=ax)
ax.xaxis.grid(False)
# The bars are normalised to a density, so the y-axis is labelled accordingly.
ax.set(xlabel="Target", ylabel="Density", title="Target distribution")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Shape statistics of the sampled target: skewness and kurtosis.
target_sample = df_sample['target']
print(f"Skewness: {target_sample.skew():f}")
print(f"Kurtosis: {target_sample.kurt():f}")

查看Target特征箱型图，有很多的异常值。

In [None]:
# Box plot of the sampled target on a tall, narrow canvas, drawn on the axes created here.
f, ax = plt.subplots(figsize=(4, 8))
fig = sns.boxplot(y=df_sample['target'], ax=ax)

## Overview of training data

快速查看一部分的特征分布情况

In [None]:
# Histograms (50 bins each) of features f_0 .. f_14, laid out on a 5x5 subplot grid.
plt.figure(figsize=(20, 20))
for position, name in enumerate((f'f_{k}' for k in range(15)), start=1):
    plt.subplot(5, 5, position)
    plt.hist(df_sample[name], bins=50)
    plt.title(name)
plt.show()

In [None]:
# Scatter plots of features f_0 .. f_14 against the target, laid out on a 5x5 subplot grid.
plt.figure(figsize=(20, 20))
sampled_target = df_sample['target']
for position, name in enumerate((f'f_{k}' for k in range(15)), start=1):
    plt.subplot(5, 5, position)
    plt.scatter(df_sample[name], sampled_target)
    plt.title(name)
plt.show()

f_1、f_5、f_6 更接近正态分布，而其他特征可能有更多的离群值。

## Outliers

尝试移除部分离群值，提高模型准确度。

In [None]:
# Feature columns: every distinct column label except the identifier and target columns.
non_feature_cols = ["row_id", "time_id", "target", 'investment_id']
feature_cols = df_train.columns.unique().drop(non_feature_cols)
feature_cols

In [None]:

# Collect the index labels of rows that hold extreme feature values.
#
# Per feature column:
#   1. flag rows further than WIDE_SIGMA standard deviations from the column mean;
#   2. if there are none, fall back to NARROW_SIGMA standard deviations, but only
#      accept the result when fewer than MAX_NARROW_HITS rows qualify.
#
# The mean and std are computed once per column (they were previously recomputed for
# every comparison) and the rows are located with boolean masks instead of
# materialising full row subsets just to read their index. The narrow mask is only
# evaluated when the wide one found nothing. The selected rows are unchanged.
WIDE_SIGMA = 70
NARROW_SIGMA = 35
MAX_NARROW_HITS = 6  # exclusive upper bound on the number of narrow-threshold rows

outlier_list = []
outlier_col = []

for col in feature_cols:
    values = df_train[col]
    center = values.mean()
    spread = values.std()

    wide_mask = (values > center + spread * WIDE_SIGMA) | (values < center - spread * WIDE_SIGMA)
    n_wide = int(wide_mask.sum())
    if n_wide > 0:
        outlier_list.extend(values.index[wide_mask.to_numpy()].to_list())
        outlier_col.append(col)
        print(col, n_wide)
        continue

    narrow_mask = (values > center + spread * NARROW_SIGMA) | (values < center - spread * NARROW_SIGMA)
    n_narrow = int(narrow_mask.sum())
    if 0 < n_narrow < MAX_NARROW_HITS:
        outlier_list.extend(values.index[narrow_mask.to_numpy()].to_list())
        outlier_col.append(col)
        print(col, n_narrow)

# A row can be extreme in several columns; keep each index label once.
outlier_list = list(set(outlier_list))
print(len(outlier_col), len(outlier_list))

移除一些std比较大的离群值。

In [None]:

# Remove the collected outlier rows from the training frame.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps the cell
# re-runnable: a second execution no longer raises KeyError for labels already removed.
df_train = df_train.drop(index=outlier_list, errors="ignore")
df_train.head()

In [None]:

# Re-draw the feature-vs-target scatter plots without the flagged outlier rows.
# df_sample was drawn before the outliers were dropped from df_train, so plotting it
# unchanged would just repeat the earlier figure; filter the sample by index label first.
sample_without_outliers = df_sample.loc[~df_sample.index.isin(outlier_list)]

plt.figure(figsize=(20,20))
for i in range(15):
    plt.subplot(5,5,i+1)
    plt.scatter(sample_without_outliers[f'f_{i}'], sample_without_outliers['target'])
    plt.title(f'f_{i}')
plt.show()

## Target Correlation

In [None]:

# Absolute correlation of each column with the target, strongest first.
# The target itself and row_id (an identifier, not a feature) are excluded by name.
# The previous positional slice (.iloc[:-1]) removed the LAST column of the result,
# which is a feature column rather than the target, so one feature was silently
# missing from the ranking.
candidate_cols = df_sample.drop(columns=['row_id', 'target'])
correlations = candidate_cols.corrwith(df_sample['target']).to_frame()
correlations['Abs Corr'] = correlations[0].abs()
sorted_correlations = correlations.sort_values('Abs Corr', ascending=False)['Abs Corr']

# Keep only the columns whose absolute correlation reaches the 0.04 display threshold.
strong_correlations = sorted_correlations[sorted_correlations >= .04].to_frame()

fig, ax = plt.subplots(figsize=(6,8))
sns.heatmap(strong_correlations, cmap='inferno', annot=True, vmin=-1, vmax=1, ax=ax)
plt.title('Feature Correlations With Target')
plt.show()

特征与Target之间的相关性很小，需要继续挖掘。

## Correlation Between Features

In [None]:

# Pairwise correlation matrix of the columns from the fifth one onward, shown as a
# hierarchically clustered heat map.
feature_sample = df_sample.iloc[:, 4:]
corr = feature_sample.corr()

sns.clustermap(corr, metric="correlation", cmap="inferno", figsize=(20, 20))
plt.suptitle('Correlations Between Features', fontsize=24, weight='bold')
plt.show()

一些特征之间有很强的相关性。

In [None]:

# Ten most strongly correlated feature pairs (by absolute correlation).
# Each unordered pair appears twice in the full matrix, and the diagonal pairs every
# feature with itself. Masking everything except the strict upper triangle keeps each
# pair exactly once. The previous "every second row after sorting" trick relied on the
# two mirrored entries being adjacent after the sort, which is not guaranteed when
# several pairs share the same correlation value.
corr = corr.abs()

upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
pair = (
    corr.where(upper_triangle)
    .stack()
    .dropna()  # masked cells (and undefined correlations) carry no pair
    .sort_values(ascending=False)
    .head(10)
    .rename_axis(['feature_a', 'feature_b'])
    .reset_index(name='correlation')
)
pair

有些特征之间存在很强的相关性，继续查看f_228与f_262的散点图。

In [None]:

# Scatter plot of f_228 against f_262 on the sample.
pair_sample = df_sample[['f_228', 'f_262']]
sns.scatterplot(data=pair_sample, x='f_228', y='f_262')

两者呈现强相关性，建模时应避免出现多重共线性。

# Dimension Reduction

用PCA减少特征数量。

In [None]:

# Fit a standardise-then-PCA pipeline on the sampled columns from the fifth one onward,
# persist the fitted pipeline to disk, and project the sample itself.
features = list(df_sample.columns[4:])

scaler_step = ('scaler', StandardScaler())
pca_step = ('pca', PCA(n_components=0.9))  # fractional n_components: variance-based component selection
pipe = Pipeline([scaler_step, pca_step])

sample_features = df_sample[features]
pipe.fit(sample_features)
joblib.dump(pipe, "./pipe.joblib")
pca_samples = pipe.transform(sample_features)

In [None]:

# Explained-variance chart for the fitted PCA step:
#   bars         = variance ratio of each individual component,
#   dashed steps = cumulative variance ratio.
# x and y are passed to sns.barplot by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the previous positional call raises TypeError there. The x-range
# follows the actual number of retained components instead of a hard-coded 0..99 window.
explained_ratio = pipe.named_steps['pca'].explained_variance_ratio_
n_components = len(explained_ratio)
component_idx = list(range(n_components))

fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(component_idx, explained_ratio.cumsum(), linestyle='--', drawstyle='steps-mid',
        label='Cumulative Explained Variance', linewidth = 1.5)
sns.barplot(x=component_idx, y=explained_ratio,
            alpha=0.85, label='Individual Explained Variance', edgecolor='black',
            saturation = 2, linewidth = 0.5, ax=ax)
plt.ylabel('Explained Variance Ratio', fontsize = 14, fontname = 'monospace', weight='semibold')
plt.xlabel('Number of Principal Components', fontsize = 14, fontname = 'monospace', weight='semibold')
ax.set_title('Explained Variance', fontsize = 20, fontname = 'monospace', weight='bold')
plt.xticks(fontsize=8, rotation=90)
plt.legend(fontsize = 13)
# Bars sit at integer positions 0..n-1; pad by half a bar on each side.
ax.set_xlim(-0.5, n_components - 0.5)
ax.set_ylim(0, 1)
plt.show()
print('降维后保留特征数：{}'.format(n_components))

In [None]:

# Split the training frame into the identifier/target columns and the remaining
# feature columns, then release the original frame.
meta_cols = ['row_id', 'time_id', 'investment_id', 'target']
df_Target = df_train[meta_cols]
df_features = df_train.drop(columns=meta_cols)

del df_train
gc.collect()

In [None]:
# Project the feature columns through the fitted pipeline and wrap the result in a DataFrame.
df_pca = pd.DataFrame(pipe.transform(df_features), index=None)

# Reset both frames to a fresh positional index.
df_pca, df_Target = (frame.reset_index(drop=True) for frame in (df_pca, df_Target))

In [None]:
# Column-wise concatenation of the identifier/target frame with the PCA components.
df_train = pd.concat([df_Target,df_pca],axis=1)
# Display the first rows of the rebuilt training frame.
df_train.head()

# Further memory reduce

减少数据内存，减少产生数据错误。

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes and report the memory saved.

    The frame is modified column by column and the same object is returned.

    * Signed integer columns are cast to the smallest signed integer type whose
      range contains the column's min and max (bounds inclusive).
    * Unsigned integer columns are cast to the smallest unsigned integer type that
      fits. They were previously routed through the float branch and could end up
      as float16, which cannot represent every integer above 2048 exactly.
    * Float columns are cast to float16 (only if ``allow_float16``), else float32,
      else float64, based purely on the value range. Note that float16 keeps only
      about three significant decimal digits.
    * Object columns are converted to ``category``.
    * Every other dtype (bool, datetime, category, pandas extension dtypes, ...)
      is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; mutated in place.
    allow_float16 : bool, default True
        When False, float columns are never narrowed below float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer / float columns are range-checked; min()/max()
        # and a numeric cast are not meaningful for the remaining dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Strict bounds: +/-inf never fits a narrower type and falls through to float64.
            new_type = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    new_type = candidate
                    break
        else:
            new_type = None
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    new_type = candidate
                    break
            if new_type is None:
                # e.g. an empty column, whose min/max are NaN: keep the current dtype.
                continue

        df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum() / 1024**2

    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting footprint.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

# Downcast the training frame; the function prints the memory usage before and after.
df_train = reduce_mem_usage(df_train)

In [None]:
# Display the first rows of the training frame after the dtype downcast.
df_train.head()

# Base Model

使用lightgbm建模。

In [None]:
# Model inputs (every column from the fifth onward) and the regression target.
x = df_train.iloc[:, 4:]
y = df_train['target'].copy()

# LightGBM regressor configured with the regression objective, RMSE metric and 300 estimators.
lgbm_params = dict(
    objective="regression",
    metric="rmse",
    n_estimators=300,
)
model_entire = lgbm.LGBMRegressor(**lgbm_params)

In [None]:
# Drop the name bound to the full training frame (x and y were extracted from it) and
# trigger a garbage-collection pass before training.
del df_train
gc.collect()

In [None]:
%%time
# Fit the regressor on the PCA components; the %%time cell magic reports the training time.
model_entire.fit(x,y)

In [None]:
# Persist the fitted model to the working directory.
joblib.dump(model_entire, "./model_entire.joblib")

# Prediction and Submission

In [None]:
# Create the ubiquant environment and obtain its test iterator.
# NOTE(review): presumably this is the competition's inference API and only importable
# inside the competition runtime - confirm.
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test()

In [None]:
# Display the repr of the test iterator as the cell output.
iter_test

In [None]:
# Load the fitted model and the scaler+PCA pipeline from ../input/ubiquant/.
# NOTE(review): these paths differ from the ./model_entire.joblib and ./pipe.joblib
# files written by the joblib.dump calls earlier in this notebook - presumably the
# artefacts are re-uploaded as an input dataset; confirm they come from the same run.
model_entire = joblib.load("../input/ubiquant/model_entire.joblib")
pipe = joblib.load("../input/ubiquant/pipe.joblib")

In [None]:
# For every batch yielded by iter_test: keep the feature columns, project them with
# the loaded pipeline, predict with the loaded model, and hand the filled-in
# prediction frame back to the environment.
for test_df, sample_prediction_df in iter_test:
    projected = pipe.transform(test_df[feature_cols])
    test_df = pd.DataFrame(projected, index=None)
    sample_prediction_df["target"] = model_entire.predict(test_df)

    display(test_df)
    display(sample_prediction_df)

    env.predict(sample_prediction_df)