# This is an exploration notebook of this competition.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
from sklearn.linear_model import LinearRegression
import lightgbm as lgbm

# Seed every RNG the notebook draws from (`random.sample` for the sampled
# investments, NumPy's global state for `DataFrame.sample`) so that
# Restart & Run All reproduces the same samples and figures.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Plot appearance shared by every figure below.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15,6)
# pd.set_option('display.max_columns', 200)  # uncomment to display every column of wide frames

## Load the <a href="https://www.kaggle.com/robikscube/ubiquant-parquet">dataset</a> from Parquet format.

In [None]:
%%time
# Read the training data from its Parquet copy; the resulting frame `df` is the
# single data source for every cell below.
df = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')

### Get an overview of the data

In [None]:
# Show the first rows to see which columns are present.
df.head()

In [None]:
# Print the frame summary: row count, column dtypes and memory usage.
df.info()

In [None]:
# List the dtype of every column.
df.dtypes

In [None]:
# Distinct time steps and distinct assets that occur in the training data.
timestamps = df['time_id'].nunique()
assets = df['investment_id'].nunique()
print(f"number of assets: {assets} \t with timestamp: {timestamps}")

In [None]:
# Smallest and largest investment_id seen in the training data.
lowest_id = df['investment_id'].min()
highest_id = df['investment_id'].max()
print(f"the range of assets are from {lowest_id} to {highest_id}")

#### The number of distinct assets is smaller than the range of asset ids, so some ids are missing from the training data. This implies that other assets may appear in the test dataset, or that new ones may be added later.

### Check if there are missing values

In [None]:
# Total number of missing cells in the frame: per-column counts, then summed again.
df.isnull().sum().sum()

## Plot the target variation with time
#### **Investment_id** represents a stock. Randomly choose 5 stocks to see the trend.

In [None]:
# Draw five distinct investment ids and overlay their target series on one chart.
sample = random.sample(list(df.investment_id.unique()), 5)
sampled_rows = df[df['investment_id'].isin(sample)]
# Reshape to one column per sampled investment, indexed by time_id.
target_by_asset = sampled_rows.set_index(['time_id', 'investment_id'])['target'].unstack()
target_by_asset.plot(alpha=0.5, title='Time Series Plot of sampled Target')

### How many target observations does each investment_id have?

In [None]:
# Number of non-null target rows recorded for each investment; reused by later cells.
obs_by_assets = df.groupby('investment_id')['target'].count()

# Histogram of those per-investment counts.
obs_hist_ax = obs_by_assets.plot.hist(bins=100)
obs_hist_ax.set_title('target by asset distribution')
plt.show()

### Check the distribution of investments vs. time.

In [None]:
# One dot per (time_id, investment_id) row; gaps show when an investment has no record.
# Only the two plotted columns are selected before plotting.
time_and_asset = df[['time_id', 'investment_id']]
time_and_asset.plot.scatter(x='time_id', y='investment_id', figsize=(20,30), s=0.5)
plt.show()

#### It shows some investments are more frequently recorded than others. It should be considered while modeling.

### What is the average target value of each investment_id?

In [None]:
# Average target of each investment (one value per investment_id) ...
mean_targets = df.groupby('investment_id')['target'].agg('mean')
# ... and the average of those per-investment averages.
mean_mean_targets = mean_targets.mean()

In [None]:
# Histogram of the per-investment mean targets.
mean_hist_ax = mean_targets.plot.hist(bins=100)
mean_hist_ax.set_title('target mean distribution')
plt.show()

print(f"Mean of all targets: {mean_mean_targets: 0.5f}")

#### The distribution of the per-investment target means is close to a normal distribution. Its mean is about -0.023, with some outliers on both the negative and positive sides, reaching roughly -0.4 and 0.8.

### The standard deviation of target in each investment

In [None]:
# Standard deviation of the target within each investment, and the mean of those values.
std_target = df.groupby('investment_id')['target'].agg('std')
mean_std_target = std_target.mean()

# Histogram of the per-investment standard deviations.
std_target.plot.hist(bins=100)
plt.show()
print(f"the average standard deviation of target: {mean_std_target: 0.5f}")

#### The distribution of the target's standard deviation is right-skewed, and there are still outliers on both sides.

### Does the number of recorded targets affect the mean value of the target?

In [None]:
# Scatter + fitted regression line of the per-investment mean target against the
# per-investment observation count, with marginal distributions on the sides.
blue_fit_line = {'line_kws': {'color': 'blue'}}
mean_joint = sns.jointplot(
    x=obs_by_assets,
    y=mean_targets,
    kind='reg',
    height=10,
    joint_kws=blue_fit_line,
)
mean_joint.set_axis_labels('observations', 'mean of target')
plt.show()

#### Through this joint plot of observations in each investment and mean target value in each investment, it shows there is a growing trend when the observations increase. Also, the dispersion of target values is more apparent when the number of recorded investments is relatively low. Let's plot the relation between observations and std to confirm this trend.

In [None]:
# Same joint plot as for the mean, but with the per-investment standard deviation
# of the target on the y-axis.
blue_fit_line = {'line_kws': {'color': 'blue'}}
std_joint = sns.jointplot(
    x=obs_by_assets,
    y=std_target,
    kind='reg',
    height=10,
    joint_kws=blue_fit_line,
)
std_joint.set_axis_labels('observations', 'STD of target')
plt.show()

#### The plot shows that the standard deviation of the target (i.e. the risk) is higher for investments with few observations.

### How does the number of investments vary over time?

In [None]:
# Count of distinct investments recorded at each time step, drawn as a line.
assets_per_time = df.groupby('time_id')['investment_id'].nunique()
assets_per_time.plot()
plt.title('number of unique assets by time')
plt.show()

In [None]:
# Three stacked panels on a shared grouping by time step: how many investments
# are recorded, the mean target, and the standard deviation of the target.
rows_by_time = df.groupby('time_id')
time_panels = [
    (rows_by_time['investment_id'].nunique(), "number of investment by time"),
    (rows_by_time['target'].mean(), "target mean value by time"),
    (rows_by_time['target'].std(), "STD of target by time"),
]

time_fig, time_axes = plt.subplots(3, 1, figsize=(15,8))
for panel_ax, (panel_series, panel_title) in zip(time_axes, time_panels):
    panel_series.plot(ax=panel_ax)
    panel_ax.set_title(panel_title)
time_fig.tight_layout()
plt.show()

In [None]:
# Pearson correlation, across time steps, between the number of distinct
# investments recorded and the mean target at that time step.
n_assets_by_time = df.groupby('time_id')['investment_id'].nunique()
mean_target_by_time = df.groupby('time_id')['target'].mean()
# corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is the coefficient.
r = np.corrcoef(n_assets_by_time, mean_target_by_time)[0, 1]
print(f'Correlation of numbers of investments with targets: {r: .3f}')

#### The correlation between the number of investments and the target value is negative (-0.184). Also, the target value fluctuates more severely when the number of investments decreases.

### From the mean-target-vs-time plot and the std-vs-time plot, it seems that the mean of the target is forced to zero and its standard deviation is standardized to one

In [None]:
# Mean and standard deviation of the target at every time step.
mean_target_byT = df.groupby('time_id')['target'].mean()
std_target_byT = df.groupby('time_id')['target'].std()

# Shaded band: one standard deviation either side of the per-time mean.
plt.fill_between(
        mean_target_byT.index,
        mean_target_byT - std_target_byT,
        mean_target_byT + std_target_byT,
        alpha = 0.1,
        color = 'b',
        label = 'Target mean ± 1 STD'
)
plt.plot(
        mean_target_byT.index,
        mean_target_byT.values, '.-', color='b', label = 'Target mean')
# Reference line: the average of the per-investment mean targets computed earlier.
plt.axhline(y=mean_mean_targets, color='r', linestyle='--', label='Total mean')
plt.xlabel('time')
plt.ylabel('target')
plt.title('Target mean and standard deviation by time')
# Without this call the `label=` arguments above are never drawn.
plt.legend()
plt.show()

## What if we correlate each feature with the number of observations of its investment?

In [None]:
# Per-row observation count: every row receives the number of target records
# of its investment. NOTE: despite its name, `target` here is NOT the
# competition target column; the name is kept because the next cell reads it.
#
# `Series.map` does one lookup per row against the `obs_by_assets` index.
# `Series.replace` with a dict of thousands of keys scans the column far more
# often and is much slower on a frame of this size, for the same result.
target = df['investment_id'].map(obs_by_assets).astype(np.int16)
# Feature columns: everything after the first four columns
# (presumably row_id, time_id, investment_id, target — confirm with df.columns).
features = df.columns[4:]

In [None]:
# Pearson correlation of every feature column with the per-row observation
# count held in `target`; the results are in the same order as `features`.
corrs = [np.corrcoef(target, df[col])[0, 1] for col in features]

# The count series has one entry per row of `df`; drop it once it is no longer needed.
del target

In [None]:
# Label each correlation with its feature name, then chart the 20 largest values.
corrs_feature_assetNum = pd.Series(data=corrs, index=features)
top20_ax = corrs_feature_assetNum.nlargest(20).plot.barh()
# Flip the axis so the largest correlation is drawn at the top.
top20_ax.invert_yaxis()
plt.show()

These features could be helpful for prediction. Let's take a closer look.

In [None]:
# Per-time-step mean of two features, with the per-time-step mean target drawn
# between them for comparison. Each panel is titled so it can be identified.
for position, column in enumerate(['f_164', 'target', 'f_276'], start=1):
    plt.subplot(3, 1, position)
    df.groupby('time_id')[column].mean().plot()
    plt.title(f'mean of {column} by time')
# Keep the panel titles from overlapping the x-axis labels of the panel above.
plt.tight_layout()
plt.show()

It shows that the trend of these features is similar to the target.

In [None]:
# Distribution of the raw values of feature f_164.
f_164_values = df['f_164']
f_164_values.hist(bins=100)
plt.show()

In [None]:
# Distribution of the raw values of feature f_276.
f_276_values = df['f_276']
f_276_values.hist(bins=100)
plt.show()

In [None]:
# Names of the 20 features with the largest correlation, ordered from highest to lowest.
corrs_descending = corrs_feature_assetNum.sort_values(ascending=False)
feature_top20 = corrs_descending.iloc[:20].index

### Check the performance of linear regression for features and target

In [None]:
# Random 0.1% of the rows (frac=0.001), used to keep the regression scatter plots light.
# No random_state is passed, so pandas draws from NumPy's global random state:
# the selected rows differ between runs unless that state is seeded beforehand.
sample_01precent = df.sample(frac=0.001)

In [None]:
# Peek at the first rows of the 0.1% sample.
sample_01precent.head()

In [None]:
# Regression scatter plots, on the 0.1% sample, of the target against the first
# six entries of `feature_top20`, laid out on a 3x2 grid.
fig, ax = plt.subplots(3,2)
# The loop variable is `feature_name`, not `sample`, so the list of sampled
# investment ids stored in `sample` earlier in the notebook is not overwritten.
for i, feature_name in enumerate(feature_top20[:6]):
    grid_row, grid_col = divmod(i, 2)
    sns.regplot(data=sample_01precent, x=feature_name, y='target', ax=ax[grid_row, grid_col]).set_title(f'{feature_name} Scatter Plot with Target')
fig.tight_layout()
# plt.show() rather than fig.show(): on the non-GUI inline notebook backend
# fig.show() only emits a "cannot show the figure" warning.
plt.show()

In [None]:
# Regression scatter plots, on the 0.1% sample, of the target against the last
# six entries of `feature_top20`, laid out on a 3x2 grid.
fig, ax = plt.subplots(3,2)
# The loop variable is `feature_name`, not `sample`, so the list of sampled
# investment ids stored in `sample` earlier in the notebook is not overwritten.
for i, feature_name in enumerate(feature_top20[-6:]):
    grid_row, grid_col = divmod(i, 2)
    sns.regplot(data=sample_01precent, x=feature_name, y='target', ax=ax[grid_row, grid_col]).set_title(f'{feature_name} Scatter Plot with Target')
fig.tight_layout()
# plt.show() rather than fig.show(): on the non-GUI inline notebook backend
# fig.show() only emits a "cannot show the figure" warning.
plt.show()

## Use linear regression to predict the target

In [None]:
# Design matrix: the 20 selected feature columns; response: the target column.
X = df[feature_top20]
y = df['target']

# Ordinary least squares on the full training frame.
model = LinearRegression().fit(X, y)
# R^2 on the same rows the model was fitted on (in-sample, not a validation score).
model.score(X, y)

In [None]:
# In-sample predictions placed next to the true target for a row-by-row comparison.
y_pred = model.predict(X)
# The column label is 'Prediction' without the stray trailing colon it had before.
comparison_table = pd.DataFrame({'Target': y, 'Prediction': y_pred})
comparison_table

In [None]:
# Competition submission API; presumably only importable inside the Kaggle
# competition environment — confirm before running elsewhere.
import ubiquant
env = ubiquant.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

# For every test batch: predict with the linear model fitted above, using the same
# `feature_top20` columns it was trained on, write the result into the provided
# submission frame, and hand that frame back to the environment.
for (test_df, sample_prediction_df) in iter_test:
    sample_prediction_df['target'] = model.predict(test_df[feature_top20])  # make your predictions here
    env.predict(sample_prediction_df)   # register your predictions