In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%matplotlib inline
from IPython.core.display import display, HTML, Javascript

html_contents ="""
<!DOCTYPE html>
<html lang="en">
    <head>
        <link rel="stylesheet" href="https://www.w3schools.com/w3css/4/w3.css">
        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Raleway">
        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Oswald">
        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Open Sans">
        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
        <style>
        .title-section{
            font-family: "Roboto", Verdana, sans-serif;
            font-weight: bold;
            color: "#8B008B";
            letter-spacing: 8px;
        }
        hr { border: 1px solid #E58F65 !important;
             color: #E58F65 !important;
             background: #E58F65 !important;
           }
        body {
            font-family: "Open Sans", sans-serif;
            }        
        </style>
    </head>    
</html>
"""

HTML(html_contents)

#  <span class="title-section w3-xxlarge" style="color:#FF0080"> Ubiquant Market Prediction - Brief Intro </span>

### <span class="title-section w3-large" style="color:magenta">To be updated.</span>



#  <span class="title-section w3-xxlarge" style="color:#FF0080"> 1. Import Libraries</span>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import dask.dataframe as dd
import datatable as dt

import gc  

import seaborn as sns
from scipy.stats import pearsonr
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Global plotting defaults for the notebook: seaborn's whitegrid axes style,
# then matplotlib's "fivethirtyeight" style sheet applied after it.
sns.set_style('whitegrid')
plt.style.use("fivethirtyeight")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# For reading stock data from yahoo
from pandas_datareader.data import DataReader
   
# For time stamps
from datetime import datetime
from math import sqrt
from math import sqrt


# Suppress all warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn;
# consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

#  <span class="title-section w3-xxlarge" style="color:#FF0080"> 2. Load Dataset</span>

In [None]:
%%time
# Read the training set from the parquet file in the Kaggle input directory;
# %%time reports how long the load takes.
train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')




In [None]:
# Print a concise summary of the training frame (index, columns, dtypes, memory usage).
train.info()

In [None]:
# Show the dtype of every column.
train.dtypes




In [None]:


# Preview the first 10 rows of the training frame.
train.head(10)

In [None]:
# Count how often each distinct `target` value occurs.
# NOTE(review): if `target` is continuous, most values will be unique and
# `train['target'].describe()` may be more informative — confirm intent.
train['target'].value_counts()

In [None]:
# Seed NumPy's global random generator so the np.random.choice calls in later
# cells draw the same ids on every Restart & Run All.
np.random.seed(2110)
# Optional down-sampling for faster iteration (disabled).
# NOTE(review): `test` is not defined in the cells shown here.
#train = train.sample(10000)
#test = test.sample(10000)

In [None]:
# Histogram of the `target` column using 100 bins.
train['target'].hist(bins = 100, figsize = (20,10))

In [None]:
# Number of distinct investment ids in the training set.
train['investment_id'].nunique()

In [None]:

# Disabled experiment: one target histogram per investment_id on a 529 x 6
# subplot grid. Kept as real comments (not a bare string literal) so the cell
# no longer echoes the code text as its output.
#
# #  Categorical Data
# a = 529  # number of rows
# b = 6  # number of columns
# c = 1  # initialize plot counter
#
# fig = plt.figure(figsize=(14,10))
#
# for f in train['investment_id'].unique():
#     plt.subplot(a, b, c)
#     train[train['investment_id'] == f]['target'].hist(bins = 100, alpha = 0.2, figsize = (20,10))
#     #plt.title('{}, subplot: {}{}{}'.format(i, a, b, c))
#     plt.xlabel(f)
#     c = c + 1
#
# plt.show()

In [None]:
sns.set_theme(style="whitegrid")

# Rows per investment_id, most frequent first, as a two-column frame so that
# every bar's id and its count come from the same row. (Passing the raw count
# Series next to `data=train` would align it against train's row index.)
id_counts = (
    train['investment_id']
    .value_counts()
    .sort_values(ascending=False)
    .rename_axis('investment_id')
    .reset_index(name='n_rows')
)

# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(6, 15))

# One bar per investment_id showing its number of rows, ordered by frequency.
sns.set_color_codes("pastel")
sns.barplot(data=id_counts, x="investment_id", y="n_rows",
            order=id_counts["investment_id"],
            label="Total", color="b", ax=ax)

In [None]:
# Quick pandas bar chart of the number of rows per investment_id.
train['investment_id'].value_counts().plot(kind = 'barh',figsize = (20,40))

plt.figure(figsize = (20,200))

plt.style.use('ggplot')

# Take both the ids and their counts from the same value_counts() result.
# unique() returns ids in order of first appearance while value_counts() is
# sorted by frequency, so mixing the two pairs bars with the wrong ids.
values = train['investment_id'].value_counts()
inv = values.index

plt.barh(inv, values.values)
plt.title('Investment ID counts')
plt.ylabel('investment_id', size = 8)
plt.xlabel('Count')
plt.show()

In [None]:
# Pairwise Pearson correlation between the numeric columns of the training set.
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (older versions dropped them silently).
# NOTE(review): presumably `train` carries a string identifier column — confirm.
correlation = train.select_dtypes(include='number').corr()

In [None]:
import seaborn as sns

# Clustered heatmap of the correlation matrix, drawn with seaborn's
# cubehelix palette as a continuous colormap.
cubehelix_cmap = sns.cubehelix_palette(as_cmap=True)
sns.clustermap(correlation, cmap=cubehelix_cmap, figsize=(20, 40))

In [None]:
# Distribution of `target`: density-normalised histogram with a KDE overlay.
# sns.histplot(..., stat='density', kde=True) replaces the deprecated
# sns.distplot.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(train['target'], bins=5000, stat='density', kde=True, ax=ax)
ax.set_xlim(-3, 3)  # zoom in on the central range of the distribution
ax.set_xlabel("Histogram of the Internal Rate of Return(IRR) values", size=18)
plt.show()
# Release memory held by the plotting intermediates; the trailing semicolon
# keeps the collected-object count out of the cell output.
gc.collect();

##  <span class="title-section w3-xlarge" style="color:#FF0080"> Data features</span>

####  <span class="title-section w3-large" style="color:#FF0080"> Features</span>




In [None]:
from itertools import cycle
plt.style.use("ggplot")
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(color_pal)

# Draw up to 25 distinct investment ids. replace=False prevents the same id
# from being plotted twice, and the size is capped at the number of ids.
all_ids = train['investment_id'].unique()
sampled_ids = np.random.choice(all_ids, size=min(25, len(all_ids)), replace=False)

for investment_id in sampled_ids:
    # Filter the rows for this id once and reuse them for both figures.
    d = train.query('investment_id == @investment_id')

    # Figure 1: target over time_id.
    d.set_index('time_id')['target'] \
        .plot(figsize=(15, 5),
              title=f'Investment_id {investment_id}',
              color=next(color_cycle),
              style='.-')
    plt.show()

    # Figure 2: running sum of target over time_id.
    plt.figure(figsize=(15,5))
    cumReturn = d['target'].cumsum()
    time_id = d['time_id']
    plt.plot(time_id, cumReturn,  color=next(color_cycle), lw=2)
    plt.title(f'Investment_id {investment_id} cumulative returns')
    plt.ylabel(f'Return {investment_id}', fontsize=18)
    plt.show()

In [None]:
# Overlay the target histograms of up to 120 distinct investment ids on one
# axes. replace=False stops the same id from being drawn (and overplotted)
# more than once; the size is capped at the number of available ids.
all_ids = train['investment_id'].unique()
for f in np.random.choice(all_ids, size=min(120, len(all_ids)), replace=False):
    train.loc[train['investment_id'] == f, 'target'].hist(bins = 50, alpha = 0.2, figsize = (20,10))

#  <span class="title-section w3-xxlarge" style="color:#FF0080"> 3. Basic EDA</span>

In [None]:
# 10 x 6 grid of KDE panels, one per feature f_0 .. f_59.
fig, axes = plt.subplots(nrows=10, ncols=6, figsize=(14, 14))
axes = axes.flatten()

for idx, ax in enumerate(axes):
    feature = f'f_{idx}'
    sns.kdeplot(data=train, x=feature, fill=True, ax=ax)

    # Remove ticks and axis labels; the panel title names the feature.
    ax.set(xticks=[], yticks=[], xlabel='', ylabel='')

    ax.spines['left'].set_visible(False)
    if feature in train.columns:
        # Show all four spines again (including the left one hidden above)
        # and draw the frame in red.
        ax.spines[:].set_visible(True)
        ax.spines[:].set_color('red')
    ax.set_title(feature, loc='right', weight='bold', fontsize=10)

fig.supxlabel('Feature Distribution (by feature f_0-f_59)', ha='center', fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# 10 x 6 grid of KDE panels, one per feature f_60 .. f_119.
fig, axes = plt.subplots(10,6,figsize=(14, 14))
axes = axes.flatten()

# Start the counter at 60 so the panels cover f_60..f_119, matching the figure
# label; counting from 0 would redraw f_0..f_59.
for idx, ax in enumerate(axes, 60):
    sns.kdeplot(data=train, x=f'f_{idx}', 
                fill=True, 
                ax=ax)

    # Remove ticks and axis labels; the panel title names the feature.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    if f'f_{idx}' in train.columns:
        # Show all four spines again and draw the frame in red.
        ax.spines[:].set_visible(True)
        ax.spines[:].set_color('red')
    ax.set_title(f'f_{idx}', loc='right', weight='bold', fontsize=10)

fig.supxlabel('Feature Distribution (by feature f_60-f_119)', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

In [None]:
# 10 x 6 grid of KDE panels, one per feature f_120 .. f_179.
fig, axes = plt.subplots(nrows=10, ncols=6, figsize=(14, 14))
axes = axes.flatten()

for idx, ax in enumerate(axes, start=120):
    feature = f'f_{idx}'
    sns.kdeplot(data=train, x=feature, fill=True, ax=ax)

    # Remove ticks and axis labels; the panel title names the feature.
    ax.set(xticks=[], yticks=[], xlabel='', ylabel='')

    ax.spines['left'].set_visible(False)
    if feature in train.columns:
        # Show all four spines again (including the left one hidden above)
        # and draw the frame in red.
        ax.spines[:].set_visible(True)
        ax.spines[:].set_color('red')
    ax.set_title(feature, loc='right', weight='bold', fontsize=10)

fig.supxlabel('Feature Distribution (by feature f_120-f_179)', ha='center', fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# 10 x 6 grid of KDE panels, one per feature f_180 .. f_239.
fig, axes = plt.subplots(nrows=10, ncols=6, figsize=(14, 14))
axes = axes.flatten()

for idx, ax in enumerate(axes, start=180):
    feature = f'f_{idx}'
    sns.kdeplot(data=train, x=feature, fill=True, ax=ax)

    # Remove ticks and axis labels; the panel title names the feature.
    ax.set(xticks=[], yticks=[], xlabel='', ylabel='')

    ax.spines['left'].set_visible(False)
    if feature in train.columns:
        # Show all four spines again (including the left one hidden above)
        # and draw the frame in red.
        ax.spines[:].set_visible(True)
        ax.spines[:].set_color('red')
    ax.set_title(feature, loc='right', weight='bold', fontsize=10)

fig.supxlabel('Feature Distribution (by feature f_180-f_239)', ha='center', fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# 10 x 6 grid of KDE panels, one per feature f_240 .. f_299.
fig, axes = plt.subplots(nrows=10, ncols=6, figsize=(14, 14))
axes = axes.flatten()

for idx, ax in enumerate(axes, start=240):
    feature = f'f_{idx}'
    sns.kdeplot(data=train, x=feature, fill=True, ax=ax)

    # Remove ticks and axis labels; the panel title names the feature.
    ax.set(xticks=[], yticks=[], xlabel='', ylabel='')

    ax.spines['left'].set_visible(False)
    if feature in train.columns:
        # Show all four spines again (including the left one hidden above)
        # and draw the frame in purple.
        ax.spines[:].set_visible(True)
        ax.spines[:].set_color('purple')
    ax.set_title(feature, loc='right', weight='bold', fontsize=10)

fig.supxlabel('Feature Distribution (by feature f240-f299)', ha='center', fontweight='bold')
fig.tight_layout()
plt.show()

##  <span class="title-section w3-xlarge" style="color:#FF0080"> Work In Progress</span>