# Ubiquant Market Prediction
In this notebook, we aim to build a **Time Series Forecasting Model** that predicts future returns for the **Ubiquant Market Prediction** dataset.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import DataFrame, read_csv, concat, Series
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import datatable as dt
from scipy import stats
from cesium.featurize import featurize_time_series as ft
from pandas.plotting import lag_plot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss as LL
from sklearn.metrics import mean_squared_error as MSE, mean_absolute_error as MAE

# Integrate RAPIDS With Kaggle

In [None]:
import sys
# Copy the RAPIDS archive from the attached input dataset and unpack it under
# /opt/conda/envs (tar's verbose listing is discarded).
!cp ../input/rapids/rapids.21.06 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked directories to sys.path so they are searched before the
# kernel's own packages on import.
# NOTE(review): the paths hard-code python3.7 — confirm this matches the kernel.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the bundled XGBoost shared library into the base conda lib directory.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## Load the Data
The dataset is very large. To load it into memory successfully, we open the file lazily and read only the first **200,000** rows for analysis.

In [None]:
%%time
# load the data
data = pd.read_csv('../input/ubiquant-market-prediction/train.csv', 
                   chunksize=100000)

## EDA (Exploratory Data Analysis)

In [None]:
# Pull the next 200,000 rows from the lazy reader and use them as the analysis
# sample. The reader advances on every call, so re-running this cell without
# re-creating `data` yields a different slice of the file.
chunk_1 = data.get_chunk(200000)

In [None]:
# Preview the first four rows of the sample.
chunk_1.head(4)

In [None]:
# Report the dimensions of the sample.
n_rows, n_cols = chunk_1.shape
print(f'Number of Rows: {n_rows}')
print(f'Number of Columns: {n_cols}')

In [None]:
# The target is the quantity this notebook regresses on, so a tally of distinct
# values is not informative; summary statistics (count, mean, std, quartiles)
# describe its distribution instead.
chunk_1['target'].describe()

In [None]:
# Print a structural summary of the sample (index, columns, dtypes, memory use).
chunk_1.info()

In [None]:
# How many distinct time_id values does the sample contain?
chunk_1['time_id'].nunique()

In [None]:
# Global seaborn styling: dark grid plus a larger default figure size.
sns.set_theme(style='darkgrid', rc={'figure.figsize': (11.7, 8.27)})
# Histogram with a KDE overlay of the target over the first 40,000 rows.
sns.displot(x=chunk_1['target'].iloc[:40000], kde=True, color='grey')

In [None]:
# Histogram with a KDE overlay of feature f_0 over the first 40,000 rows.
sns.displot(x=chunk_1['f_0'].iloc[:40000], kde=True, color='grey')

In [None]:
# make a function to plot histograms
def plot_hist(x, frame=None):
    """Draw a histogram of a single column.

    Parameters
    ----------
    x : str
        Name of the column to plot.
    frame : DataFrame-like, optional
        Object whose ``hist`` method is called. When omitted, the notebook-level
        ``chunk_1`` sample is used, so existing ``plot_hist('f_66')`` calls keep
        working unchanged.

    Returns
    -------
    Whatever ``frame.hist(column=x)`` returns.
    """
    if frame is None:
        # Resolved at call time so the function follows the current sample.
        frame = chunk_1
    return frame.hist(column=x)

In [None]:
# Histogram of feature f_66.
plot_hist('f_66')

In [None]:
# Histogram of feature f_55.
plot_hist('f_55')

In [None]:
# Scatter of f_4 against f_5 over the first 5,000 rows. The title and axis
# labels name the columns actually plotted, so the figure can be read on its own.
plt.title('Scatterplot of f_4 vs f_5')
plt.xlabel('f_4')
plt.ylabel('f_5')
plt.scatter(x=chunk_1['f_4'][:5000], y=chunk_1['f_5'][:5000], color='grey')

In [None]:
# Missing-value count per column, shown as a plain array.
chunk_1.isna().sum().to_numpy()

In [None]:
# Count the distinct investment_id values in the sample.
n_investments = chunk_1['investment_id'].nunique()
print(f'Number of unique investment IDs: {n_investments}')

In [None]:
# Count the distinct time_id values in the sample.
n_time_ids = chunk_1['time_id'].nunique()
print(f'Number of unique investment Time Stamps: {n_time_ids}')

In [None]:
# Box plot of the target over the first 10,000 rows.
plt.boxplot(x=chunk_1['target'].iloc[:10000])

In [None]:
# Kernel density estimate of the target over the whole sample.
plt.title('KDE Density of Target Feature in Data')
sns.kdeplot(x=chunk_1['target'], color='grey')

### Make a Lag Plot of Features

In [None]:
# A lag plot is defined for a single series, so only the `target` column is
# read. The earlier options are dropped: `squeeze` no longer exists in
# pandas 2.0 (and could never collapse this multi-column file), and there is no
# date index here for `index_col` / `parse_dates` to build.
# `series` stays a lazy reader so later cells can keep pulling rows from it.
series = pd.read_csv('../input/ubiquant-market-prediction/train.csv',
                     usecols=['target'], chunksize=100000)

# Lag-1 scatter of the first 5,000 target values.
lag_plot(series.get_chunk(5000)['target'])

In [None]:
from pandas.plotting import autocorrelation_plot
# Autocorrelation is computed for one series, so pick the target column out of
# the chunk rather than passing every column at once. The reader advances on
# each call, so these are the 10,000 rows after those already consumed.
autocorrelation_plot(series.get_chunk(10000)['target'])

In [None]:
# Number of distinct investment_ids seen at each time_id. The groupby is
# computed once and reused for both axes instead of being evaluated twice.
ids_per_time = chunk_1.groupby('time_id')['investment_id'].nunique()
sns.lineplot(x=ids_per_time.index, y=ids_per_time, color='blue')

### Making Line Plots of N features
We pick four features from the market data (`f_4` to `f_7`) and draw a line plot of each. Each feature is a different time series measurement.

In [None]:
# One stacked panel per feature (f_4 .. f_7), first 5,000 rows each.
# Each legend entry now names the feature drawn in its panel; previously all
# four panels were labelled 'feature 4'.
for panel, feature_no in enumerate(range(4, 8), start=1):
    plt.subplot(4, 1, panel)
    plt.plot(chunk_1[f"f_{feature_no}"][:5000], label=f'feature {feature_no}')
    plt.legend()

### Principal Component Analysis
We will perform PCA on the dataset's feature columns for dimensionality reduction.

In [None]:
# Feature names: every column from position 4 onwards.
features = chunk_1.columns[4:].tolist()
len(features)

In [None]:
# `features` is a plain list, so `features.target` raises AttributeError.
# NOTE(review): the intent of this cell is unclear — this version checks whether
# the target column is among the PCA inputs; confirm that is what was wanted.
'target' in features

In [None]:
%%time
# make a pipleine
pipeline = Pipeline([('Scaler', StandardScaler()), ('PCA', PCA(n_components=5))])
pipeline.fit(chunk_1[features])
pca_components = pipeline.transform(chunk_1[features])

In [None]:
# Total share of variance (in percent) captured by the retained components.
var = pipeline.named_steps['PCA'].explained_variance_ratio_.sum() * 100
# Display it: the value was previously computed but never shown.
var

In [None]:
# Axis labels for the PCA scatter matrix: component position (as a string key)
# mapped to "PC <n> (<explained variance>%)".
pc_variance_pct = pipeline.named_steps['PCA'].explained_variance_ratio_ * 100
labels = {}
for position, pct in enumerate(pc_variance_pct):
    labels[str(position)] = f"PC {position+1} ({pct:.1f}%)"

In [None]:
# Show the generated component labels.
labels

In [None]:
# The scatter matrix is coloured by the target column, not by a clustering
# result, so the colour-bar title should say so.
labels['color'] = 'target'

In [None]:
# Pairwise scatter matrix of the first four principal components, with points
# coloured by the target value.
fig = px.scatter_matrix(pca_components,
                        dimensions=range(4),
                        labels=labels,
                        color=chunk_1["target"],
                        opacity=0.5)

fig

## Applying Machine Learning Models
This is a regression problem: we have to predict a continuous target at future time steps. For time series, the following models can be used:
* Random Forest Regressor
* Gradient Boosting Machine
* Adaptive Boosting Regressor

In [None]:
# Model inputs are features f_1 through f_10; the output is the target column.
X = chunk_1[[f'f_{i}' for i in range(1, 11)]]
y = chunk_1['target']

In [None]:
# Average value of feature f_2 over the sample.
chunk_1['f_2'].mean()

In [None]:
# Hold out the LAST 10% of rows rather than a random 10%. With a shuffled split
# the model is trained on rows that come after the ones it is scored on, which
# inflates the score for a forecasting task.
# NOTE(review): assumes the rows of chunk_1 are in chronological (time_id)
# order — confirm against the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    shuffle=False)

In [None]:
%%time
gb = GradientBoostingRegressor(n_estimators=200)
gb.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and show the first four predictions.
pred = gb.predict(X_test)
pred[:4]

In [None]:
# calculate the mean squared error of the predictions on the held-out split
MSE(y_test, pred)

In [None]:
# R^2 on the rows the model was fitted on is optimistic by construction, so it
# is reported next to the held-out score.
pd.Series({'train_r2': gb.score(X_train, y_train),
           'test_r2': gb.score(X_test, y_test)})

In [None]:
#X = chunk_1.drop('target', axis=1)
#y = chunk_1.target

#X.shape, y.shape

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, 
 #                                                   random_state=42)

In [None]:
# rf = GradientBoostingRegressor(n_estimators=200)
# rf.fit(X_train, y_train)

In [None]:
# calculate loss
#y_pred_rf = rf.predict(X_test)
#MSE(y_test, y_pred_rf)

This was a starter... Work in progress...