# Introduction
Welcome to this starter exploratory data analysis (EDA) of the Ubiquant Market Prediction training data.

Questions and comments for improvement are much appreciated.

Thank you and enjoy!

## Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *
import warnings

## Load Data

In [None]:
# Load the training data. Only the first `nrows` rows of the CSV are read,
# so every figure below describes that leading slice, not the full file.
filepath = "../input/ubiquant-market-prediction/train.csv"
nrows = 1_000_000
train = pd.read_csv(
    filepath,
    nrows=nrows,
)

## Key Figures

In [None]:
# Headline figures for the loaded sample: size, completeness, feature
# count/dtypes and the spread of investment identifiers.

# Anchor the pattern so only columns whose names START with "f_" count as
# features; an unanchored "f_" would also match a name such as "ref_x".
train_features = train.filter(regex="^f_")

# np.isinf only accepts numeric input. Selecting numeric columns replaces the
# hand-written drop of `row_id` (presumably the only non-numeric column) and
# keeps working if that column is missing or another text column appears.
numeric_train = train.select_dtypes(include=np.number)

print(f"Number of rows: {len(train)}")
print(f"Number of time identifiers: {train.time_id.nunique()}")
print(f"Number of missing values: {train.isna().sum().sum()}")
print(f"Number of infinite values: {np.isinf(numeric_train).sum().sum()}")
# shape[1] is the column count; no need to materialise the names as a list.
print(f"Number of features: {train_features.shape[1]}")
print(f"Feature data types: {train_features.dtypes.unique()}")
print(f"Number of investments: {train.investment_id.nunique()}")
print(f"Range of investment identifiers: [{train.investment_id.min()}, {train.investment_id.max()}]")

In [None]:
# Summary statistics of the target column, printed one per line.
# The numpy/scipy functions are applied to the same Series object; the
# mapping preserves insertion order, so the lines print in this order.
target = train.target
target_summary = {
    "Target minimum": np.min(target),
    "Target median": np.median(target),
    "Target maximum": np.max(target),
    "Target mean": np.mean(target),
    "Target standard deviation": np.std(target),
    "Target skew": skew(target),
    "Target kurtosis": kurtosis(target),
}

for label, value in target_summary.items():
    print(f"{label}: {value}")

## Key Plots

In [None]:
# Histogram of the target with a kernel density estimate overlaid.
# histplot returns the Axes it drew on, so the title is set on it directly.
target_axes = sns.histplot(train.target, kde=True)
target_axes.set_title("Target Distribution")
plt.show()

In [None]:
# Box plot of the target to show its spread and outliers.
#
# The series is passed as `x=` explicitly. Given positionally, seaborn has
# treated the first argument as `x` in some releases and as `data` in others,
# which emits a deprecation warning or changes the plot orientation.
#
# Warnings are silenced only for the duration of this plot, not for the whole
# session, so later cells still surface genuine warnings.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sns.boxplot(x=train.target)
    plt.title("Target Box Plot")
    plt.show()

In [None]:
# Overlay the target histograms of a few randomly chosen investments.
np.random.seed(0)  # fixed seed so the same investments are drawn on every run

investment_ids = train.investment_id.unique()
# np.random.choice samples WITH replacement by default, which could draw the
# same investment more than once. Draw distinct ids instead, and cap the
# sample size so a small frame with fewer than 9 investments does not raise.
sample_size = min(9, len(investment_ids))
sampled_investment_ids = np.random.choice(investment_ids, size=sample_size, replace=False)

for train_investment_id in sampled_investment_ids:
    # Semi-transparent bars keep the overlapping distributions readable.
    train.loc[train.investment_id == train_investment_id, "target"].hist(bins=100, alpha=0.5)

plt.title("Target Distributions of Sample Investments")
plt.show()

In [None]:
# How many (non-null) target values each investment has, shown as a
# histogram with a kernel density estimate.
targets_per_investment = train.groupby("investment_id")["target"].count()
count_axes = sns.histplot(targets_per_investment, kde=True)
count_axes.set_title("Distribution of Targets by Investment Identifier")
plt.show()

In [None]:
# For investment ids 0 through 8, mark every time_id at which a row exists.
# Gaps along the x axis are time steps with no row for that investment.
is_low_id = train.investment_id.isin(range(9))
availability = train.loc[is_low_id, ["investment_id", "time_id"]]
availability_axes = availability.plot.scatter(x="time_id", y="investment_id")
availability_axes.set_title("Target Availability Temporal Discontinuities")
plt.show()

## Correlation Analysis

In [None]:
# Pairwise correlations between the target and every feature column, computed
# on a reproducible 1% random sample of the loaded rows.
correlation_columns = ["target", *train_features.columns]
sample_train = train.sample(frac=0.01, random_state=0)
correlation = sample_train.loc[:, correlation_columns].corr()

In [None]:
# Hierarchically clustered heat map of the correlation matrix.
#
# A column that is constant within the sample has an undefined correlation
# with everything (including itself), so its row and column are entirely NaN.
# Drop those first so that only defined correlations reach the clustering
# step. When no such column exists this is a no-op.
finite_correlation = correlation.dropna(axis=0, how="all").dropna(axis=1, how="all")

# Silence library warnings for this plot only, rather than for the session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sns.clustermap(finite_correlation)
    plt.show()