# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os
from tqdm import tqdm
import random
import seaborn as sns
import math
import warnings
warnings.filterwarnings('ignore')

# Convert csv file to pickle file

In [None]:
def transform_csv2pickle(path, usecols, dtype,
                         out_path='../input/ubiquant-market-prediction/train.pkl'):
    """Read selected columns of a CSV file and cache them as a pickle.

    Args:
        path: CSV file to read.
        usecols: Column names to load; forwarded to ``pd.read_csv``.
        dtype: Mapping of column name to dtype applied while parsing.
        out_path: Where the pickle is written. The default is the location
            this function has always written to.
    """
    # Use the caller-supplied mapping; relying on a module-level ``dtypes``
    # global made the ``dtype`` argument a no-op.
    train = pd.read_csv(path, usecols=usecols, dtype=dtype)
    train.to_pickle(out_path)
    
# Raw CSV that the (currently disabled) conversion step reads.
path = '../input/ubiquant-market-prediction/train.csv'

# Non-feature columns, then the 300 feature columns f_0 .. f_299.
basecols = ['row_id', 'time_id', 'investment_id', 'target']
features = ['f_' + str(idx) for idx in range(300)]

# Compact dtypes used when parsing the CSV; every feature column is float32.
dtypes = dict(
    row_id='str',
    time_id='uint16',
    investment_id='uint16',
    target='float32',
)
dtypes.update(dict.fromkeys(features, 'float32'))

# One-off conversion, kept disabled; uncomment to regenerate the pickle.
#transform_csv2pickle(path, basecols+features, dtypes)

# Load the Dataset

In [None]:
%%time
df = pd.read_pickle('../input/ump-train-picklefile/train.pkl')
display(df.info())
display(df.head())

# EDA

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

### Number of unique 'investment_id'

In [None]:
# Count of distinct values in the investment_id column.
df['investment_id'].nunique()

### Number of unique 'time_id'

In [None]:
# Count of distinct values in the time_id column.
df['time_id'].nunique()

### Number of missing values

In [None]:
# Total number of missing cells: per-column null counts, summed over all columns.
df.isnull().sum().sum()

### The distribution of the 'target' values

In [None]:
sns.set_theme()
fig, ax = plt.subplots(1, 1, figsize=(14, 4))
# histplot(kde=True, stat='density') replaces the deprecated distplot: a
# density-normalised histogram with a KDE curve on top. bins=50 keeps the
# histogram coarse on this large column; the axes are passed explicitly
# instead of relying on the implicit current axes.
sns.histplot(df['target'], kde=True, stat='density', bins=50, ax=ax).set_title('Target Distribution')
plt.show()

### The distribution of the 'time_id' values

In [None]:
sns.set_theme()
fig, ax = plt.subplots(1, 1, figsize=(14, 4))
# histplot(kde=True, stat='density') replaces the deprecated distplot: a
# density-normalised histogram with a KDE curve on top. bins=50 keeps the
# histogram coarse on this large column; the axes are passed explicitly
# instead of relying on the implicit current axes.
sns.histplot(df['time_id'], kde=True, stat='density', bins=50, ax=ax).set_title('Time ID Distribution')
plt.show()

### The distribution of the 'investment_id' values

In [None]:
sns.set_theme()
fig, ax = plt.subplots(1, 1, figsize=(14, 4))
# histplot(kde=True, stat='density') replaces the deprecated distplot: a
# density-normalised histogram with a KDE curve on top. bins=50 keeps the
# histogram coarse on this large column; the axes are passed explicitly
# instead of relying on the implicit current axes.
sns.histplot(df['investment_id'], kde=True, stat='density', bins=50, ax=ax).set_title('Investment ID Distribution')
plt.show()

### The distribution of 9 random features

In [None]:
# Feature columns run f_0 .. f_299, so draw from all 300 indices;
# range(299) could never select f_299.
sample_features = random.sample(range(300), 9)
fig, ax = plt.subplots(3, 3, figsize=(18, 18))
# ax.flat walks the 3x3 grid row by row, one panel per sampled feature.
for panel, sample in zip(ax.flat, sample_features):
    # histplot(kde=True, stat='density') replaces the deprecated distplot.
    sns.histplot(df[f'f_{sample}'], kde=True, stat='density', bins=50, ax=panel).set_title(f'f_{sample} Distribution')
plt.show()

### Target distribution of 3 different 'investment_id'

In [None]:
sns.set_theme()
fig, ax = plt.subplots(3, 1, figsize=(16, 12))
# One stacked panel per investment: (investment_id, extra line styling).
# The first panel keeps seaborn's default colour.
panels = [(10, {}), (86, {'color': 'r'}), (1065, {'color': 'g'})]
for axis, (inv_id, line_kws) in zip(ax, panels):
    # Target values for this investment, drawn against the frame's row index.
    targets = df.loc[df['investment_id'] == inv_id, 'target']
    sns.lineplot(data=targets, ax=axis, **line_kws).set_title(f'Investement {inv_id}')
plt.show()

### The correlation between the first 30 features for investment 0

In [None]:
# Rows for investment 0 only, without the identifier columns, cut down to the
# 32 leading columns that remain.
df_30 = (
    df.loc[df['investment_id'] == 0]
    .drop(columns=['row_id', 'investment_id'])
    .iloc[:, :32]
)
# Pairwise correlations, drawn from the raw array so the axes carry positional ticks.
corrMatrix = df_30.corr()
plt.figure(figsize=(15, 8))
sns.heatmap(corrMatrix.to_numpy(), cmap='YlGnBu')

### The correlation between the target and the features for investment 0

In [None]:
# Per-feature correlation with the target for investment 0, in percent.
df_0 = df[df['investment_id'] == 0].drop(['row_id', 'investment_id'], axis=1)
target_0 = df_0['target']
# Rows are collected in a list and turned into a frame once: DataFrame.append
# was removed in pandas 2.0 and re-copied the whole frame on every call.
# 'feature' holds the integer index (not the 'f_<i>' label) so this frame
# shares a numeric x-axis with df_corr_1 / df_corr when they are overlaid.
corr_rows = [
    {'feature': i, 'corr': target_0.corr(df_0[f'f_{i}']) * 100}
    for i in range(300)
]
df_corr_0 = pd.DataFrame(corr_rows, columns=['feature', 'corr'])
df_corr_0

### The correlation between the target and the features for investment 1

In [None]:
# Per-feature correlation with the target for investment 1, in percent.
df_0 = df[df['investment_id'] == 1].drop(['row_id', 'investment_id'], axis=1)
target_1 = df_0['target']
# Rows are collected in a list and turned into a frame once: DataFrame.append
# was removed in pandas 2.0 and re-copied the whole frame on every call.
corr_rows = [
    {'feature': i, 'corr': target_1.corr(df_0[f'f_{i}']) * 100}
    for i in range(300)
]
df_corr_1 = pd.DataFrame(corr_rows, columns=['feature', 'corr'])
df_corr_1

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
# Label each series and draw on the explicit axes so the overlaid lines can be
# told apart in the legend.
sns.lineplot(data=df_corr_0, x='feature', y='corr', ax=ax, label='investment_id 0')
sns.lineplot(data=df_corr_1, x='feature', y='corr', ax=ax, label='investment_id 1')
ax.legend()
plt.show()

### The correlation between the target and the features for all investments

In [None]:
# Per-feature correlation with the target over every row, in percent.
df_0 = df.drop(['row_id', 'investment_id'], axis=1)
target_all = df_0['target']
# Rows are collected in a list and turned into a frame once: DataFrame.append
# was removed in pandas 2.0 and re-copied the whole frame on every call.
corr_rows = [
    {'feature': i, 'corr': target_all.corr(df_0[f'f_{i}']) * 100}
    for i in range(300)
]
df_corr = pd.DataFrame(corr_rows, columns=['feature', 'corr'])
df_corr

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(16, 10))
# Label each series and draw on the explicit axes so the overlaid lines can be
# told apart in the legend.
sns.lineplot(data=df_corr_0, x='feature', y='corr', ax=ax, label='investment_id 0')
sns.lineplot(data=df_corr_1, x='feature', y='corr', ax=ax, label='investment_id 1')
sns.lineplot(data=df_corr, x='feature', y='corr', ax=ax, label='all investments')
ax.legend()
plt.show()