<a id=index></a>

<a href='#percentile'>Interquartile Range Method - IQR</a>

<a href='#lof'>Automatic Outlier Detection</a>

<a href='#monte_carlo'>Monte Carlo Method</a>

<a href='#markowitz'>Markowitz - Modern Portfolio Theory</a>

<a href='#groupby'>Group by Column Name</a>

<a href='#kurtosis'>Kurtosis</a>

<a href='#skewness'>Skewness</a>

<a href='#reduce_memory_usage'>Reduce Memory Usage</a>

<a href='#queries'>Queries in Pandas</a>

<a href='#concat_cols'>Concat Columns into new Column</a>

In [None]:
import gc

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import janestreet

# Load the training CSV from the Kaggle competition input directory into `data`.
data = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')

In [None]:
# Running total of feature_0 over all rows, drawn on one wide axes.
fig, ax = plt.subplots(figsize=(15, 4))
ax.set_xlabel("Trade", fontsize=18)
ax.set_ylabel("feature_0 (cumulative)", fontsize=18)
feature_0 = data['feature_0'].cumsum()
feature_0.plot(ax=ax, lw=3);

In [None]:
# Rows where feature_0 equals +1 and -1, each re-indexed from zero.
feature_0_is_plus_one = data.query('feature_0 ==  1').reset_index(drop=True)
feature_0_is_minus_one = data.query('feature_0 == -1').reset_index(drop=True)

# One panel per subset: cumulative resp and cumulative resp * weight.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 4))
panels = (
    (ax1, feature_0_is_plus_one, "feature 0 = 1", "lower left"),
    (ax2, feature_0_is_minus_one, "feature 0 = -1", "upper left"),
)
for panel_ax, subset, panel_title, legend_loc in panels:
    panel_ax.plot(subset['resp'].cumsum(), lw=3, label='resp')
    panel_ax.plot((subset['resp'] * subset['weight']).cumsum(), lw=3, label='return')
    panel_ax.set_title(panel_title, fontsize=18)
    panel_ax.legend(loc=legend_loc)

# Drop every reference to the two subsets so the collector can reclaim them.
del panels, panel_ax, subset, panel_title, legend_loc
del feature_0_is_plus_one
del feature_0_is_minus_one
gc.collect();

<a id='groupby'></a>
<a href='#index'>back to index</a>
#### Group by Column Name 

In [None]:
# Group the rows by `date` and count the non-null values of every other column per day.
ticks_day = data.groupby('date').count()
# Show the first ten groups.
ticks_day.head(10)

In [None]:
# Pearson correlation between all columns, restricted to the rows of two
# individual days (date == 100 and date == 200).
# `train_df` was never defined in this notebook; the frame loaded in the first
# code cell is called `data`, so it is used here.
day_100 = data.loc[data['date'] == 100]
day_200 = data.loc[data['date'] == 200]
day_100_and_200 = pd.concat([day_100, day_200])

# Styler.set_precision() was removed in pandas 2.0; an explicit format string
# gives the same two-decimal rendering on old and new pandas versions.
(
    day_100_and_200.corr(method='pearson')
    .style.background_gradient(cmap='coolwarm', axis=None)
    .format('{:.2f}')
)

In [None]:
# Standard deviation of every column whose name contains 'resp', sorted ascending.
# `train_df` was never defined in this notebook; the loaded frame is `data`.
resp_stds = (
    data[[c for c in data.columns if 'resp' in c]]
    .std()
    .sort_values()
)

ax = resp_stds.plot(kind='bar', title='Standard Deviation of each `resp_`')
# Paint every bar red, then highlight the bar of the plain `resp` column in blue.
for bar in ax.patches:
    bar.set_facecolor('#aa3333')
pos = resp_stds.index.get_loc('resp')
ax.patches[pos].set_facecolor('#348ABD')

In [None]:
# Side-by-side distributions of `resp` and `weight`, saved to a PNG.
# `train_df` was never defined in this notebook; the loaded frame is `data`.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the figure stays the same -- consider sns.histplot(kde=True).
fig, axs = plt.subplots(1, 2, figsize=(22, 6))
sns.distplot(data['resp'], ax=axs[0])
sns.distplot(data['weight'], ax=axs[1])
fig.savefig('resp_weight_distplot.png')

In [None]:
import seaborn as sns

# Number of columns shown in the bar plot.
n_features = 10

# Count the missing values once, keep only the columns that have any, and list
# them from most to fewest missing values.
# `train_df` was never defined in this notebook; the loaded frame is `data`.
nan_counts = data.isna().sum()
nan_val = nan_counts[nan_counts > 0].sort_values(ascending=False)
print(nan_val)

fig, axs = plt.subplots(figsize=(10, 10))

# Draw on the axes created above instead of relying on the implicit current axes.
sns.barplot(y=nan_val.index[0:n_features],
            x=nan_val.values[0:n_features],
            alpha=0.8,
            ax=axs)

axs.set_title(f'NaN values of train dataset (Top {n_features})')
axs.set_xlabel('NaN values')
fig.savefig(f'nan_values_top_{n_features}_features.png')
plt.show()

In [None]:
# Read only the first 1000 rows of the training CSV and keep the rows whose
# weight is different from zero.
train = (
    pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv', nrows=1000)
    .loc[lambda frame: frame['weight'] != 0]
)
train.head()

In [None]:
# Weighted responses: `return` is weight * resp, `return_N` is weight * resp_N,
# and `trade` is 1 where weight * resp is strictly positive, else 0.
weight_values = train['weight'].values
train['trade'] = ((weight_values * train['resp'].values) > 0).astype('int')
train['return'] = weight_values * train['resp'].values
for horizon in range(1, 5):
    train[f'return_{horizon}'] = weight_values * train[f'resp_{horizon}'].values

# Column subsets selected by name.
assets = train.loc[:, train.columns.str.contains('feature')]
# `returns` gets new columns assigned in a later cell, so take an explicit copy
# instead of a slice of `train` (avoids SettingWithCopyWarning).
returns = train.loc[:, train.columns.str.contains('return')].copy()
trades = train.loc[:, 'trade']

assets.head()

In [None]:
# Preview the first rows of the `returns` frame.
returns.head()

In [None]:
# Preview the first values of the `trades` series.
trades.head()

In [None]:
# Move ts_id into the index only once: an unconditional set_index raises
# KeyError when the cell is run a second time, because the column is gone.
if train.index.name != 'ts_id':
    train = train.set_index('ts_id')

# 20-row rolling mean and standard deviation of feature_2, plus bands at
# two standard deviations above and below the rolling mean.
feature_2_rolling = train['feature_2'].rolling(window=20)
train['MA20'] = feature_2_rolling.mean()
train['20dSTD'] = feature_2_rolling.std()
train['Upper'] = train['MA20'] + (train['20dSTD'] * 2)
train['Lower'] = train['MA20'] - (train['20dSTD'] * 2)

train[['return','feature_2','MA20','Upper','Lower']].plot(figsize=(12,4))
plt.grid(True)
plt.title(' Bollinger Bands')
plt.axis('tight')
# NOTE(review): the plotted series are `return` and feature_2, not a price, and
# the file is named feature_1.png although feature_2 is drawn -- confirm intent.
plt.ylabel('Price')
plt.savefig('feature_1.png', bbox_inches='tight')

In [None]:
# Simple moving averages of `return`, one new column per window length
# (added in the order 20, 10, 5, 200).
for window in (20, 10, 5, 200):
    returns[f'SMA_{window}'] = returns['return'].rolling(window=window).mean()

# Plot the raw return next to its 5-row moving average and save the figure.
returns[['return', 'SMA_5']].plot(figsize=(20, 10))
plt.grid(True)
plt.title('Simple Moving Average')
plt.axis('tight')
plt.ylabel('Return')
plt.savefig('feature_2.png', bbox_inches='tight')

In [None]:
def wma(df, column='close', n=20, add_col=False):
    """Linearly weighted moving average of ``df[column]`` over ``n`` rows.

    Inside each window the oldest value gets weight 1 and the newest value
    gets weight ``n``; the weighted sum is divided by ``1 + 2 + ... + n``.
    The first ``n - 1`` positions have no full window and are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source column.
    column : str, default 'close'
        Name of the column to average.
    n : int, default 20
        Window length.
    add_col : bool, default False
        If truthy, store the result in ``df`` as ``'{column}_WMA_{n}'``
        (the frame is modified in place) and return ``df``.

    Returns
    -------
    list or pandas.DataFrame
        The list of averages, or ``df`` itself when ``add_col`` is truthy.
    """
    weights = np.arange(1, n + 1)
    # The normaliser does not depend on the window, so compute it once
    # instead of once per window inside the rolling callback.
    weight_total = weights.sum()

    wmas = (
        df[column]
        .rolling(n)
        .apply(lambda window: np.dot(window, weights) / weight_total, raw=True)
        .to_list()
    )

    if add_col:
        df[f'{column}_WMA_{n}'] = wmas
        return df
    return wmas

In [None]:
# `trades_per_day` was used here without ever being defined in the notebook.
# It is computed as the number of rows for each `date` of the loaded frame.
trades_per_day = data.groupby('date').size()

plt.figure(figsize = (12,4))
# the minimum has been set to 1000 so as not to draw the partial days like day 2 and day 294
# the maximum number of trades per day is 18884
# I have used 125 bins for the 500 days
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept so the figure stays the same -- consider sns.histplot(kde=True).
ax = sns.distplot(trades_per_day,
             bins=125,
             kde_kws={"clip":(1000,20000)},
             hist_kws={"range":(1000,20000)},
             color='darkcyan',
             kde=True);
# Colour every histogram bar by its height using the jet colour map.
values = np.array([rec.get_height() for rec in ax.patches])
norm = plt.Normalize(values.min(), values.max())
colors = plt.cm.jet(norm(values))
for rec, col in zip(ax.patches, colors):
    rec.set_color(col)
plt.xlabel("Number of trades per day", size=14)
plt.show();

In [None]:
# Volatility indicators: Average True Range (ATR), Bollinger Bands (BB), Standard Deviation (STD).

# ATR as an exponentially weighted mean of the true range, where alpha = 2 / (span + 1).
# NOTE(review): no cell in this notebook creates a 'TR' column on `train`, so this
# line raises KeyError as written -- the true range has to be computed first.
train['ATR'] = train['TR'].ewm(span = 10).mean()

# Alternatively, the smoothing can be written out by hand like this:
#train['ATR'] = ( train['ATR'].shift(1)*13 + train['TR'] ) /  14

<a id='kurtosis'></a>
<a href='#index'>back to index</a>
### Kurtosis
Kurtosis is one of the two measures that quantify the shape of a distribution. The other measure is skewness.
#### 2) Kurtosis describes the peakedness of the distribution.
#### 3) If the distribution is tall and thin it is called a leptokurtic distribution. Values in a leptokurtic distribution are near the mean or at the extremes.
#### 4) A normal distribution has an excess kurtosis of 0 (a kurtosis of 3); pandas `kurt()` reports excess kurtosis (Fisher's definition).

In [None]:
# `features` is used as a DataFrame from this cell on, but no cell defined it.
# Build it the same way the notebook builds its other feature frames: every
# column of `train` whose name contains 'feature'.
features = train.loc[:, train.columns.str.contains('feature')]

# Kurtosis of each feature column; large positive values indicate a leptokurtic column.
kurt = features.kurt(axis=0)
kurt

<a id='skewness'></a>
<a href='#index'>back to index</a>
### Skewness 
Skewness is a measure of the asymmetry of a distribution. Another measure that describes the shape of a distribution is kurtosis.
#### 1) When a distribution is asymmetrical the tail of the distribution is skewed to one side-to the right or to the left.
#### 2) When the value of the skewness is negative, the tail of the distribution is longer towards the left hand side of the curve.
#### 3) When the value of the skewness is positive, the tail of the distribution is longer towards the right hand side of the curve.

In [None]:
# Skewness, mean, median and mode of every feature column.
skewness = features.skew(axis=0)  # axis=0: one value per column
mean = features.mean()
median = features.median()
mode = features.mode()

# Only the last bare expression of a cell is rendered, so the intermediate
# results are shown explicitly (`display` is provided by the notebook kernel).
display(skewness)
display(mean)
display(median)
mode

<a id='correlation'></a>
<a href='#index'>back to index</a>
### Correlation Coefficient (Pearson, Spearman, Kendall)

In [None]:
# 1) The Pearson correlation coefficient is the covariance of two variables divided by
#    the product of their standard deviations.
corr_pearson = features.corr(method='pearson')


# 2) Spearman is a non-parametric measure of the strength and direction of the monotonic
#    relationship between two variables. It is typically used when the data is not
#    normally distributed or when the sample size is small.
corr_spearman = features.corr(method='spearman')
# Spearman rank correlation between matching rows of two different data frames.
# NOTE(review): `df1` and `df2` are placeholders that are not defined anywhere in this
# notebook -- substitute two frames with the same columns before running this line.
spearmanCorrelation  = df1.corrwith(df2, axis=1, method="spearman");


# 3) Kendall's tau quantifies the discrepancy between the number of concordant and
#    discordant pairs of two variables.
corr_kendall  = features.corr(method='kendall')

<a id='markowitz'></a>
<a href='#index'>back to index</a>

# [Markowitz - Modern Portfolio Theory](https://www.investopedia.com/terms/e/efficientfrontier.asp)


### Efficient Frontier

* Efficient frontier comprises investment portfolios that offer the highest expected return for a specific level of risk.
* Optimal portfolios that comprise the efficient frontier tend to have a higher degree of diversification.

### Limitations

The efficient frontier and modern portfolio theory have many assumptions that may not properly represent reality. One of the assumptions is that asset returns follow a normal distribution.
In reality, securities may experience returns (also known as tail risk) that are more than three standard deviations away from the mean in more than 0.3% of the observed values. Consequently, asset returns are said to follow a leptokurtic distribution or heavy-tailed distribution.

<a id='monte_carlo'></a>
<a href='#index'>back to index</a>
### Monte Carlo Method
There are three main reasons to use Monte Carlo methods to randomly sample a probability distribution; they are:

#### 1) Estimate density, gather samples to approximate the distribution of a target function.
#### 2) Approximate a quantity, such as the mean or variance of a distribution.
#### 3) Optimize a function, locate a sample that maximizes or minimizes the target function.

<a href='https://machinelearningmastery.com/monte-carlo-sampling-for-probability/'>Introduction to Monte Carlo Sampling for Probability</a>

In [None]:
# Example 1: effect of the sample size on a Monte Carlo sample
from numpy.random import normal
from matplotlib import pyplot

# Gaussian distribution to sample from: mean `mu`, standard deviation `sigma`.
mu = 50
sigma = 5

# Draw one sample per size and show each histogram in its own cell of a 2x2 grid.
sizes = [10, 50, 100, 1000]
for panel, size in enumerate(sizes, start=1):
    sample = normal(mu, sigma, size)
    pyplot.subplot(2, 2, panel)
    pyplot.hist(sample, bins=20)
    pyplot.title('%d samples' % size)
    pyplot.xticks([])

pyplot.show()

In [None]:
# Example 2 Monte Carlo between ranges (0% - 100%)
import random
import time

df_features = train.loc[:, train.columns.str.contains('feature')]

# The original cell called print_elapsed_time(), which is not defined anywhere
# in this notebook; time the loop with the already imported `time` module.
start_time = time.perf_counter()

samples = 10000
# Kept under its own name so that `features` is not rebound to an array of
# column names (other cells index `features` as a DataFrame).
feature_names = df_features.columns.values
n_weights = len(feature_names)

# Collect the rows in a list and build the frame once: assigning df.loc[k]
# row by row re-allocates the frame on every iteration.
rows = []
for k in range(samples):
    # Successive draws: each weight is drawn from [0, remaining budget] and
    # the last slot receives whatever is left.
    weights = [0.0] * n_weights
    weights[0] = random.uniform(0, 1)
    soma = weights[0]
    for i in range(1, n_weights - 1):
        weights[i] = random.uniform(0, 1 - soma)
        soma += weights[i]
    weights[n_weights - 1] = 1 - soma
    # Shuffle so that no feature is systematically favoured by the draw order.
    random.shuffle(weights)
    rows.append(weights)

# One row per simulated weight vector, one column per feature.
df = pd.DataFrame(rows, columns=feature_names)

print(f'after heavy jobs: {time.perf_counter() - start_time:.2f} s')

portfolioMA = df.mean(axis = 0)

# NOTE(review): the label says 'Mean' but the value printed is the standard
# deviation of the per-feature mean weights -- confirm which one is intended.
print('Mean = '+str(portfolioMA.std()))

<a id='reduce_memory_usage'></a>
#### Reduce Memory Usage
<a href='#index'>back to index</a>

In [None]:
def reduce_memory_usage(df):
    """Return a copy of ``df`` with every float64 column cast to float32.

    Prints the memory reported by ``DataFrame.memory_usage`` (in MB) before
    and after the cast, and the relative reduction in percent. The frame
    passed in is not modified.
    """
    bytes_per_mb = 1024**2

    start_memory = df.memory_usage().sum() / bytes_per_mb
    print(f"Memory usage of dataframe is {start_memory} MB")

    # Map each float64 column to float32; every other column keeps its dtype.
    float64_columns = df.select_dtypes(include='float64').columns
    downcast_map = dict.fromkeys(float64_columns, np.float32)
    df = df.astype(downcast_map)

    end_memory = df.memory_usage().sum() / bytes_per_mb
    print(f"Memory usage of dataframe after reduction {end_memory} MB")

    reduction_pct = 100 * (start_memory - end_memory) / start_memory
    print(f"Reduced by {reduction_pct} % ")
    return df

<a id='percentile'></a>
<a href='#index'>back to index</a>
### Interquartile Range Method - IQR
A good statistic for summarizing a non-Gaussian distribution sample of data is the Interquartile Range, or IQR for short.
The IQR is calculated as the difference between the 75th and the 25th percentiles of the data and defines the box in a box and whisker plot.

In [None]:
from numpy import percentile

# Keep the values under their own name: rebinding `data` to a 1-D array would
# break the following cells, which call DataFrame methods on `data`.
feature_1_values = features['feature_1'].to_numpy()

# calculate interquartile range
q25, q75 = percentile(feature_1_values, [25, 75])
iqr = q75 - q25
#the cutoff for outliers as 1.5 times the IQR and subtract this cut-off from the 25th percentile 
#and add it to the 75th percentile to give the actual limits on the data.
print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25, q75, iqr))

# calculate the outlier cutoff
cut_off = iqr * 1.5
lower, upper = q25 - cut_off, q75 + cut_off

# identify outliers (boolean masks instead of Python-level loops over the array)
outliers = feature_1_values[(feature_1_values < lower) | (feature_1_values > upper)]
print('Identified outliers: %d' % len(outliers))
# remove outliers (values strictly inside the limits)
outliers_removed = feature_1_values[(feature_1_values > lower) & (feature_1_values < upper)]
print('Non-outlier observations: %d' % len(outliers_removed))

In [None]:
# IQR filter over a whole frame: keep only the rows in which no column lies
# outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# NOTE(review): assumes `data` is a DataFrame at this point -- confirm.
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outside_fences = (data < lower_fence) | (data > upper_fence)
data_out = data[~outside_fences.any(axis=1)]

In [None]:
def cut_off_outliers(data, target, lower=.05, upper=.95):
    """Drop, in place, the rows whose ``target`` value lies outside a quantile band.

    Rows are kept when ``data[target]`` lies between the ``lower`` and
    ``upper`` quantiles of that column (both bounds inclusive). Rows where
    the target is NaN are dropped as well, because ``between`` is False
    for them.

    Rows are removed by index label, so with duplicated index labels every
    row sharing a label with a dropped row is removed too.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is modified in place.
    target : str
        Column whose values decide which rows are dropped.
    lower, upper : float, default .05 and .95
        Quantiles delimiting the band of values to keep.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenience.
    """
    y = data[target]
    # True for the rows to KEEP; the mask is inverted below to find the rows to drop.
    within_range = y.between(y.quantile(lower), y.quantile(upper))
    print(within_range.value_counts())
    index = data[~within_range].index
    print('The result Outiers to drop ', len(index))
    data.drop(index, inplace=True)
    return data

<a id='remove_outliers_z_score'></a>
<a href='#index'>back to index</a>

### Remove Outliers Z-score
The Z-score is the signed number of standard deviations by which the value of an observation or data point is 
above the mean value of what is being observed or measured.
The intuition behind the Z-score is to describe any data point by its relationship to the Standard Deviation 
and Mean of the group of data points. Z-scoring rescales the data so that its mean is 0 and its standard deviation is 1 (standardization); it does not by itself make the data normally distributed.

In [None]:
from scipy.stats import zscore

# zscore(a, axis=0, ddof=0) returns the z-score of each value of `a`, column by column.
# Only `zscore` is imported above, so it is called directly (`stats.zscore`
# raised NameError because `stats` was never imported).
# NOTE(review): assumes `data` is a 2-D numeric DataFrame here; a column
# containing NaN yields NaN z-scores and fails the `< 3` test -- confirm.
z_scores = zscore(data)
abs_z_scores = np.abs(z_scores)  # absolute value of every z-score
# True for the rows in which every column is within 3 standard deviations.
filtered_entries = (abs_z_scores < 3).all(axis=1)
# Filter the frame the z-scores were computed from. The original indexed `df`,
# an unrelated frame with a different number of rows.
new_df = data[filtered_entries]

# for Pandas: the same filter without scipy. `z` was undefined in the original
# cell and the absolute value was missing, so large negative scores passed.
z = ((data - data.mean()) / data.std(ddof=0)).abs()
data = data[(z < 3).all(axis=1)]

<a id='lof'></a>
<a href='#index'>back to index</a>

### Automatic Outlier Detection
Local outlier factor - LOF

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Fit a Local Outlier Factor model on the training data; fit_predict labels
# each row and the rows labelled -1 are the ones discarded below.
# NOTE(review): `X_train` and `y_train` are not defined in this notebook.
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)

# Keep only the rows that were not labelled -1.
mask = yhat != -1
X_train = X_train[mask, :]
y_train = y_train[mask]

<a id='queries'></a>
<a href='#index'>back to index</a>

### Queries in Pandas


In [None]:
# Only the last bare expression of a cell is rendered, so intermediate results
# are shown explicitly (`display` is provided by the notebook kernel).

# Per column: True when the column contains at least one NaN.
all_columns_nan = features.isna().any()
display(all_columns_nan)

# The sub-frame made of the columns that contain at least one NaN.
nan_values = features[features.columns[features.isnull().any()]]
display(nan_values.head())

# Names of the columns whose NaN count exceeds 17.3 % of the rows.
nulls = features.isnull().sum()
nulls_list = list(nulls[nulls >(0.173 * len(features))].index)
display(nulls_list)

#df.loc[(df['Salary_in_1000']>=100) & (df['Age']< 60) & (df['FT_Team'].str.startswith('S')),['Name','FT_Team']]
# Rows in which feature_17 and feature_18 are both NaN.
nan_features=features[features['feature_17'].isna() & features['feature_18'].isna()]
nan_features

<a id='concat_cols'></a>
<a href='#index'>back to index</a>

### Concat Columns into new Column

In [None]:
def concat_columns(df, cols_to_concat, new_col_name, sep="-"):
    """Join several columns of ``df`` into one string column, row by row.

    The result is built in a local Series and assigned once at the end, so
    ``new_col_name`` may safely be one of the source columns. Previously the
    target column was overwritten before the later source columns were read.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    cols_to_concat : sequence of str
        Source columns, joined in the given order. With a single column the
        new column is a plain copy of it (no string conversion).
    new_col_name : str
        Name of the column that receives the result.
    sep : str, default "-"
        Separator placed between the joined values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Raises
    ------
    IndexError
        If ``cols_to_concat`` is empty.
    """
    joined = df[cols_to_concat[0]]
    for col in cols_to_concat[1:]:
        joined = joined.astype(str) + sep + df[col].astype(str)
    df[new_col_name] = joined
    return df