# Goals: 

## 1) Reduce the memory usage in processing Jane Street Data.

## 2) Look at overall market trends by date using mean and sum.

## 3) Use conditional mean by date to impute missing values.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
from scipy import stats
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
%matplotlib inline



In [None]:
# Read only the first five rows of the training file; this small sample is
# enough to inspect the column names and the inferred dtypes.
train_temp = pd.read_csv(
    "../input/jane-street-market-prediction/train.csv",
    nrows=5,
)


# Reduce Memory Use Technique by Down Casting
Special thanks to https://www.kaggle.com/akosciansky/how-to-import-large-csv-files-and-save-efficiently for the downcasting approach.

In [None]:
# Get information on the datatypes (per-column dtype, non-null count and the
# memory usage of the sample frame)
train_temp.info()

In [None]:
# Keep the column names as a plain list and show how many columns there are.
train_cols = train_temp.columns.tolist()
len(train_cols)

In [None]:
# Find out the smallest data type possible for each numeric feature.
# Both column subsets are captured before any conversion, so each loop works
# from the dtypes that read_csv originally inferred.
float_cols = train_temp.select_dtypes(include=['float'])
int_cols = train_temp.select_dtypes(include=['int'])

for col in float_cols.columns:
    train_temp[col] = pd.to_numeric(train_temp[col], downcast='float')

for col in int_cols.columns:
    train_temp[col] = pd.to_numeric(train_temp[col], downcast='integer')

# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
train_temp.info()

In [None]:
# Map every column name to float32 as the starting dtype.
train_cols_dict = dict.fromkeys(train_cols, 'float32')

In [None]:
# Names of the columns that were selected as integer-typed in the sample.
int_cols_names = int_cols.columns.tolist()
int_cols_names

In [None]:
# Give these two columns integer dtypes in the mapping.
train_cols_dict.update({'feature_0': 'int8', 'ts_id': 'int32'})

In [None]:
# Display the column -> dtype mapping.
train_cols_dict

In [None]:
# Load the training file with the dtypes from train_cols_dict applied while
# parsing, restricted to the columns listed in train_cols.
train = pd.read_csv(
    "../input/jane-street-market-prediction/train.csv",
    dtype=train_cols_dict,
    usecols=train_cols,
)

In [None]:
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
train.info()

In [None]:
# Last column of train, kept as a one-column DataFrame.
ID = train.iloc[:, -1:]

In [None]:
# Display the one-column ID frame.
ID

In [None]:
# First two columns of train.
date_weight = train.iloc[:, :2]

In [None]:
# Columns from position 6 up to, but not including, the last column
# (presumably resp followed by the feature columns -- confirm against the CSV
# column order).
features_resp = train.iloc[:, 6:-1]

In [None]:
# Stitch the column slices back together on their shared row index:
# first date_weight + features_resp, then the ID column in front.
updated_train = date_weight.join(features_resp, how='inner')
updated_train2 = ID.join(updated_train, how='inner')

In [None]:
# Display the first 5588 rows.
updated_train2.head(5588)

In [None]:
# Display the reassembled training frame.
updated_train2

In [None]:

# Narrow the frame to the columns used for the trend analysis, then take the
# mean of resp for each date.
trend = updated_train2[['date', 'resp', 'feature_0']]
df1 = trend.groupby('date')['resp'].mean()

# Plotting overall market trends by date using mean and sum

In [None]:
# Plot with subplots: the per-date mean of resp over all dates
df1.plot(subplots=True)
plt.show()

In [None]:
# date is loaded as float32, so the grouped index is float-typed and
# df1[0:100] is a label-based slice that pandas has deprecated. .loc selects
# the same dates (both endpoints included) explicitly.
df1.loc[0:100].plot(subplots=True)

In [None]:
# date is loaded as float32, so the grouped index is float-typed and
# df1[100:200] is a label-based slice that pandas has deprecated. .loc selects
# the same dates (both endpoints included) explicitly.
df1.loc[100:200].plot(subplots=True)

In [None]:
# date is loaded as float32, so the grouped index is float-typed and
# df1[200:300] is a label-based slice that pandas has deprecated. .loc selects
# the same dates (both endpoints included) explicitly.
df1.loc[200:300].plot(subplots=True)

In [None]:
# date is loaded as float32, so the grouped index is float-typed and
# df1[300:400] is a label-based slice that pandas has deprecated. .loc selects
# the same dates (both endpoints included) explicitly.
df1.loc[300:400].plot(subplots=True)

In [None]:
# date is loaded as float32, so the grouped index is float-typed and
# df1[400:500] is a label-based slice that pandas has deprecated. .loc selects
# the same dates (both endpoints included) explicitly.
df1.loc[400:500].plot(subplots=True)

In [None]:
# Sum of resp for each date.
df2 = trend.groupby('date')['resp'].sum()

In [None]:
# Plot with subplots: the per-date sum of resp over all dates
df2.plot(subplots=True)
plt.show()

In [None]:
# date is loaded as float32, so the grouped index is float-typed and
# df2[0:100] is a label-based slice that pandas has deprecated. .loc selects
# the same dates (both endpoints included) explicitly.
df2.loc[0:100].plot(subplots=True)

In [None]:
# date is loaded as float32, so the grouped index is float-typed and
# df2[100:200] is a label-based slice that pandas has deprecated. .loc selects
# the same dates (both endpoints included) explicitly.
df2.loc[100:200].plot(subplots=True)

In [None]:
# date is loaded as float32, so the grouped index is float-typed and
# df2[200:300] is a label-based slice that pandas has deprecated. .loc selects
# the same dates (both endpoints included) explicitly.
df2.loc[200:300].plot(subplots=True)

In [None]:
# date is loaded as float32, so the grouped index is float-typed and
# df2[300:400] is a label-based slice that pandas has deprecated. .loc selects
# the same dates (both endpoints included) explicitly.
df2.loc[300:400].plot(subplots=True)

In [None]:
# date is loaded as float32, so the grouped index is float-typed and
# df2[400:500] is a label-based slice that pandas has deprecated. .loc selects
# the same dates (both endpoints included) explicitly.
df2.loc[400:500].plot(subplots=True)

In [None]:
# Grand total of the per-date resp sums.
df2.sum()

# Time Series Analysis EDA 

Trend of gains over time. 


# Aggregated Returns Over Time Trends and Seasonality

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Running total of the per-date resp sums.
df3 = pd.DataFrame(df2.copy())
df3['resp'] = df3['resp'].cumsum()

# `period` is the current name of the seasonal-length argument; the older
# `freq` keyword was deprecated and is rejected by recent statsmodels releases.
result = seasonal_decompose(df3, model="add", period=88)
fig = result.plot()

#result = seasonal_decompose(df1, period = 88)
#df1["trend"]=result.trend
#df1["seasonal"]=result.seasonal

In [None]:
# Additive seasonal effect ("add"): the peaks and valleys are somewhat similar over time.
# Multiplicative seasonal effect ("mul"): the peaks and valleys increase over time.
# `period` is the current name of the seasonal-length argument; the older
# `freq` keyword was deprecated and is rejected by recent statsmodels releases.
result = seasonal_decompose(df3, model="add", period=88)
df3["trend"] = result.trend
df3["seasonal"] = result.seasonal

In [None]:
# Plot every column of df3 (the cumulative resp series plus the trend and
# seasonal components added to it) on one wide chart.
df3.plot(figsize = (14,6), grid = True);

# Returns Trends and Seasonality

In [None]:

# Work on the per-date resp sums themselves (no cumulative sum applied).
df4 = pd.DataFrame(df2.copy())

# `period` is the current name of the seasonal-length argument; the older
# `freq` keyword was deprecated and is rejected by recent statsmodels releases.
result = seasonal_decompose(df4, model="add", period=88)
fig = result.plot()

In [None]:
# `period` is the current name of the seasonal-length argument; the older
# `freq` keyword was deprecated and is rejected by recent statsmodels releases.
result = seasonal_decompose(df4, model="add", period=88)
# Attach the components next to the raw series so they plot on one chart.
df4["trend"] = result.trend
df4["seasonal"] = result.seasonal
df4.plot(figsize=(14, 6), grid=True);

# Find NaN Values

In [None]:
### Explore NaN values ###

# Per-column NaN counts laid out as a two-column table.
nans = updated_train2.isnull().sum(axis=0).reset_index()
nans.columns = ['column', 'nans_num']

# high_nans: columns with more than 100000 missing values (taken from the full
# table); nans is then narrowed to the columns with at least one missing value.
high_nans = nans[nans['nans_num'] > 100000]
nans = nans[nans['nans_num'] > 0]
print(high_nans.sort_values(by=['nans_num'], ascending=False))

# Replace NaN Values with Column Means (Commented Out)

In [None]:
# Flatten the 'column' field of the NaN table into a plain list of strings.
nans_list = nans[['column']]
arr = nans_list.astype(str).to_numpy().ravel()
nans_list2 = arr.tolist()
nans_list2

In [None]:
# Replace the NaNs in each column listed in nans_list2 with that column's
# overall mean (disabled; kept for reference)
#updated_train2[nans_list2] = updated_train2[nans_list2].fillna(value=updated_train2[nans_list2].mean())
#print(updated_train2)

In [None]:
# Count the missing values in each column.
updated_train2.isnull().sum(axis = 0)

# Replace NaN's with Conditional Means based on Column and Date

In [None]:
# Mean of every NaN-containing column, computed separately for each date.
means = updated_train2.groupby(['date'])[nans_list2].mean()
# Index the rows by date so that fillna can line each row up with the row of
# `means` for the same date (fillna with a DataFrame is expected to match on
# index labels and column names -- verify with the audit cell that follows).
updated_train2 = updated_train2.set_index(['date'])
updated_train2[nans_list2] = updated_train2[nans_list2].fillna(means)
# Turn date back into a regular column; note that reset_index inserts it as
# the first column, so the column order differs from before this cell.
updated_train2 = updated_train2.reset_index()
print(updated_train2)

In [None]:
# Re-count the missing values in each column after the per-date imputation.
updated_train2.isnull().sum(axis = 0)

In [None]:
# Display the per-date means that were used as fill values.
means

In [None]:
### Audit conditional means to confirm they remain the same:
### summary statistics for the rows of a single date (date == 3), to compare
### against that date's row in `means`
updated_train2[updated_train2['date']==3].describe()

# Splitting Data into 5 Folds Based on Date to Get More Variation in Conditional Means:
This is not complete yet. The idea is to comment out the conditional mean imputation above and instead split the data into 5 folds based on date before calculating the conditional means, to get more variation since so many values are missing.

In [None]:
# Downcast the date column to an integer dtype where possible, then re-check
# the frame's dtypes.
updated_train2['date'] = pd.to_numeric(updated_train2['date'], downcast='integer')
updated_train2.info()

In [None]:
# Use stratified k-fold with date as the stratification label to create
# multiple datasets with the date structure in place. date is passed as both
# X and y.
X = updated_train2.date
y = updated_train2.date
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# Keep only the test-side row positions of each split.
test_list = [test_ix for _, test_ix in kfold.split(X, y)]


    


In [None]:
# Build one DataFrame per fold from the row positions stored in test_list.
# Iterating over test_list keeps the number of frames tied to the number of
# splits instead of a hard-coded 5, and each index array is handed to iloc
# directly rather than first being copied into a Python list.
train_df_list = [updated_train2.iloc[fold_ix] for fold_ix in test_list]
    
    

In [None]:
# Display the second fold's DataFrame (index 1).
train_df_list[1]