In [None]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt 
from sklearn import preprocessing
import tensorflow as tf
from sklearn import svm

In [None]:
# Load the feature metadata and the training table, discarding every row that
# contains at least one missing value.
dffeatures = pd.read_csv('../input/jane-street-market-prediction/features.csv').dropna()  # features
data = pd.read_csv('../input/jane-street-market-prediction/train.csv').dropna()  # train

# Reproducible 60/40 random split: the sampled rows train the model, the rest
# are held out for evaluation.
dftrain = data.sample(frac=0.6, random_state=200)
dfeval = data.drop(index=dftrain.index)

In [None]:
# Example test set, restricted to rows without missing values.
dftest = pd.read_csv('../input/jane-street-market-prediction/example_test.csv').dropna()  # test

In [None]:
# Example submission file; it is joined on / indexed by 'ts_id' and carries an
# 'action' column further down the notebook.
dfexample_sample_submission = pd.read_csv(
    '../input/jane-street-market-prediction/example_sample_submission.csv'
)  # example_sample_submission

In [None]:
# Build the training frame, ordered by ts_id, with an 'action' target column.
#
# The previous version inner-joined the training rows with
# example_sample_submission.csv on ts_id and used that file's 'action' column
# as the label. That keeps only training rows whose ts_id happens to occur in
# the submission file and labels them with whatever that file contains.
# NOTE(review): the sample submission looks like a placeholder template for the
# example test set rather than ground truth for train.csv -- confirm against the
# competition data description.
#
# Here the label is derived from the training data itself: trade (1) when the
# realised return 'resp' is positive, pass (0) otherwise.
# NOTE(review): 'resp > 0' is the assumed labelling rule; adjust if a
# weight-aware or multi-horizon target is intended.
dftrain_processed = dftrain.sort_values(by=['ts_id']).copy()
dftrain_processed['action'] = (dftrain_processed['resp'] > 0).astype('int64')

In [None]:
# Split the target off the training frame: keep the 'action' column as Y_train
# and remove it from dftrain_processed in place.
Y_train = dftrain_processed['action']
del dftrain_processed['action']

In [None]:
# Keep the date and ts_id columns aside, then strip every remaining column
# that must not be fed to the scaler/PCA (sample weight and the resp* columns).
stored_date = dftrain_processed.pop('date')
stored_ts_id = dftrain_processed.pop('ts_id')
dftrain_processed.drop(
    columns=['weight', 'resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'],
    inplace=True,
)

In [None]:
# Standardise every remaining training column (zero mean, unit variance per
# column); the result is the input of the PCA below.
scaled_data = preprocessing.scale(dftrain_processed)

# Transposed copy of the raw training split (not referenced again in the
# visible part of the notebook).
dftrain_transposed = dftrain.transpose()

In [None]:
# Fit a PCA that keeps all components on the standardised training data, then
# project that same data onto the principal axes.
pca = PCA().fit(scaled_data)
pca_data = pca.transform(scaled_data)

In [None]:
# Share of the total variance explained by each component, in percent,
# rounded to one decimal place.
per_var = np.round(100 * pca.explained_variance_ratio_, decimals=1)

# One label per input column of the scaled data: 'PC1', 'PC2', ...
labels = ['PC{}'.format(k) for k in range(1, scaled_data.shape[1] + 1)]

In [None]:
# Wrap the projected samples in a DataFrame (default integer row and column
# labels: one row per sample, one column per component).
pca_df = pd.DataFrame(data=pca_data)

In [None]:
# Colour assigned to the first two principal-component labels.
label_color_dict = {'PC1': 'red', 'PC2': 'green'}

# Colour vector: the five-colour palette repeated 26 times (130 entries).
# Built by list repetition rather than a `for iter in ...` loop, because using
# `iter` as a loop variable at notebook scope rebinds the builtin `iter` for
# every later cell.
cvec = ['red', 'green', 'blue', 'yellow', 'black'] * 26

In [None]:
# Scree plot: explained-variance percentage per principal component, drawn as
# bars with a red line overlay; the x-axis is limited to the range 0..10.
scree_x = range(1, len(scaled_data.T) + 1)

plt.bar(x=scree_x, height=per_var, tick_label=labels)
plt.plot(list(scree_x), per_var, color='red')

plt.title('Scree plot')
plt.xlabel('Principal Component')
plt.ylabel('Percentage of explained variance')

axes = plt.gca()
axes.set_xlim([0, 10])
plt.show()

In [None]:
# Scatter of the training samples in the PC2 / PC3 plane.
# pca_df holds one row per sample and one column per component (0-based column
# labels), so columns 1 and 2 are the second and third components -- matching
# the per_var[1] / per_var[2] percentages shown in the axis labels. The earlier
# `.loc[1]` / `.loc[2]` selected two *rows* (two individual samples) instead.
# Points are coloured by their training label so the plot has one colour value
# per sample.
plt.scatter(pca_df[1], pca_df[2], c=Y_train.values)
plt.title('My PCA graph')
plt.xlabel('PC2 -{0}%'.format(per_var[1]))
plt.ylabel('PC3 -{0}%'.format(per_var[2]))

In [None]:
# Model input: the first six principal components of the training samples,
# exposed as columns feature_0 .. feature_5.
# Note: this rebinds `data`, which previously referred to the loaded training table.
data = {'feature_{}'.format(k): pca_data.T[k] for k in range(6)}
X_train = pd.DataFrame(data)

In [None]:
# Small feed-forward binary classifier: one hidden ReLU layer of 7 units and a
# single sigmoid output unit.
#
# The output is one sigmoid unit (not a 2-way softmax) because the notebook
# trains with 'binary_crossentropy' on a single 0/1 target column and reads
# Y_test[0][0] as the probability to round into the action. With two softmax
# outputs that loss pushes both outputs toward the same label although they
# must sum to 1, so the network cannot learn the target.
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(7, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy
# reported as the metric.
model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for five passes over the PCA features and their labels.
model.fit(x=X_train, y=Y_train, epochs=5)

In [None]:
_scaler_cache = {}


def _training_scaler():
    """Return a StandardScaler fitted (once) on the notebook-level training features."""
    if 'scaler' not in _scaler_cache:
        # dftrain_processed is the module-level frame that `scaled_data` (the
        # PCA input) was computed from, so fitting on it reproduces the
        # training-time centring and scaling.
        _scaler_cache['scaler'] = preprocessing.StandardScaler().fit(dftrain_processed.values)
    return _scaler_cache['scaler']


def predict(data, pca, model, scaler=None):
    """Predict the action (0 or 1) for the first row of a test frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Test rows containing 'date' and 'weight' columns plus the feature
        columns. Missing values are replaced by 0.0. The caller's frame is not
        modified.
    pca : fitted PCA
        Projection fitted on the standardised training features.
    model : fitted Keras model
        Classifier taking the first six principal components as input.
    scaler : object with ``transform``, optional
        Scaler fitted on the training features. When omitted, a StandardScaler
        fitted on the module-level ``dftrain_processed`` frame is used.

    Returns
    -------
    int
        ``model.predict(...)[0][0]`` rounded to the nearest integer.

    Raises
    ------
    KeyError
        If 'date' or 'weight' is missing from ``data``.
    """
    data = data.fillna(0.0)
    features = data.drop(columns=['date', 'weight'])

    # Standardise with the *training* statistics. Scaling the incoming frame
    # with its own mean/std (preprocessing.scale) maps a single-row frame to
    # all zeros, which makes every prediction identical.
    if scaler is None:
        scaler = _training_scaler()
    scaled_data = scaler.transform(features.values)

    pca_test_data = pca.transform(scaled_data)

    # Same six-component layout the model was trained on.
    X_test = pd.DataFrame(
        {'feature_%d' % k: pca_test_data[:, k] for k in range(6)}
    )
    Y_test = model.predict(X_test)
    return int(round(Y_test[0][0]))

In [None]:
# Empty submission table with the two expected columns.
submission_columns = ['ts_id', 'action']
df_submission = pd.DataFrame(columns=submission_columns)

In [None]:
import janestreet

env = janestreet.make_env()  # initialize the environment
iter_test = env.iter_test()  # an iterator which loops over the test set

# Predict each test batch, hand the prediction to the environment and record
# it. Rows are collected in a plain list and turned into a DataFrame once at
# the end: DataFrame.append inside the loop copies the whole frame on every
# iteration and no longer exists in pandas >= 2.0.
submission_rows = []
for i, (test_df, sample_prediction_df) in enumerate(iter_test):
    prediction = predict(test_df, pca, model)
    sample_prediction_df.action = prediction

    # Mirror the prediction into the example submission frame. `.loc` writes
    # into the frame itself (the chained `frame.action[i] = ...` form assigns
    # through a temporary Series); positions beyond the example file are
    # skipped so the frame is not enlarged row by row.
    if i in dfexample_sample_submission.index:
        dfexample_sample_submission.loc[i, 'action'] = prediction

    # NOTE(review): 'ts_id' is the running batch counter, not an id read from
    # the test data -- confirm that this is what the submission should carry.
    submission_rows.append({'ts_id': i, 'action': prediction})
    env.predict(sample_prediction_df)

df_submission = pd.DataFrame(submission_rows, columns=['ts_id', 'action'])

In [None]:
# Persist the collected predictions without the DataFrame index column.
df_submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Display the submission table that was written to submission.csv.
df_submission