In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import gc

In [None]:
%%time
# Feature columns are named f_0 ... f_299.
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
# Identifier columns followed by the feature columns.
feature_column = ['investment_id', 'time_id' ] + features
# NOTE(review): unpickling can execute arbitrary code, so only load pickles
# that come from a trusted dataset.
# The dataset path suggests the frame is stored in half precision — TODO confirm dtypes.
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
# Preview the first rows as the cell output.
train.head()

In [None]:
# Report the (rows, columns) dimensions of the training frame.
print(train.shape)
#print(train.info())

In [None]:
# Histogram of the target column over the whole training frame.
plt.figure(figsize=(18, 6))
target_values = train['target']
sns.histplot(target_values)

Let us try to understand the data
1. There are 1211 distinct `time_id` values, ranging from 0 to 1219, so a few ids in that range never occur.
1. There are 3579 distinct `investment_id` values, ranging from 0 to 3773, so these ids are not contiguous either.

In [None]:
# For each identifier column print, in order: the number of distinct values,
# the smallest value and the largest value.
for id_column in ('time_id', 'investment_id'):
    id_values = train[id_column]
    print(id_values.nunique())
    print(id_values.min())
    print(id_values.max())

### let us check the role of time_id on the target and check if we have to consider it as an independent feature

In [None]:
# Per-time_id summaries: number of rows recorded in each time step and the
# mean / standard deviation of the target within it. The groupby object is
# built once and reused for all three aggregations.
time_groups = train.groupby('time_id')
time_grp_count = time_groups['investment_id'].count()
time_grp_mean = time_groups['target'].mean()
time_grp_std = time_groups['target'].std()

# Figure 1: the three summaries plotted against time_id, followed by the
# distribution of the per-time_id row counts.
plt.figure(figsize=(18,14))
plt.subplot(4,1,1)
time_grp_count.plot(title = 'plt of no of counts of investment id in each time id')
plt.legend()
plt.subplot(4,1,2)
time_grp_mean.plot(title = 'plt of mean of target in each time id')
plt.legend()
plt.subplot(4,1,3)
time_grp_std.plot(title='plt of std of target in each time id')
plt.legend()
plt.subplot(4,1,4)
time_grp_count.plot(kind='hist',bins=100)
plt.title('hist plot of count of investments in time')

# Figure 2: distributions of the per-time_id target mean and std.
# Both series are grouped by time_id, so the titles name time id
# (they previously said "investment id").
plt.figure(figsize=(18,12))
plt.subplot(2,1,1)
time_grp_mean.plot(kind='hist',bins=100)
plt.title('hist plot of target mean of each time id')
plt.subplot(2,1,2)
time_grp_std.plot(kind='hist',bins=100)
plt.title('hist plot of target std of each time id')


1. We can see that the count of investments in each time_id is fairly uniform, with a trend of increasing investments over time.
2. At certain times the number of investments drops to a very low value, and at those times the mean and std of the target increase a lot in magnitude. At other times the mean is nearly zero and the std is around 0.95.
3. Higher investment ids have a higher frequency of investment, meaning they are invested in a larger number of time_ids.

### let us check the role of investment_id on the target and check if we have to consider it as an independent feature

In [None]:
# Group once by investment_id and derive the three per-investment summaries:
# number of rows, target mean and target standard deviation.
by_investment = train.groupby('investment_id')
invest_grp_count = by_investment['time_id'].count()
invest_grp_mean = by_investment['target'].mean()
invest_grp_std = by_investment['target'].std()

# First figure: each summary drawn as a line over investment_id.
plt.figure(figsize=(18, 14))
line_panels = (
    (invest_grp_count, 'count of time_id in each investment id'),
    (invest_grp_mean, 'plot of mean of each investment id'),
    (invest_grp_std, 'plot of std of each investment id'),
)
for position, (series, panel_title) in enumerate(line_panels, start=1):
    plt.subplot(3, 1, position)
    series.plot()
    plt.title(panel_title)

# Second figure: the distribution of each summary as a 100-bin histogram.
plt.figure(figsize=(18, 14))
hist_panels = (
    (invest_grp_count, 'hist plot of time_id in each investment id'),
    (invest_grp_mean, 'hist plot of target mean of each investment id'),
    (invest_grp_std, 'hist plot of target std of each investment id'),
)
for position, (series, panel_title) in enumerate(hist_panels, start=1):
    plt.subplot(3, 1, position)
    hist_axes = series.plot(kind='hist', bins=100)
    plt.title(panel_title)
# The cell's value is the last histogram's Axes.
hist_axes

1. We can see that the count of time_ids in each investment_id is random, and that the target mean and target std within each investment_id are nearly zero and about 0.9 respectively.
2. The count of investments increases over time.
3. The histograms of the target mean and target std per investment_id follow a nearly normal distribution, with shifted mean and std.
4. From this plot and the previous time_id plot we can say that investment_id and time_id mostly behave in a similar fashion. In fact, they are interrelated.
5. We may need to consider only one of them in the modeling, preferably investment_id.

## Let us make a base model considering the pca on features f_0 to f_299

In [None]:
# Split the frame into the feature matrix (everything except the two id
# columns and the target) and the target vector.
X = train.drop(['time_id','investment_id','target'], axis = 1)
Y = train['target']

n_pca = 40
from sklearn.decomposition import PCA
# Seed the decomposition: with the default svd_solver='auto' scikit-learn may
# select the randomized solver for large inputs, in which case the components
# would differ from run to run without a fixed random_state.
pca = PCA(n_components=n_pca, random_state=123)
# Fit and project in one pass instead of fit() followed by transform().
X_pca = pca.fit_transform(X)
exp_var_ratio = pca.explained_variance_ratio_

# Explained variance per component (top) and cumulative (bottom), in percent.
plt.figure(figsize = (12,6))
plt.subplot(2,1,1)
plt.plot(exp_var_ratio*100)
plt.ylabel('explained variance (%)')
plt.subplot(2,1,2)
plt.plot(exp_var_ratio.cumsum()*100)
plt.xlabel('principal component')
plt.ylabel('cumulative explained variance (%)')

# NOTE(review): the component columns reuse the raw feature names
# (f_0 ... f_39) although they hold principal components, not raw features.
col = [f'f_{i}' for i in range(n_pca)]
# Carry over X's index so the components stay row-aligned with X and Y.
X_pca = pd.DataFrame(X_pca, columns = col, index = X.index)
X_pca.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso,LassoLars, BayesianRidge, TweedieRegressor, SGDRegressor, QuantileRegressor, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import pearsonr

In [None]:
# Hold out a shuffled 20% of the rows for validation, with a fixed seed.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=123,
)
X_train, X_val, Y_train, Y_val = split_parts
# Show the dimensions of the four pieces as the cell output.
tuple(part.shape for part in split_parts)

In [None]:
 #build and train the model
def build_model(X, Y, model=None):
    """Fit a regression model on the given data and return it.

    Parameters
    ----------
    X
        Training features, passed unchanged to ``model.fit``.
    Y
        Training target, passed unchanged to ``model.fit``.
    model : optional
        Unfitted estimator exposing ``fit(X, Y)``. When omitted, a plain
        ``LinearRegression()`` is used, so existing two-argument calls
        behave exactly as before.

    Returns
    -------
    The fitted estimator (the same object that was passed in, if any).
    """
    # Candidate estimators that used to be listed here as commented-out
    # options; pass one as ``model`` instead of editing this function:
    #   Ridge(alpha= 1, tol = 0.001)
    #   Lasso(alpha=0.0005, tol = 0.0001)
    #   ElasticNet(alpha=0.0001, l1_ratio=0.2)
    #   RandomForestRegressor(n_estimators=1000, min_samples_split=50,min_samples_leaf=50)
    #   DecisionTreeRegressor(min_samples_split=20, min_samples_leaf=10)
    #   GradientBoostingRegressor(learning_rate=0.1, n_estimators=200, subsample=1.0,
    #       criterion='friedman_mse', min_samples_split=20, min_samples_leaf=10, max_depth=5)
    #   TweedieRegressor(power=1, alpha=0.5, link='log')
    #   QuantileRegressor(alpha=0)
    #   SGDRegressor(loss='squared_error', penalty='l2', alpha=0.0001, l1_ratio=0.65, tol=0.001,)
    #   BayesianRidge( n_iter=800, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06)
    if model is None:
        model = LinearRegression()
    model.fit(X, Y)
    return model

# predict the output and score
def predict(model, X_val, Y_val):
    """Score a fitted model on validation data with the Pearson correlation.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict(X)``.
    X_val
        Validation features, passed unchanged to ``model.predict``.
    Y_val
        True target values for the rows of ``X_val``.

    Returns
    -------
    The Pearson correlation coefficient between ``Y_val`` and the model's
    predictions.
    """
    prediction = model.predict(X_val)
    # pearsonr returns (coefficient, p-value); only the coefficient is used.
    # The R^2 score that was previously computed here was never returned or
    # printed, so that extra pass over the validation set has been dropped.
    return pearsonr(Y_val, prediction)[0]

# Fit the baseline on the training split, then print the Pearson
# correlation that predict() returns for the validation split.
model = build_model(X_train, Y_train)
pearson_coef = predict(model, X_val, Y_val)
print(pearson_coef)

In [None]:
# Load the example test file and report its dimensions.
test = pd.read_csv('../input/test-data/example_test.csv')
print(test.shape)
# Remove the identifier columns so only the model inputs remain.
X_test = test.drop(['row_id','time_id','investment_id'],axis=1)
# Last expression of the cell, so this preview is what gets displayed.
# (A stray bare `print` used to follow it and replaced the preview with the
# repr of the built-in function.)
X_test.head()

In [None]:
# preprocess test dataset
#def preprocess_test(test):
#    #test_pca = pca.transform(test)
#    #col = [f'f_{i}' for i in range(n_pca)]
#    #test_pca_df = pd.DataFrame(test_pca, columns = col)
#    return test
#test_pca_df = preprocess_test(X_test)
#test_pca_df.head()

def predict_test(model, test):
    """Return ``model``'s predictions for the rows of ``test``."""
    return model.predict(test)
# Predict on the example test features; the bare name on the last line
# displays the predictions as the cell output.
test_pred = predict_test(model, X_test) 
test_pred

In [None]:
# Fill the example submission template with the predictions made above.
sample_submission = pd.read_csv('../input/test-data/example_sample_submission.csv')
sample_submission['target'] = test_pred
# index=False keeps the file to the template's own columns; the default
# would prepend the DataFrame index as an extra unnamed column.
sample_submission.to_csv('sample_submission.csv', index=False)

In [None]:
import ubiquant
env = ubiquant.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
# Columns the model was fitted on, in training order.
model_columns = list(X_train.columns)
for (test_df, sample_prediction_df) in iter_test:
    # Select the inputs by name instead of dropping a fixed list of id
    # columns: any extra column delivered by the API (for example time_id,
    # which the offline example file contains) would otherwise reach
    # model.predict and no longer match the training columns.
    test_features = test_df[model_columns]
    sample_prediction_df['target'] = model.predict(test_features)  # make your predictions here
    env.predict(sample_prediction_df)   # register your predictions

## Including investment_id and time_id in the features

1. This basic model gave a very low Pearson score, around 0.12. We can improve the score by including investment_id among the independent features.
2. It can be included by creating new features with the groupby method.


In [None]:
#train.groupby(by='investment_id')['target'].mean().to_dict()
#train['invest_feature'] = train['investment_id'].map(train.groupby(by='investment_id')['target'].mean().to_dict())


In [None]:
#X_pca_mod = pd.concat([X_pca, train['invest_feature']],axis=1)
#X_pca_mod.head()


In [None]:
#X_train, X_val, Y_train, Y_val = train_test_split(X_pca_mod, Y, test_size=0.2, shuffle=True, random_state=123)
#print(X_train.shape, X_val.shape, Y_train.shape, Y_val.shape)

## build and train the model
#def build_model(X,Y):
#    model = LinearRegression()
#    #model = Ridge(alpha= 1, tol = 5)
#    #model = RandomForestRegressor(n_estimators=1000, min_samples_split=50,min_samples_leaf=50)
#    #model = DecisionTreeRegressor()
#    model.fit(X,Y)
#    return model

## predict the output and score
#def predict(model, X_val,Y_val):
#    prediction = model.predict(X_val)
#    score = r2_score(Y_val, prediction)
#   pearson_score = pearsonr(Y_val, prediction)[0]
 #   return pearson_score
#
#model = build_model(X_train, Y_train)
#pearson_coef = predict(model, X_val, Y_val)
#print(pearson_coef)

In [None]:
#invest_feature = test['investment_id'].map(train.groupby(by='investment_id')['target'].mean().to_dict())
#invest_feature_df = pd.DataFrame(invest_feature).rename(columns={'investment_id':'invest_feature'})
#invest_feature_df

In [None]:
## preprocess test dataset
#def preprocess_test(X):
#    test_pca = pca.transform(X)
#    col = [f'f_{i}' for i in range(75)]
#    test_pca_df = pd.DataFrame(test_pca, columns = col)
#   test_pca_df_mod = pd.concat([test_pca_df, invest_feature_df], axis=1)
#    return test_pca_df_mod
#test_pca_df_mod = preprocess_test(X_test)
#print(test_pca_df_mod.shape)
#
#def predict_test(model, X):
#    test_pred = model.predict(X)
#    return test_pred
#test_pred = predict_test(model, test_pca_df_mod) 
#test_pred

In [None]:
#sample_submission = pd.read_csv('../input/test-data/example_sample_submission.csv')
#sample_submission['target'] = test_pred
#sample_submission
#sample_submission.to_csv('sample_submission.csv')

In [None]:
#import ubiquant
#env = ubiquant.make_env()   # initialize the environment
#iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
#for (test_df, sample_prediction_df) in iter_test:
#    #print(test_df)
#    invest_feature = test_df['investment_id'].map(train.groupby(by='investment_id')['target'].mean().to_dict())
#    invest_feature_df = pd.DataFrame(invest_feature).rename(columns={'investment_id':'invest_feature'})
#    #print(invest_feature_df)
#    test_df = test_df.drop(['row_id','investment_id'],axis=1)
#    test_pca = pca.transform(test_df)
#    test_pca_df = pd.DataFrame(test_pca, columns = [f'f_{i}' for i in range(75)])
#    #print(test_pca_df)
#    test_pca_df_mod = pd.concat([test_pca_df, invest_feature_df],axis=1)
#    sample_prediction_df['target'] = model.predict(test_pca_df_mod)  # make your predictions here
#    #print(sample_prediction_df)
#    env.predict(sample_prediction_df)   # register your predictions

In [None]:
#invest_feature = train['investment_id'].map(train.groupby(by='investment_id')['target'].mean().to_dict())
#invest_feature.rename(columns={'investment_id': 'invest_feature'})
#invest_feature = pd.DataFrame(invest_feature).rename(columns={'investment_id':'invest_feature'})

#invest_feature