In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew , kurtosis , f_oneway
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison  
from statsmodels.tsa.seasonal import seasonal_decompose
from dateutil.parser import parse
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing 

# LOAD DATA SET

In [None]:
# Column -> dtype map for the subset of train.csv that gets loaded.
# int8 can only represent -128..127, far too narrow for identifier columns
# (the Ubiquant IDs run into the thousands), so values would wrap around or
# fail to parse depending on the pandas version. int32 is wide enough, and two
# integer columns are negligible next to the 300 float32 feature columns.
data_types_dict = {
    'time_id': 'int32',
    'investment_id': 'int32',
    'target': 'float32',
}
features = [f'f_{i}' for i in range(300)]
for f in features:
    data_types_dict[f] = 'float32'

# Only the columns named above are read, capped at the first 1,000,000 rows.
# NOTE(review): with `usecols` set, `index_col=0` appears to pick the first of
# the *selected* columns (time_id), not the file's first column - confirm that
# a time_id index is what is wanted.
data = pd.read_csv('/kaggle/input/ubiquant-market-prediction/train.csv',
                   usecols=list(data_types_dict),
                   dtype=data_types_dict,
                   index_col=0, nrows=1000000)

In [None]:
# Preview the first five rows as a quick sanity check of the load.
data.head()

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()

In [None]:
# Confirm that the dtypes requested at load time were applied.
data.dtypes

In [None]:
# Frame overview: index range, column dtypes and memory footprint.
data.info()

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

# STATISTICAL ANALYSIS

In [None]:
# Arithmetic mean of every column (one value per column).
mean = data.mean(axis=0)
mean

In [None]:
# Median of every column (one value per column).
median = data.median(axis=0)
median

In [None]:
# Put both measures of central tendency side by side, one row per column of `data`.
data_central = pd.DataFrame(dict(mean=mean, median=median))
data_central.head()

In [None]:
# Kernel density estimate of the per-column means and medians, zoomed to [-0.5, 0.5].
data_central.plot.density(figsize=(10, 10), xlim=(-0.5, 0.5))


In [None]:
# Most frequent value(s) of each column; a column with several modes yields several rows.
# NOTE(review): for continuous float features the mode is rarely informative and
# may be slow on a frame of this size - consider limiting it to discrete columns.
data.mode()

In [None]:
# Skewness of each column (asymmetry of its distribution).
data_skew = data.skew(axis=0)
data_skew

In [None]:
# Kurtosis of each column (tail heaviness), via the `kurt` alias of `kurtosis`.
data_kurt = data.kurt()
data_kurt

In [None]:
# Collect both shape statistics in one frame, one row per column of `data`.
shape_columns = {'Skewed data': data_skew, 'Kurtosis data': data_kurt}
data_shape = pd.DataFrame(shape_columns)
data_shape.head()

In [None]:
# Kernel density estimate of the per-column skewness and kurtosis, zoomed to [-10, 10].
data_shape.plot.density(figsize=(10, 10), xlim=(-10, 10))

In [None]:
data.var() # Variance of each column

In [None]:
data.std() # Standard deviation of each column

# ANOVA TEST: ONE-WAY ANOVA

In [None]:
# One-way ANOVA: does the mean target differ between investments?
# f_oneway expects one sample per group, so the target values are split by
# investment_id. Passing the ID column and the target column as the two samples
# would instead compare the mean of the IDs with the mean of the targets,
# which says nothing about a group effect.
target_by_investment = [
    group.to_numpy()
    for _, group in data.groupby('investment_id')['target']
]
fvalue, pvalue = f_oneway(*target_by_investment)
print(fvalue, pvalue)

# TIME SERIES FORECASTING

In [None]:
    # Green line plot of target against investment_id, in row order.
    x = data['investment_id']
    y = data['target']
    plt.figure(figsize=(15, 4), dpi=100)
    plt.plot(x, y, color='green')
    # Label the current axes one property at a time.
    plt.title('Investment_id vs Target')
    plt.xlabel('investment')
    plt.ylabel('target')
    plt.show()


In [None]:
# Additive decomposition of the target into trend, seasonal (period of 30
# observations) and residual components.
# NOTE: despite its name this variable holds an *additive* decomposition; the
# name is kept so that any later cell referring to it keeps working.
mutiplicative_decomposition = seasonal_decompose(data['target'], model='additive', period=30)

# DecomposeResult.plot() builds its own figure, so a preceding plt.figure(...)
# only leaves an empty extra figure behind. Size and title the returned
# figure instead.
decomposition_fig = mutiplicative_decomposition.plot()
decomposition_fig.set_size_inches(16, 12)
decomposition_fig.suptitle('Additive Decomposition  ', fontsize=16)
plt.show()

In [None]:
# Autocorrelation (left) and partial autocorrelation (right) of the target,
# up to 50 lags.
fig, ax = plt.subplots(1, 2, figsize=(16, 3), dpi=100)
# Use the underlying array once instead of building two Python lists of
# every target value.
target_values = data['target'].to_numpy()
plot_acf(target_values, lags=50, ax=ax[0])
plot_pacf(target_values, lags=50, ax=ax[1])
# The plotting helpers return the Figure; ending the cell on that expression
# would echo the figure on top of the inline rendering, so finish with show().
plt.show()

# CORRELATION MATRIX

In [None]:

# Feature columns are those whose name contains "f_".
data_features = data.filter(regex="f_")
# Correlating ~300 columns over every row is expensive; a reproducible 1%
# sample is drawn for it. The sample was previously created but never used,
# with the correlation computed on the full frame instead.
sample_data = data.sample(frac=0.01, random_state=0)
correlation = sample_data[["target"] + list(data_features.columns)].corr()



In [None]:
# Hierarchically clustered heatmap of the target/feature correlation matrix.
sns.clustermap(data=correlation)
plt.show()

# DATA VISUALIZATION 

In [None]:
# Histogram of the target with a KDE curve overlaid.
sns.histplot(data['target'], kde=True)
plt.title("Target Distribution")
plt.show()

In [None]:
# Distribution of the number of target observations per investment_id.
# sns.distplot is deprecated; histplot (already used for the target
# distribution) replaces it. stat='density' keeps the density-normalised
# y-axis that distplot drew together with its KDE curve.
rows_per_investment = data.groupby('investment_id').target.count()
sns.histplot(rows_per_investment, kde=True, stat='density')
plt.title('Distribution plot along Investment ')
plt.show()

# STANDARDIZE THE DATA

In [None]:
# All 300 feature columns f_0 .. f_299, matching the columns requested at load
# time (range(299) silently dropped f_299).
features = [f'f_{i}' for i in range(300)]
target = 'target'
data_features = data[features]
# Standardize every feature to zero mean and unit variance. From here on
# `data_features` is a NumPy array, not a DataFrame.
data_features = StandardScaler().fit_transform(data_features)


# DIMENSION REDUCTION USING PCA

In [None]:
# Dimension reduction: keep the first 50 principal components.
# random_state makes the projection reproducible across runs in case
# scikit-learn selects a randomized SVD solver for an input of this size.
pca = PCA(n_components=50, random_state=0)

In [None]:
# Fit the PCA on the standardized features and wrap the projected components
# in a DataFrame (default integer column labels and index).
data_pca = pd.DataFrame(data=pca.fit_transform(data_features))
data_pca.head(10)

In [None]:
# (rows, components) of the PCA-projected data.
data_pca.shape

# Splitting Data into Training and Testing Sets

In [None]:
# train_test_split returns the splits in the order of its inputs, so the
# feature matrix (PCA components) goes first and the target second. With the
# arguments the other way round, x_* held the target and y_* the features.
# shuffle=False keeps row order: the first 70% of rows train, the last 30% test.
x_train, x_test, y_train, y_test = train_test_split(
    data_pca, data['target'], test_size=0.3, shuffle=False
)

In [None]:
# Report the sizes of the four split parts (x_train, y_train, x_test, y_test),
# one per line.
print(
    f'length of training set {len(x_train)}',
    f'length of training set {len(y_train)}',
    f'length of test set {len(x_test)}',
    f'length of test set {len(y_test)}',
    sep='\n',
)

# LOGISTIC REGRESSION