**Steps of this notebook**
* Import the required libraries
* Build the helper functions that are reused for visualization
* Load the dataset and run some basic data analysis (null counts, std and mean)
* Remove the outliers from the features
* Remove one feature from every pair whose correlation is above 0.8, to avoid misleading the model
* Train the model using the LightGBM (LGBM) algorithm
* Predict on the test data


# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.svm import SVR
import time
from multiprocessing import Process, Pool
from scipy.stats import pearsonr

# Seed NumPy's global random state once so random steps that rely on it repeat across runs
seed = 42
np.random.seed(seed)

# Build Functions

## Draw features to detect Normal distribution of features

In [None]:
def draw_hist_features(df, fig_size = 10, from_st = 0):
    """Plot a 100-bin histogram for a run of consecutive ``f_<i>`` columns.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``f_<from_st>`` .. ``f_<from_st + fig_size - 1>``.
    fig_size : int
        Number of features to plot (despite the name, this is a count,
        not a figure size).
    from_st : int
        Index of the first feature to plot.
    """
    n_cols = 5
    # Keep the original 5x5 layout for up to 25 plots; add rows only when more are requested.
    n_rows = max(5, -(-fig_size // n_cols))

    plt.figure(figsize = (20, 20))
    # The subplot slot is counted from 1 independently of the feature index, so a
    # non-zero from_st no longer asks for a slot outside the grid.
    for slot, index in enumerate(range(from_st, from_st + fig_size), start = 1):
        plt.subplot(n_rows, n_cols, slot)
        plt.hist(df.loc[:, f'f_{index}'], bins = 100)
        plt.title(f'f_{index}')

## Build Draw outliers function to visualize the outliers between features and target
We will need it to plot each feature against the target so that the outliers become visible

In [None]:
def draw_outliers(df, features = [], fig_size = (20, 20)):
    """Scatter every listed feature against the ``target`` column.

    Each feature gets its own panel on a 5x5 subplot grid, titled with the
    feature name and drawn without axis ticks.
    """
    plt.figure(figsize = fig_size)
    slot = 0
    for name in features:
        slot += 1
        plt.subplot(5, 5, slot)
        plt.scatter(df[name], df['target'])
        # hide the ticks on both axes
        plt.xticks([])
        plt.yticks([])
        plt.title(name)

## Detect outliers: extract outliers based on a multiple of the std that you choose
This function flags the values that lie further than `fraction` standard deviations from the feature mean (`fraction` is 70 by default, but in our case it will be 35)

In [None]:
def detect_outliers(df, featuers, fraction = 70):
    """Find rows whose value lies more than ``fraction`` standard deviations from a column's mean.

    Parameters
    ----------
    df : DataFrame
        Data to scan.
    featuers : iterable of str
        Column names to check. (The misspelled name is kept so existing
        keyword callers keep working.)
    fraction : float
        Multiplier applied to the column's standard deviation; values outside
        ``mean +/- fraction * std`` count as outliers.

    Returns
    -------
    (outliers_list, outliers_col)
        ``outliers_list`` holds the index labels of the outlier rows (a row is
        repeated if it is an outlier in several columns); ``outliers_col``
        holds the columns that contributed outliers.
    """
    outliers_list = [] # index labels of the outlier records
    outliers_col = [] # features that contain outliers

    # Iterate over the columns passed in by the caller; the previous version
    # read a global named `features` and ignored this argument.
    for col in featuers:
        mean = df[col].mean()
        std = df[col].std()
        # values outside the band on either side of the mean
        out = df[(df[col] > mean + std * fraction) |
                 (df[col] < mean - std * fraction)
                ]

        # only record a feature when it has between 1 and 9 outlier records
        if 0 < len(out) < 10:
            out_list = out.index.to_list()
            outliers_list.extend(out_list) # save the outlier records
            outliers_col.append(col) # mark this feature as containing outliers

            print(f'{col}: {len(out)}')

    print(f'# of features: {len(outliers_col)}')
    print(f'ouliers records: {len(outliers_list)}') # print number of outliers

    return outliers_list, outliers_col

## Reduce Memory method (not mine) to shrink the train data in memory as much as possible
This method is not mine; it comes from a Kaggler whom I need to thank for this powerful and simple method, which reduces the dataset memory by at least 60% in most cases

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and report the saving.

    * signed integer columns -> the smallest of int8/int16/int32/int64 whose
      range strictly contains the column's min and max
    * float columns -> float16 or float32 when the value range fits, otherwise
      float64 (note that float16 keeps far fewer significant digits)
    * object columns -> ``category``
    * every other dtype (bool, unsigned, datetime, categorical, extension
      dtypes) is left untouched; previously such columns were pushed through
      the float branch (bool became float16) or raised on ``min()``.

    Parameters
    ----------
    df : DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    DataFrame
        The same ``df`` object, for convenience.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (categorical, nullable, string, ...) are not plain
        # NumPy dtypes and are not candidates for numeric downcasting.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # range does not fit a smaller float (or is all-NaN)
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem:
        # guard the percentage against a zero-byte frame
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# Load the dataset
For memory efficiency we will use the compressed (parquet) version of this competition's dataset, which reduces the train data from 18GB to 3GB

In [None]:
# directory that holds the parquet version of the competition data
path = '../input/ubiquant-parquet/'

print('Reading the train data....')
train = pd.read_parquet(f'{path}train.parquet')
print('Done.')

In [None]:
# preview the first rows to check which columns were loaded
train.head()

## Data Exploratory

### show describe method
In the **describe()** output, the mean and std rows show that some features have a std far from 1 and a mean far from 0

In [None]:
# summary statistics (including mean and std) for the numeric columns
train.describe()

### Search for null values

In [None]:
# number of missing values in each column
train.isnull().sum()

### Draw features to detect Normal distribution of features

In [None]:
# histograms of the first 20 features (f_0 .. f_19)
draw_hist_features(df = train, fig_size = 20)

As we see, "f_1, 2, 5, 6 and 9" are nearly normally distributed 

but "f_0, 3, 4, 8, 10, 11, 12" are far from normally distributed, so we need to scatter-plot some of these features to check whether they contain outliers

- if the outliers are only a few records, we will drop them so that they do not affect what the model learns and to avoid **misleading** it

### Use the draw outliers function to inspect a few features

In [None]:
# scatter a handful of features against the target to look for outliers visually
draw_outliers(train, features = ['f_3', 'f_4', 'f_8', 'f_6'])

### Get the feature columns
Get all features except
* row_id
* time_id
* target

In [None]:
%%time 
features = train.columns.to_list()
features = [f for f in features if f not in ['row_id', 'time_id', 'target']]  # remove features "time_id, row_id, target"

### Remove outliers beyond mean ± 35 * std (only for features with fewer than 10 such records)

In [None]:
%%time
# scan every feature column for values more than 35 standard deviations away from the column mean
outliers_list, outliers_col = detect_outliers(train, features, fraction = 35)  # fraction multiplies the std; it is not a percentage

As we see, there are **36** features containing outliers that lie outside the mean ± 35 × std range, which gives **102** outlier records

We need to remove these outliers

* Before dropping the outliers, let's see how they affect the distribution of the features

In [None]:
%%time
# scatter four features against the target while the outlier rows are still present
draw_outliers(train, features=['f_12', 'f_214', 'f_295', 'f_165'])


Let's drop these outliers to see how they affect the distribution of features

In [None]:
%%time
print(f'Before: {len(train)}')
train.drop(train.index[outliers_list], inplace = True)
print(f'After: {len(train)}')

In [None]:
%time
draw_outliers(train, features=['f_12', 'f_214', 'f_295', 'f_165'])


As we see, the distribution of the features became much better after removing the outliers 

- You are free to choose the fraction used to remove the outliers; I tried std*35, but changing the fraction to another value may detect more outliers 

## Reduce train memory
This method is credited to [GUILLAUME MARTIN ](https://www.kaggle.com/gemartin/load-data-reduce-memory-usage); it's very useful for reducing memory usage 

By the way, he took the inspiration for this method from [this notebook](https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65)

Thanks [GUILLAUME](https://www.kaggle.com/gemartin) 

In [None]:
%%time

# downcast the columns to smaller dtypes; the returned frame is bound back to `train`
train = reduce_mem_usage(train) 

**WOW**, this method reduces the memory on a huge scale; give it a try, it's really useful

# Save the dataset as Pickle files
I added this step because the notebook ran out of memory while training the model and I had to restart all over again, so to avoid repeating the steps above I saved the dataset as a Pickle file

If the notebook runs out of memory the next time, just load the saved data, start from the cell that reloads the pickle and skip the steps above

**Why didn't I save it in parquet format rather than Pickle?!!**
* Hmmm, nice question: because parquet doesn't support float16 data, and reduce_mem_usage() transformed the float32 and float64 columns to **float16** (and the same goes for **int16**), so I have to either save it as Pickle or cast the data back to float32 and then save it in parquet format, as you like

In [None]:
# save the train set as a pickle file (not parquet)
train.to_pickle('final_train.pickle')

#### *Memory crashed?!* start from here next time
**Get the train set again by starting from this cell if the notebook runs out of memory**

If the notebook runs out of memory again for the 1000th time with you, "as happened to me", begin from here

In [None]:
# reload the preprocessed train set written by train.to_pickle('final_train.pickle')
train = pd.read_pickle('final_train.pickle')

### Get target from train

In [None]:
# keep the label column aside, then remove it from the feature frame
target = train.loc[:, 'target']
train.drop(columns = ['target'], inplace = True)

target

### Drop time_id, row_id from train

In [None]:
# the identifier columns are not used as model inputs
train.drop(columns = ['row_id', 'time_id'], inplace = True)
train.head()

## Remove Correlated Features more than 0.80
To avoid misleading the model, we need to find the features that have more than 80% correlation and remove one of them

This step also saves computation

In [None]:
%%time 
# pairwise correlation between the columns that are left in train
corr_f = train.corr()
corr_f.head()

In [None]:
# heatmap of the correlation matrix
sns.heatmap(corr_f)

* Find the features that have a correlation of more than 0.80 with another feature

In [None]:
%%time
f_len = len(features)
exclude = []

for i in range(f_len):
    for j in range(f_len):
        if corr_f.iloc[i, j] > .80 and i !=  j: # remove any feature of them
            exclude.append(features[j])

exclude = set(exclude) # remove duplicated feature name

In [None]:
# number of distinct features marked for removal
len(set(exclude))

Remove the features that have a correlation of more than .80

In [None]:
# report the column count before and after removing the excluded features
print(f'train len, Before: {train.shape[1]}')
train.drop(columns = list(exclude), inplace = True)
print(f'train len, After: {train.shape[1]}')

As we see, the number of train features dropped from **301** to **270**, which means **31** features had a correlation greater than 80% with other existing features

# Split valid and train data

In [None]:
# hold out 5000 rows for validation; the seed is passed explicitly so re-running this
# cell reproduces the same split instead of depending on the global random state
x_train, x_valid, y_train, y_valid = train_test_split(train, target, test_size = 5000, random_state = seed)

x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

## Release some variables we no longer need to reduce memory usage
Don't be too optimistic: it will not reduce the memory that much :)

In [None]:
%%time
outliers_col = None
outliers_list = None
features = None
target = [[]]
train = [[]]

# Build the Model

## 1- LGBM

In [None]:
%%time

import lightgbm as lgbm

lgbm_reg = lgbm.LGBMRegressor(
        objective="regression",
        metric="rmse",
        n_estimators=100)

lgbm_reg.fit(train, target) # (x_train, y_train)

In [None]:
# score of the LGBM model on the held-out validation rows
lgbm_reg.score(x_valid, y_valid)

We can also use Pool, which runs the work on a custom number of CPUs

Kaggle Notebooks have 4 CPUs, so you can use up to 400% CPU

* this step is meant to give more processing power and reduce memory

In [None]:
# NOTE(review): kept disabled on purpose. Pool.map calls the function once per item of the
# iterable, so this would run lgbm_reg.fit(x_train) and lgbm_reg.fit(y_train) separately
# instead of lgbm_reg.fit(x_train, y_train).
# %%time
# with Pool(2) as pool:  # use 2 CPUs 
#     result = pool.map(lgbm_reg.fit, (x_train, y_train))

## 2 - MLPRegressor

In [None]:
%%time
from sklearn.neural_network import MLPRegressor

mlp_reg = MLPRegressor(solver='adam',
                        hidden_layer_sizes=(128, 128),
                        activation='relu',
                        verbose=True,
                        warm_start=True)

mlp_reg.partial_fit(train, target)

In [None]:
# score of the MLP model on the held-out validation rows
mlp_reg.score(x_valid, y_valid)

In [None]:


# MLP predictions for the validation rows
y_pred = mlp_reg.predict(x_valid)

# Pearson correlation between the predictions and the true validation targets
pearsonr(y_pred, y_valid)

# Prediction and Submission

In [None]:
# NOTE(review): `ubiquant` is presumably the competition-provided API module — confirm it is
# available in the runtime; iter_test is the iterator consumed by the prediction loop
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test()

In [None]:
# Select exactly the columns the model was trained on, taken from x_train, instead of
# relying on the `features` variable defined earlier in the notebook.
model_features = x_train.columns.to_list()

for (test_df, sample_prediction_df) in iter_test:

    test_df = test_df[model_features]
    y_pred  = lgbm_reg.predict(test_df)
    sample_prediction_df["target"] = y_pred

    display(test_df)
    display(sample_prediction_df)

    # hand the filled-in predictions for this batch back to the environment
    env.predict(sample_prediction_df)

**If it helps, I hope you will upvote. Thanks ❤️**