In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Loading dependencies

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Let pandas show up to 140 columns before truncating wide frames
# (presumably enough to display every column of this dataset; TODO confirm).
pd.options.display.max_columns = 140

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression

### Loading clean data

In [None]:
# Read the cleaned dataset from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer='../input/jane-street-data-prep/clean.csv')

# Remember the complete column index, taken before any columns are dropped.
columns = df.columns

# Preview the first rows.
df.head(n=5)

In [None]:
# Display the column index captured right after loading `df`.
columns

In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Disabled: would rebind `df` to an empty DataFrame
# (presumably to release memory; TODO confirm or delete this cell).
# df = pd.DataFrame()

Optionally keep only a fraction of the data. How can we select one in every 3 rows? (The commented-out cell below sketches one way: `df[df.index % 3 == 0]`.)

In [None]:
# Disabled sub-sampling: would keep only the rows whose index label is a multiple of 3.
# train = df[df.index % 3 == 0]

### Split into target and feature dataframes

In [None]:
# Names of the target columns: 'resp' followed by 'resp_1' ... 'resp_4'.
target_cols = ['resp', *(f'resp_{k}' for k in range(1, 5))]

In [None]:
# Every column that is not a target is treated as a feature.
features = df.drop(columns=target_cols)

# Keep the feature names; they label the coefficients in the plots further down.
features_cols = features.columns

features.head(n=5)

In [None]:
# Disabled: would make the sub-sample `train` the working frame `df`
# and then rebind `train` to an empty DataFrame.
# NOTE(review): `features` has already been derived from `df` at this point, so
# enabling these lines here would only affect what is built from `df` afterwards.
# df = train
# train = pd.DataFrame()

In [None]:
# Select the target columns through `target_cols`, the same list that is used
# to drop them from `features`, so the two frames can never disagree about
# which columns are targets. Column order follows `target_cols`.
targets = df[target_cols]
targets.head()

### Are there missing values?

In [None]:
# Total number of missing cells: per-column counts first, then summed over columns.
df.isna().sum().sum()

## Scaling the data

In [None]:
# Scaling is currently disabled: both lines are commented out, so the models
# below are fit on `features` / `targets` exactly as loaded.
# NOTE(review): if re-enabled, `fit_transform` returns NumPy arrays, so
# `features` and `targets` would no longer be DataFrames after this cell.
# features = StandardScaler().fit_transform(features)
# targets = StandardScaler().fit_transform(targets)

### Baseline 1: LASSO regression

In [None]:
# L1-regularised linear model, fit on all target columns at once.
lasso = Lasso(alpha=0.1)
lasso.fit(X=features, y=targets)

In [None]:
# Show the fitted parameters, one item per line.
print(
    f'Coefficients: {lasso.coef_}',
    f'Intercept: {lasso.intercept_}',
    sep='\n',
)

In [None]:
# In-sample score: evaluated on the same data the model was fit on.
# previous score (with unscaled complete data) below 1e-15
lasso_score = lasso.score(features, targets)
print(f'Score: {lasso_score}')

In [None]:
# For each target (one row of `coef_` per target column), print the indices of
# the features whose coefficient is not exactly zero. Iterating over the rows
# avoids hard-coding the number of targets.
for target_coefs in lasso.coef_:
    print(np.where(target_coefs != 0))

# Values of all non-zero coefficients, flattened across targets.
print(f'Non-zero coefficients: {lasso.coef_[lasso.coef_ != 0]}')

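Conclusion: with `alpha = 0.1`, LASSO regression is of little use here (the score is essentially zero). Why? Is it because there is too much data, or possibly because the penalty is too strong relative to the scale of the targets, so that every coefficient is shrunk to zero? To be checked with smaller values of `alpha`.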

### Baseline 2: classic regression

In [None]:
# Ordinary least squares, fit on all target columns at once.
ols = LinearRegression()
ols.fit(X=features, y=targets)

In [None]:
# Show the fitted parameters, one item per line.
print(
    f'Coefficients: {ols.coef_}',
    f'Intercept: {ols.intercept_}',
    sep='\n',
)

In [None]:
# In-sample score: evaluated on the same data the model was fit on.
# previous score of ca. 0.00661 (complete, unscaled data), now 0.00664
ols_score = ols.score(features, targets)
print(f'Score: {ols_score}')

In [None]:
# Histogram of the OLS coefficients in row 0 of `coef_` (the first target column).
fig, ax = plt.subplots(figsize=(15, 6))
ax.hist(ols.coef_[0], bins=133)
plt.show()

In [None]:
# Number of columns in the full frame (features and targets together).
columns.shape

In [None]:
# Number of feature columns, i.e. after the target columns were dropped.
features_cols.shape

In [None]:
# checking the magnitude of coefficients

# Row 0 of `coef_` belongs to the first target column. Label each coefficient
# with its feature name and sort ascending so the extremes sit at the edges.
coef = pd.Series(ols.coef_[0], index=features_cols).sort_values()

# The title names the model so this figure can be told apart from the ridge one.
fig, ax = plt.subplots(figsize=(15, 6))
coef.plot(kind='bar', ax=ax, title='Model Coefficients (OLS)')
plt.show()

Investigate which coefficients have a higher magnitude. **Do the same work after scaling the data.**

### Baseline 3: ridge regression

In [None]:
# L2-regularised linear model, fit on all target columns at once.
# `fit` returns the estimator itself, so construction and fitting are chained.
ridge = Ridge(alpha=0.05).fit(features, targets)

# Show the fitted parameters, one item per line.
print(
    f'Coefficients: {ridge.coef_}',
    f'Intercept: {ridge.intercept_}',
    sep='\n',
)

In [None]:
# In-sample score: evaluated on the same data the model was fit on.
# previous score of ca. 0.00661 (complete, unscaled data)
ridge_score = ridge.score(features, targets)
print(f'Score: {ridge_score}')

In [None]:
# Histogram of the ridge coefficients in row 0 of `coef_` (the first target column).
fig, ax = plt.subplots(figsize=(15, 6))
ax.hist(ridge.coef_[0], bins=133)
plt.show()

In [None]:
# checking the magnitude of coefficients

# Row 0 of `coef_` belongs to the first target column. Label each coefficient
# with its feature name and sort ascending so the extremes sit at the edges.
coef = pd.Series(ridge.coef_[0], index=features_cols).sort_values()

# The title names the model so this figure can be told apart from the OLS one.
fig, ax = plt.subplots(figsize=(15, 6))
coef.plot(kind='bar', ax=ax, title='Model Coefficients (Ridge)')
plt.show()