# Data Scaling & Regularization & Feature Engineering

Hi Guys, Welcome to [Tirendaz Academy](https://youtube.com/c/tirendazacademy) 😀
</br>
In this notebook, I'm going to talk about data scaling & regularization & feature engineering.
</br>
Happy learning 🐱‍🏍 

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("wine.data", header = None)

In [3]:
df.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 
                   'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 
                   'Color intensity',  'Hue', 'OD280/OD315 of diluted wines', 'Proline']

In [4]:
df.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [5]:
np.unique(df["Class label"])

array([1, 2, 3], dtype=int64)

In [7]:
X = df.iloc[:, 1:].values
y = df.iloc[:,0].values

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, 
                                                   random_state=0, stratify=y)

## Data Scaling

In [10]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
mmscaler = MinMaxScaler()

In [12]:
X_train_norm = mmscaler.fit_transform(X_train)

In [14]:
X_test_norm = mmscaler.transform(X_test)

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
stdscaler = StandardScaler()

In [17]:
X_train_std = stdscaler.fit_transform(X_train)

In [18]:
X_test_std = stdscaler.transform(X_test)

## Regularization

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
lr = LogisticRegression(penalty = "l1", C=1, solver = "liblinear", multi_class="ovr")

In [21]:
lr.fit(X_train_std, y_train)

LogisticRegression(C=1, multi_class='ovr', penalty='l1', solver='liblinear')

In [22]:
lr.score(X_train_std, y_train)

1.0

In [23]:
lr.score(X_test_std, y_test)

1.0

In [24]:
lr.intercept_

array([-1.30028225, -1.36967165, -2.32914549])

In [25]:
lr.coef_

array([[ 1.22048919,  0.17773414,  0.71659728, -1.20294953,  0.        ,
         0.        ,  1.35081277,  0.        ,  0.        ,  0.        ,
         0.        ,  0.39706703,  2.60884505],
       [-1.38425411, -0.42688779, -0.93814321,  0.44953585, -0.01736337,
         0.11305157,  0.59294145,  0.        ,  0.        , -1.94826858,
         1.09550011,  0.        , -2.56880533],
       [ 0.21255908,  0.08580109,  0.49563871,  0.        ,  0.        ,
         0.        , -2.55402992,  0.        ,  0.        ,  1.64983458,
        -1.03248307, -0.36849188,  0.        ]])

In [26]:
lr.coef_[lr.coef_ != 0].shape

(24,)

In [27]:
X_data = X_train_std[:1,]

In [28]:
lr.predict(X_data)

array([1], dtype=int64)

In [29]:
y_train[:1]

array([1], dtype=int64)

## Feature Engineering

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
forest = RandomForestClassifier(n_estimators=500, random_state=1)

In [32]:
feat_labels = df.columns[1:]

In [33]:
forest.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500, random_state=1)

In [34]:
importances = forest.feature_importances_

In [35]:
indices = np.argsort(importances)

In [36]:
for f in range(X_train.shape[1]):
    print(f"{f+1}", feat_labels[indices[f]],
         importances[indices[f]])

1 Nonflavanoid phenols 0.012992399256212534
2 Ash 0.014381677761740612
3 Proanthocyanins 0.022483463782451273
4 Alcalinity of ash 0.025900122601886087
5 Malic acid 0.028867963800009225
6 Magnesium 0.031922184349841815
7 Total phenols 0.050824811977803086
8 Hue 0.07259491714547278
9 Alcohol 0.11479722353879414
10 OD280/OD315 of diluted wines 0.12223110107587146
11 Color intensity 0.14907323774360948
12 Flavanoids 0.15967892986778753
13 Proline 0.19425196709852016


In [37]:
from sklearn.feature_selection import SelectFromModel

In [38]:
selector = SelectFromModel(forest, threshold=0.1, prefit=True)

In [39]:
X_selected = selector.transform(X_train)

In [40]:
for f in range(X_selected.shape[1]):
    print(f"{f+1}", feat_labels[indices[f]],
         importances[indices[f]])

1 Nonflavanoid phenols 0.012992399256212534
2 Ash 0.014381677761740612
3 Proanthocyanins 0.022483463782451273
4 Alcalinity of ash 0.025900122601886087
5 Malic acid 0.028867963800009225


Don't forget to follow us on [YouTube](http://youtube.com/tirendazacademy) | [Medium](http://tirendazacademy.medium.com) | [Twitter](http://twitter.com/tirendazacademy) | [GitHub](http://github.com/tirendazacademy) | [Linkedin](https://www.linkedin.com/in/tirendaz-academy) | [Kaggle](https://www.kaggle.com/tirendazacademy) 😎