<a href="https://colab.research.google.com/github/reejungkim/Pipeline/blob/main/prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# import libraries

In [47]:
import warnings
warnings.filterwarnings("ignore")

# Load up packages
import os
import pandas as pd
import numpy as np
import sklearn

#visualization lisb
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_palette('Set2')

#split data set
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split,  cross_val_score

#preprocessing libs
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
#label encoders
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

#Models
#classifiers
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
#regressors
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor


#model evaluation libs
#classifier evaluation
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#regressor evaluation
from sklearn.metrics import r2_score, mean_squared_error

#number of features optimization
from sklearn.model_selection import StratifiedKFold

#Hyper-parameter optimization
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


# Load data set

In [18]:
# Load data
#df = pd.read_csv('bankruptcy_Train.csv')

df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


# Checking for nulls and data types

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
 10  test     53940 non-null  object  
dtypes: category(3), float64(6), int64(1), object(1)
memory usage: 3.4+ MB


In [24]:
# checking for sparse columns 
#(df.isnull().sum()/ len(df).sort_values(ascending=False)[:30]
df_null = pd.DataFrame((df.isnull().sum()/ len(df)), columns={"%missing"})
df_null.loc[df_null['%missing']>=.7]

Unnamed: 0,%missing


should check if the sparse columns refers to zero or n/a before dropping the columns
 

# Checking columns with small variations

In [22]:
df_no_var = df.loc[:, df.columns!='class'].std().sort_values(ascending=True)
df_no_var.loc[df_no_var<=0.1]

Series([], dtype: float64)

In [10]:
#df.drop(['Attr1', 'Attr2'], axis=1)

# Imputer - Categorical columns

In [50]:
#catogory type
for col in ['clarity']:
  df[col]= df[col].cat.add_categories("F").fillna("F")

In [40]:
#replace null objects with 'None'
for col in ['color']:
    df[col] = df[col].fillna("none")

ValueError: ignored

In [41]:
cols_obj = df.select_dtypes(include=['object']).columns
# Impute null objects with mode
for col in cols_obj:
    #df[col] = df[col].fillna(df[col].mode()[0])
    df[col].fillna(df[col].mode()[0], inplace=True)

# Imputer - Numeric Columns

In [30]:
# imputer on columns where null value refers to zero instead of n/a
df[['depth','table']] =  df[['depth','table']].fillna(0)

In [29]:
cols_number = df.select_dtypes(include=['int64', 'float64']).columns
# Null values of rest of numerical types variables would better be replaced by median of neighborhood group
for col in cols_number:
    df[col] = df.groupby('color')[col].transform( lambda x: x.fillna(x.median()))

In [15]:
#Custom Transformer that fills missing ages
class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        self.age_means_ = {}

    def fit(self, X, y=None):
        self.age_means_ = X.groupby(['Pclass', 'Sex']).Age.mean()

        return self

    def transform(self, X, y=None):
        # fill Age
        for key, value in self.age_means_.items():
            X.loc[((np.isnan(X["Age"])) & (X.Pclass == key[0]) & (X.Sex == key[1])), 'Age'] = value

        return X

NameError: ignored

# Vectorization

In [None]:
for col in cols_obj:
    print('{}: {}'.format(col, df[col].unique()  ))

In [None]:
#Encoding object type variables using label encoder
for col in cols_obj:
    df[col] = le.fit_transform(df[col])

# Applying Scaler 

In [None]:
#choose scaler
scaler = RobustScaler()

In [None]:
#scaler application
scaled_X = scaler.fit_transform(all_data.loc[:, all_data.columns!='SalePrice'].reset_index(drop=True))
scaled_Y = scaler.fit_transform( all_data[0:1460][['SalePrice']] ) 

In [7]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

# Pipeline

In [46]:
make_column_transformer(
    (StandardScaler(), ['carat', 'depth']),
    (OneHotEncoder(), ['cut']),
    remainder='passthrough'
)

ColumnTransformer(remainder='passthrough',
                  transformers=[('standardscaler', StandardScaler(),
                                 ['carat', 'depth']),
                                ('onehotencoder', OneHotEncoder(), ['cut'])])

In [45]:
num_transformer = Pipeline(
    steps = [("imputer", SimpleImputer(strategy="median")),
             ("scaler", StandardScaler())]
)

In [44]:
sklearn.set_config(display="diamgram")


NameError: ignored

# Evaluation 

**Regression**
* R-Squared
* RMSE : Root Mean Squared Error [0- infinity]
* MAE : Mean Absolute Error [0- infinity]

**Classification**
* Accuracy
*Precision
*Recall
*Specificity
*F1 Score


* R-squared : how well the predictions approximate the  ground truth [0–100%]
*RMSE and MAE are more interpretable than R-squared and Using the RMSE is more appropriate if the data have a lot of outliers.
*Accuracy : This is a metric that is best used for a balanced data set
*Precision : This metric that’s be used to avoid false positive predict (observation is negative, but is predicted positive)
*Recall / Sensitivity : This metric that’s be used to avoid false negative (Observation is positive, but is predicted negative) and maximize True positive rate.
*Specificity : This metric is used to maximize True negative rate and minimize True positive rate.
*F1 Score : This metric that’s be used to minimize False positive and False negative (일반적으로 크래스 불균형 문제가 있을경우에는 정확도와 정밀도가 높고 재현율이 크게 떨어지는 경향이있음. 따서 정확도, 정밀도, 재현율 가운데 하나의 척도만 사용한다면 모델의 성능을 객관적으로 판단하기 어려움. 그서 정밀도와 재현율의 조화 평균인 F1점수를 사용하는것이 좋다.)




# Evaluatin metrics for regressor models

In [16]:
f1_score(y_test, y_pred, average='micro')

score = mean_absolute_error(y_test, y_pred)
print('MAE:', score)

mse = mean_squared_error(y_test, y_pred)
print('MSE:', mse)

rmse = np.sqrt(mse)
print('RMSE:', rmse)

NameError: ignored

# Evaluation metrics for classifier models

In [None]:
pd.DataFrame(confusion_matrix(y_test, y_pred), 
             columns = ['predicted Negative', 'Predicted Positive'],
             index = ['Actual Negative', 'Actual Positive'])

In [None]:
print(classification_report(y_test, y_pred))