### Common Function Notebook

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import make_column_transformer, TransformedTargetRegressor, make_column_selector, ColumnTransformer
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SequentialFeatureSelector,SelectFromModel
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
import category_encoders as ce
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import os
from sklearn import set_config
# Ask scikit-learn to display estimators as diagrams in notebook output.
set_config(display="diagram")


# Suppress every warning emitted after this point (applies process-wide).
warnings.filterwarnings('ignore')

In [9]:

# Print the percentage of null values in each feature (column).
def feature_null_percentage_in_data(df):
    """Print, per column, the percentage of null entries rounded to 2 decimals.

    Args:
        df: DataFrame to inspect.
    """
    row_count = df.shape[0]
    null_counts = df.isnull().sum()
    null_percent = null_counts / row_count * 100
    print(round(null_percent, 2))

def cleaned_data_percent(df):
    """Print the percentage of rows that ``dropna()`` would remove from *df*.

    Args:
        df: DataFrame to inspect.
    """
    total_rows = df.shape[0]
    rows_without_nulls = df.dropna().shape[0]
    removed_rows = total_rows - rows_without_nulls
    print((removed_rows / total_rows) * 100)

def feature_selection_method(pipeline):
    """Fit *pipeline* on the training split and report errors and coefficients.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The pipeline must contain steps named ``'ridge'`` and
    ``'selector'``.

    Args:
        pipeline: Pipeline to fit; it is fitted in place.

    Returns:
        DataFrame with columns ``feature`` (names reported by the selector
        step) and ``coef`` (coefficients of the ridge step).
    """
    pipeline.fit(X_train, y_train)

    mse_on_train = mean_squared_error(y_train, pipeline.predict(X_train))
    print('train_mse : ' , mse_on_train)

    mse_on_test = mean_squared_error(y_test, pipeline.predict(X_test))
    print('test_mse : ', mse_on_test)

    print(pipeline.score(X_test, y_test))

    steps = pipeline.named_steps
    coefficients = steps['ridge'].coef_
    selected_names = steps['selector'].get_feature_names_out()
    print(coefficients)
    print(selected_names)

    return pd.DataFrame({'feature': selected_names, 'coef': coefficients})

def getFigTitle(fig, title):
    """Build the caption for the next figure.

    Args:
        fig: Number of the previous figure.
        title: Caption text.

    Returns:
        Tuple ``(caption, next_fig)`` where ``next_fig`` is ``fig + 1`` and
        the caption embeds that number.
    """
    next_fig = fig + 1
    caption = f'Fig{next_fig} : {title}'
    return caption, next_fig

def valuecount_percentages(feature_series):
    """Print a table of value counts and their percentage of the series.

    Percentages are relative to ``feature_series.size`` and are printed with
    three decimals.

    Args:
        feature_series: Series whose distinct values are tallied.
    """
    counts = feature_series.value_counts()
    total = feature_series.size
    print('Name             Counts          Percents')
    # Iterate label/count pairs directly instead of re-indexing the series
    # for every row.
    for label, count in counts.items():
        print(f"{label:<15}  "
              f"{count:<15} "
              f"{(count / total) * 100:.3f}")




In [10]:
### Common Functions for Data Processing

def count_encoder(org_df, categorical_features):
    """Count-encode the categorical columns and scale the resulting frame.

    Uses ``ce.CountEncoder`` for the encoding and the notebook-level
    ``scaler`` for the final ``fit_transform``. Prints the shape of the
    merged frame before the helper ``index`` column is removed.

    Args:
        org_df: Source DataFrame.
        categorical_features: Columns of *org_df* to encode.

    Returns:
        DataFrame of scaled values whose columns are the non-categorical
        columns followed by the encoded ones.
    """
    counter = ce.CountEncoder()
    counted = counter.fit_transform(org_df[categorical_features])
    counted_df = pd.DataFrame(counted, columns=counter.get_feature_names_out())

    remaining = org_df.drop(columns=categorical_features)
    combined = remaining.merge(
        counted_df, how='inner', left_index=True, right_index=True
    ).reset_index()
    print(combined.shape)

    # reset_index() materialised the old index as an 'index' column.
    combined = combined.drop(columns=['index'])
    scaled_values = scaler.fit_transform(combined)
    return pd.DataFrame(scaled_values, columns=combined.columns)

def run_price_correlation(df):
    """Correlate every column of *df* with its ``price`` column.

    Args:
        df: DataFrame containing a ``price`` column.

    Returns:
        Series of correlations sorted from highest to lowest.
    """
    price = df["price"]
    correlations = df.corrwith(price)
    return correlations.sort_values(ascending=False)

def onehot_encoder(org_df, categorical_features):
    """One-hot encode the categorical columns and scale the resulting frame.

    Uses the notebook-level ``scaler`` for the final ``fit_transform``.
    Prints the shape of the encoded columns and of the merged frame (the
    latter still includes the helper ``index`` column).

    Args:
        org_df: Source DataFrame.
        categorical_features: Columns of *org_df* to encode.

    Returns:
        DataFrame of scaled values containing the non-categorical columns
        followed by the one-hot columns.
    """
    encoder = OneHotEncoder(sparse_output=False)
    encoded_features = encoder.fit_transform(org_df[categorical_features])
    # Carry org_df's index onto the encoded rows. Without it the encoded
    # frame gets a fresh 0..n-1 index, and the index-based inner join below
    # drops or mis-pairs rows whenever org_df's index is not 0..n-1
    # (for example after dropna() or filtering).
    encoded_df = pd.DataFrame(
        encoded_features,
        columns=encoder.get_feature_names_out(),
        index=org_df.index,
    )
    print(encoded_df.shape)

    used_cars_df_encoded = org_df.merge(
        encoded_df, how='inner', left_index=True, right_index=True
    ).reset_index()
    print(used_cars_df_encoded.shape)

    # Remove the column created by reset_index() and the raw categoricals.
    used_cars_df_encoded = used_cars_df_encoded.drop(columns=['index'])
    used_cars_df_encoded = used_cars_df_encoded.drop(columns=categorical_features)
    return pd.DataFrame(
        scaler.fit_transform(used_cars_df_encoded),
        columns=used_cars_df_encoded.columns,
    )

def getX_Y(df):
    """Split *df* into features and the ``price`` target.

    Args:
        df: DataFrame containing a ``price`` column.

    Returns:
        Tuple ``(X, y)``: *df* without ``price``, and the ``price`` column.
    """
    target = df['price']
    features = df.drop(columns='price')
    return features, target

def get_cat_features(df):
    """Print the numerical and categorical column names of *df*.

    Columns of dtype ``int64``/``float64`` are reported as numerical and
    columns of dtype ``object`` as categorical.

    Args:
        df: DataFrame to inspect.

    Returns:
        Index holding the names of the ``object``-dtype columns.
    """
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    object_columns = df.select_dtypes(include=['object']).columns
    print('Numerical Features = ', numeric_columns)
    print('Cateorical Features = ', object_columns)
    return object_columns
def convert_cat_to_codes(df):
    """Return a copy of *df* with ``object`` columns replaced by category codes.

    Each ``object``-dtype column is converted to ``category`` dtype and then
    to its integer codes. The input frame is left untouched.

    Args:
        df: DataFrame to convert.

    Returns:
        New DataFrame with the same columns.
    """
    coded = df.copy()
    object_columns = [
        name for name in coded.columns if coded[name].dtype == 'object'
    ]
    for name in object_columns:
        coded[name] = coded[name].astype('category').cat.codes

    return coded

# Component names collected by the latest variance_comp_count() call.
pca_names = []


def variance_comp_count(arr_var, ratio):
    """Fill ``pca_names`` with one name per leading value not above *ratio*.

    ``pca_names`` is emptied first. Values of *arr_var* are visited in order;
    each one is printed next to *ratio*, and scanning stops at the first
    value greater than *ratio* (that value gets no name).

    Args:
        arr_var: Iterable of values to compare against *ratio*.
        ratio: Threshold.
    """
    pca_names.clear()
    for position, cumulative in enumerate(arr_var):
        print(f'{ratio}, {cumulative}')
        if cumulative > ratio:
            break
        pca_names.append(f'pca{position}')



In [11]:
## Common functions for Pipeline Processing

def reset_globals():
    """Reset every module-level result accumulator to a fresh empty list.

    Rebinds ``models``, ``train_mses``, ``test_mses``, ``train_scores``,
    ``test_scores``, ``train_perms``, ``test_perms``, ``model_coefs`` and
    ``results``, then prints ``models`` and ``results`` as confirmation.
    """
    # Without these declarations the assignments below would only create
    # function locals and the shared lists would never be reset.
    global models, train_mses, test_mses, train_scores, test_scores
    global train_perms, test_perms, model_coefs, results

    models = []
    train_mses = []
    test_mses = []
    train_scores = []
    test_scores = []
    train_perms = []
    test_perms = []
    model_coefs = []
    results = []
    print('Models = ', models)
    print('Result = ', results)


def dump_df():
    """Tabulate the module-level metric lists, one row per recorded model.

    Returns:
        DataFrame with columns ``Models``, ``Train MSE``, ``Test MSE``,
        ``Train Score`` and ``Test Score``.
    """
    columns = {
        'Models': models,
        'Train MSE': train_mses,
        'Test MSE': test_mses,
        'Train Score': train_scores,
        'Test Score': test_scores,
    }
    return pd.DataFrame(columns)

def dump_results():
    """Tabulate the module-level ``results`` list, one row per recorded run.

    Returns:
        DataFrame with columns ``model``, ``train_mse``, ``test_mse``,
        ``train_score`` and ``test_score``; the columns are present even
        when no run has been recorded yet.
    """
    cols = ['model', 'train_mse', 'test_mse', 'train_score', 'test_score']
    # Passing the column list fixes the schema and its order, so an empty
    # `results` still yields a frame with the expected columns.
    return pd.DataFrame(results, columns=cols)

def dump_coefs_df():
    """Tabulate the recorded coefficients, one row per model.

    Reads the module-level ``model_coefs`` and ``models`` lists and the
    notebook-level ``X_train``.

    Returns:
        DataFrame indexed by model name with one column per ``X_train``
        column.
    """
    feature_columns = X_train.columns
    return pd.DataFrame(data=model_coefs, index=models, columns=feature_columns)

def pipeline_proces_and_holdout(pipe, model_name):
    """Fit *pipe* on the training split and record hold-out metrics.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. On success, appends to the module-level ``models``,
    ``train_mses``, ``test_mses``, ``train_scores``, ``test_scores``,
    ``results`` and ``model_coefs`` lists. The pipeline must contain a step
    named ``'model'`` that exposes ``coef_``.

    Args:
        pipe: Pipeline to fit; it is fitted in place.
        model_name: Label for the run; recorded as ``'<model_name>-ho'``.
    """
    print(f'==================== RUNNING {model_name}=================================')
    run_label = f'{model_name}-ho'

    pipe.fit(X_train, y_train)
    train_mse = round(mean_squared_error(y_train, pipe.predict(X_train)), 5)
    test_mse = round(mean_squared_error(y_test, pipe.predict(X_test)), 5)
    train_score = pipe.score(X_train, y_train)
    test_score = pipe.score(X_test, y_test)

    result = {'model' :run_label, 'train_mse' : train_mse, 'test_mse' : test_mse, 'train_score' :train_score, 'test_score' :test_score}
    print(result)
    coefficients = pipe.named_steps['model'].coef_
    print(coefficients)

    # Record only once every step above has succeeded. Appending as we go
    # would leave the parallel lists with different lengths after a failed
    # fit/predict, which breaks dump_df() and dump_coefs_df().
    models.append(run_label)
    train_mses.append(train_mse)
    test_mses.append(test_mse)
    train_scores.append(train_score)
    test_scores.append(test_score)
    results.append(result)
    model_coefs.append(coefficients)
    print('==================== DONE =================================================')

def pipeline_proces_and_kfold(pipe, model_name):
    """Cross-validate *pipe* with 3-fold CV and record hold-out metrics.

    Runs ``GridSearchCV`` with an empty parameter grid (so *pipe* is
    evaluated as configured) using a shuffled 3-split ``KFold`` and ``r2``
    scoring, then scores the search's ``best_estimator_`` on the
    notebook-level train and test splits. On success, appends to the
    module-level ``models``, ``train_mses``, ``test_mses``,
    ``train_scores``, ``test_scores`` and ``results`` lists.

    NOTE: nothing is appended to ``model_coefs`` here, while ``models`` does
    grow, so ``dump_coefs_df()`` cannot align its rows after a k-fold run.

    Args:
        pipe: Estimator to cross-validate.
        model_name: Label for the run; recorded as ``'<model_name>-kfold'``.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    print(f'==================== RUNNING {model_name}=================================')
    run_label = f'{model_name}-kfold'

    kfold = KFold(n_splits=3, random_state=1, shuffle=True)
    selector_grid = GridSearchCV(pipe, {}, scoring='r2', cv=kfold, verbose=2)
    selector_grid.fit(X_train, y_train)
    best_estimator = selector_grid.best_estimator_

    train_mse = round(mean_squared_error(y_train, best_estimator.predict(X_train)), 5)
    test_mse = round(mean_squared_error(y_test, best_estimator.predict(X_test)), 5)
    train_score = best_estimator.score(X_train, y_train)
    test_score = best_estimator.score(X_test, y_test)

    result = {'model' :run_label, 'train_mse' : train_mse, 'test_mse' : test_mse, 'train_score' :train_score, 'test_score' :test_score}
    print(result)

    # Record only once every step above has succeeded, so a failed search
    # cannot leave the parallel lists with different lengths.
    models.append(run_label)
    train_mses.append(train_mse)
    test_mses.append(test_mse)
    train_scores.append(train_score)
    test_scores.append(test_score)
    results.append(result)
    print('=====================================================================')
    return selector_grid

def grid_search_for_best_stuff(pipe, params, X, y):
    """Run a grid search over *params* and unpack the winning estimator.

    Args:
        pipe: Pipeline to tune; it must contain a step named ``'model'``.
        params: Parameter grid handed to ``GridSearchCV``.
        X: Feature data used for the search.
        y: Target data used for the search.

    Returns:
        Tuple ``(grid, best_estimator, best_model, best_params)`` where
        ``best_model`` is the ``'model'`` step of the best estimator and
        ``best_params`` is the best estimator's ``get_params()`` dict.
    """
    search = GridSearchCV(pipe, param_grid=params, verbose=2)
    search.fit(X, y)

    winner = search.best_estimator_
    winning_model = winner.named_steps['model']
    return search, winner, winning_model, winner.get_params()

def gs_for_number_of_features(pipe, feature_select):
    """Grid-search the selector's feature count, then record a hold-out run.

    Searches on the notebook-level ``X_train``/``y_train``, applies the best
    ``n_features_to_select`` to *pipe*'s ``'selector'`` step and records the
    tuned pipeline via ``pipeline_proces_and_holdout``.

    Args:
        pipe: Pipeline containing ``'selector'`` and ``'model'`` steps.
        feature_select: Parameter grid for the search.

    Returns:
        Tuple ``(grid, best_estimator, best_model, params)`` as returned by
        ``grid_search_for_best_stuff``.
    """
    grid, best_estimator, best_model, params = grid_search_for_best_stuff(pipe, feature_select, X_train, y_train)
    # `params` comes from the pipeline's get_params(), whose nested keys are
    # prefixed with the step name. The bare 'n_features_to_select' key is
    # never present, so looking it up always yields None.
    best_feature_count = params.get('selector__n_features_to_select')
    print('best_n_features_to_select = ',best_feature_count)
    pipe.set_params(selector__n_features_to_select=best_feature_count)
    pipeline_proces_and_holdout(pipe, f'select_fcount-{best_feature_count}')
    return grid, best_estimator, best_model, params

def gs_for_best_alpha(pipe, ridge_param_dict):
    """Grid-search ``model__alpha``, then record a hold-out run with it.

    Searches on the notebook-level ``X_train``/``y_train``, rounds the best
    alpha to two decimals, applies it to *pipe*'s ``'model'`` step and
    records the tuned pipeline via ``pipeline_proces_and_holdout``.

    Args:
        pipe: Pipeline containing a ``'model'`` step with an ``alpha``
            parameter.
        ridge_param_dict: Parameter grid for the search.

    Returns:
        The best alpha, rounded to two decimals.
    """
    print('=========== Searching for best alpha ====================')
    _, _, _, tuned_params = grid_search_for_best_stuff(
        pipe, ridge_param_dict, X_train, y_train
    )
    # NOTE(review): rounding to 2 decimals turns any alpha below 0.005 into
    # 0.0, so the refit can use a different alpha than the search selected.
    best_alpha = round(tuned_params.get('model__alpha'), 2)
    print('best_alpha = ',best_alpha)

    pipe.set_params(model__alpha=best_alpha)
    pipeline_proces_and_holdout(pipe, f'best_alpha-{best_alpha}')
    return best_alpha

def getColumnNames(fromName, encoded_names):
    """Map positional feature names back to ``X_train`` column names.

    Each encoded name is expected to be one prefix character followed by an
    integer column position (for example ``'x3'``); the position is looked
    up in the notebook-level ``X_train.columns``.

    Args:
        fromName: Unused; kept so existing calls keep working.
        encoded_names: Sequence of encoded feature names.

    Returns:
        List of the matching ``X_train`` column names, in input order.
    """
    # The original body read an undefined name `fnames` and ignored this
    # parameter, so it raised NameError unless such a global happened to exist.
    return [X_train.columns[int(name[1:])] for name in encoded_names]

def dump_hyper_params(hyper_params, best_model_params):
    """Print ``name=value`` for each requested hyperparameter.

    Args:
        hyper_params: Iterable of parameter names to report.
        best_model_params: Mapping of parameter names to values.

    Returns:
        The value of the first name in *hyper_params*, or ``None`` when
        *hyper_params* is empty.
    """
    # The original returned from inside the loop, so only the first
    # hyperparameter was ever printed. Print them all, and keep returning
    # the first value so existing callers see the same result.
    reported = [(name, best_model_params.get(name)) for name in hyper_params]
    for name, value in reported:
        print(f'{name}={value}')
    return reported[0][1] if reported else None

def selected_columns_list(cols_arr ,selected_list):
    """Return the entries of *cols_arr* whose flag in *selected_list* is truthy.

    Args:
        cols_arr: Sequence of column names.
        selected_list: Sequence of flags, indexed in step with *cols_arr*.

    Returns:
        List of the selected names, in their original order.
    """
    return [
        column
        for position, column in enumerate(cols_arr)
        if selected_list[position]
    ]

def pipeline_factory(transformer):
    """Build the family of regression pipelines used by the notebook.

    Every pipeline starts with the given *transformer* (the same instance is
    placed in all of them) followed by its own ``StandardScaler``. Several
    estimators read the notebook-level ``rs`` as their ``random_state``.

    Args:
        transformer: First step of every pipeline.

    Returns:
        Tuple ``(linear, ridge, lasso, fs, complex, ms, fs_l)``:
        plain linear regression without intercept; ridge; lasso; sequential
        selection with linear regression; sequential selection with lasso
        feeding ridge; ``SelectFromModel`` with lasso feeding linear
        regression; sequential selection with lasso feeding linear regression.
    """
    def assemble(*tail_steps):
        # Shared prefix: the caller's transformer plus a fresh scaler.
        return Pipeline([
            ('transformer', transformer),
            ('scaler', StandardScaler()),
            *tail_steps,
        ])

    linear = assemble(
        ('model', LinearRegression(fit_intercept=False)),
    )
    ridge = assemble(
        ('model', Ridge()),
    )
    lasso = assemble(
        ('model', Lasso(random_state=rs)),
    )
    fs = assemble(
        ('selector', SequentialFeatureSelector(LinearRegression(fit_intercept=False))),
        ('model', LinearRegression()),
    )
    fs_l = assemble(
        ('selector', SequentialFeatureSelector(Lasso(random_state=42))),
        ('model', LinearRegression()),
    )
    lasso_selected_ridge = assemble(
        ('selector', SequentialFeatureSelector(Lasso(random_state=42))),
        ('model', Ridge(random_state=rs)),
    )
    ms = assemble(
        ('selector', SelectFromModel(Lasso(random_state=rs))),
        ('model', LinearRegression()),
    )
    return linear, ridge, lasso, fs, lasso_selected_ridge, ms, fs_l

def dump_feature_imp(pipeline):
    """Print the clearly positive permutation importances on the test split.

    Computes permutation importance (``r2`` scoring, 30 repeats) on the
    notebook-level ``X_test``/``y_test`` and prints, from most to least
    important, each feature whose mean importance exceeds twice its
    standard deviation.

    Args:
        pipeline: Fitted estimator to evaluate.
    """
    r = permutation_importance(pipeline, X_test, y_test, random_state=22, n_repeats=30, scoring='r2')
    # Importances are indexed by X_test's columns, so take the names from
    # X_test rather than from an unrelated global frame `X`.
    feature_names = X_test.columns
    for i in r.importances_mean.argsort()[::-1]:
        if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
            print(f"{feature_names[i]:<18}  "
                  f"{r.importances_mean[i]:.3f} "
                  f" +/- {r.importances_std[i]:.3f}")

