### Common Function Notebook

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import make_column_transformer, TransformedTargetRegressor, make_column_selector, ColumnTransformer
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SequentialFeatureSelector,SelectFromModel
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
import category_encoders as ce
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import os
from sklearn import set_config
# Ask scikit-learn to display estimators/pipelines as diagrams.
set_config(display="diagram")


# Install a blanket "ignore" filter for every warning category.
# NOTE(review): this also hides deprecation and convergence warnings -
# confirm that silencing all of them is intended.
warnings.filterwarnings('ignore')

In [25]:

# Report how much of each feature is missing.
def feature_null_percentage_in_data(df):
    """Print the percentage of null values in every column of ``df``.

    Args:
        df: pandas DataFrame to inspect.

    The per-column percentages are rounded to two decimals and printed as
    a Series indexed by column name. Nothing is returned.
    """
    row_count = df.shape[0]
    null_counts = df.isnull().sum()
    null_percent = null_counts / row_count * 100
    print(round(null_percent, 2))

def cleaned_data_percent(df):
    """Print the percentage of rows that ``dropna()`` would remove from ``df``.

    Args:
        df: pandas DataFrame to inspect.

    The value printed is ``(rows with at least one null) / (all rows) * 100``.
    An empty frame prints ``0.0`` instead of raising ``ZeroDivisionError``.
    Nothing is returned.
    """
    total_rows = df.shape[0]
    if total_rows == 0:
        # Nothing can be dropped from an empty frame; avoid dividing by zero.
        print(0.0)
        return
    dropped_rows = total_rows - df.dropna().shape[0]
    print((dropped_rows / total_rows) * 100)

def feature_selection_method(pipeline):
    """Fit ``pipeline`` and report its errors, score and selected coefficients.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` variables, and looks up the pipeline steps named
    ``'ridge'`` and ``'selector'``.

    Args:
        pipeline: object exposing ``fit``, ``predict``, ``score`` and
            ``named_steps``.

    Returns:
        DataFrame with a ``feature`` column (names reported by the
        ``'selector'`` step) and a ``coef`` column (``coef_`` of the
        ``'ridge'`` step).
    """
    pipeline.fit(X_train, y_train)

    # Mean squared error on the data the pipeline was fitted on.
    train_predictions = pipeline.predict(X_train)
    train_mse = mean_squared_error(y_train, train_predictions)
    print('train_mse : ' , train_mse)

    # Mean squared error on the held-out data.
    test_predictions = pipeline.predict(X_test)
    test_mse = mean_squared_error(y_test, test_predictions)
    print('test_mse : ', test_mse)

    print(pipeline.score(X_test, y_test))

    steps = pipeline.named_steps
    coefficients = steps['ridge'].coef_
    selected_names = steps['selector'].get_feature_names_out()
    print(coefficients)
    print(selected_names)
    return pd.DataFrame({'feature': selected_names, 'coef': coefficients})

def getFigTitle(fig, title):
    """Build the caption for the next figure.

    Args:
        fig: number of the most recently used figure.
        title: text to place after the figure number.

    Returns:
        Tuple ``(caption, next_fig)`` where ``next_fig`` is ``fig + 1`` and
        ``caption`` is ``'Fig<next_fig> : <title>'``.
    """
    next_fig = fig + 1
    caption = f'Fig{next_fig} : {title}'
    return caption, next_fig

def valuecount_percentages(feature_series):
    """Print a table of value counts and their share of ``feature_series``.

    Args:
        feature_series: pandas Series whose distinct values are tallied.

    One row is printed per distinct value, in ``value_counts()`` order,
    with the count and the percentage (three decimals) of the series'
    total size that the value represents. Nothing is returned.
    """
    # The denominator is the full series size, as reported by ``.size``.
    total = feature_series.size
    print('Name             Counts          Percents')
    for item, count in feature_series.value_counts().items():
        print(f"{item:<15}  "
              f"{count:<15} "
              f"{(count / total) * 100:.3f}")




In [26]:
### Common Functions for Data Processing

def count_encoder(org_df, categorical_features):
    """Count-encode the categorical columns and standard-scale the result.

    Args:
        org_df: source DataFrame.
        categorical_features: column labels to encode with
            ``ce.CountEncoder``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the remaining
        columns of ``org_df`` plus the encoded columns, all passed through
        ``StandardScaler``.
    """
    encoder = ce.CountEncoder()
    encoded_features = encoder.fit_transform(org_df[categorical_features])
    encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

    remaining_df = org_df.drop(columns=categorical_features)
    merged_df = remaining_df.merge(encoded_df, how='inner', left_index=True, right_index=True)
    # drop=True discards the old index outright. Materialising it as a column
    # and deleting it by the name 'index' breaks when the index is named or
    # when the frame already has a column called 'index'.
    merged_df = merged_df.reset_index(drop=True)

    scaled_values = StandardScaler().fit_transform(merged_df)
    return pd.DataFrame(scaled_values, columns=merged_df.columns)

def run_price_correlation(df):
    """Return each column's correlation with ``price``, highest first.

    Args:
        df: DataFrame containing a ``price`` column.

    Returns:
        Series indexed by column name, sorted in descending order.
    """
    price_column = df["price"]
    correlations = df.corrwith(price_column)
    return correlations.sort_values(ascending=False)

def onehot_encoder(org_df, categorical_features):
    """One-hot encode the categorical columns and standard-scale the result.

    Args:
        org_df: source DataFrame.
        categorical_features: column labels to expand with ``OneHotEncoder``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the remaining
        columns of ``org_df`` plus the one-hot columns, all passed through
        ``StandardScaler``.

    Side effects:
        Prints the shape of the merged frame (original columns plus one-hot
        columns) before the categorical columns are removed.
    """
    encoder = OneHotEncoder(sparse_output=False)
    encoded_features = encoder.fit_transform(org_df[categorical_features])
    encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

    # encoded_df is built from a plain array and therefore carries a fresh
    # 0..n-1 index. Give the source frame the same positional index so the
    # index merge pairs row i with row i even when org_df's own index has
    # gaps, duplicates or a name (e.g. after dropna()).
    positional_df = org_df.reset_index(drop=True)
    merged_df = positional_df.merge(encoded_df, how='inner', left_index=True, right_index=True)
    print(merged_df.shape)

    merged_df = merged_df.drop(columns=categorical_features)
    scaled_values = StandardScaler().fit_transform(merged_df)
    return pd.DataFrame(scaled_values, columns=merged_df.columns)

def getX_Y(df):
    """Split ``df`` into features and the ``price`` target.

    Args:
        df: DataFrame containing a ``price`` column.

    Returns:
        Tuple ``(X, y)``: ``X`` is ``df`` without the ``price`` column and
        ``y`` is the ``price`` column itself.
    """
    features = df.drop(columns='price')
    target = df['price']
    return features, target

def get_cat_features(df):
    """Return the labels of the ``object``-dtype columns of ``df``.

    Args:
        df: DataFrame to inspect.

    Returns:
        pandas Index with the names of the columns selected by
        ``select_dtypes(include=['object'])``.
    """
    return df.select_dtypes(include=['object']).columns
def convert_cat_to_codes(df):
    """Return a copy of ``df`` with object columns replaced by category codes.

    Args:
        df: DataFrame to convert; it is not modified.

    Returns:
        A copy of ``df`` in which every column whose dtype compares equal to
        ``'object'`` has been cast to ``category`` and replaced by its
        integer ``cat.codes``. Other columns are left as they are.
    """
    encoded = df.copy()
    object_columns = [name for name in encoded.columns
                      if encoded[name].dtype == 'object']
    for name in object_columns:
        encoded[name] = encoded[name].astype('category').cat.codes
    return encoded

def variance_comp_count(arr_var, ratio):
    """Name the leading components whose cumulative value stays within ``ratio``.

    Args:
        arr_var: iterable of cumulative values, one per component, in
            component order. Presumably cumulative explained variance -
            verify against the caller.
        ratio: threshold, on the same scale as ``arr_var`` (both fractions
            or both percentages).

    Returns:
        A new list ``['pca0', 'pca1', ...]`` with one name per leading
        element of ``arr_var`` that does not exceed ``ratio``. Iteration
        stops at the first element greater than ``ratio``, and that element
        is not included. Empty input gives an empty list.

    Side effects:
        Prints ``ratio`` and the cumulative value for each element examined.
    """
    # Built locally so each call starts from an empty list.
    pca_names = []
    for i, cumratio in enumerate(arr_var):
        print(f'{ratio}, {cumratio}')
        # Compare the raw value: truncating to int would turn fractional
        # ratios such as 0.97 into 0, so the threshold could never be crossed.
        if ratio < cumratio:
            break
        pca_names.append(f'pca{i}')
    return pca_names

