In [53]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, RobustScaler
from sklearn.utils.extmath import randomized_svd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer


In [54]:
# Location of the exported parts dataset.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable project data directory so the notebook runs elsewhere.
PARTS_DATA_PATH = '/Users/skylerwilson/Desktop/Lighthouse_Labs/Projects/final_project/data/Project_Data/parts_data_functions.csv'
parts_data = pd.read_csv(PARTS_DATA_PATH)



In [63]:
# Clean sales data function
def clean_sales_data(df, column_names):
    """Return a copy of ``df`` with the given columns replaced by their absolute values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to clean. It is left unmodified.
    column_names : iterable of str
        Names of the columns whose values should be made non-negative.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns as ``df``; only the listed
        columns differ (each value replaced by its absolute value).

    Raises
    ------
    KeyError
        If any name in ``column_names`` is not a column of ``df``.
    """
    columns = list(column_names)
    # Work on a copy so the caller's frame is not silently mutated; this keeps
    # the notebook cell safe to re-run against the raw data.
    cleaned = df.copy()
    if columns:
        cleaned[columns] = cleaned[columns].abs()
    return cleaned

# Clean quantity data function
def clean_quantity_data(df, quantity_col):
    """Keep only the rows of ``df`` whose ``quantity_col`` value is strictly positive.

    Rows where the value is zero, negative, or missing (NaN) are dropped.
    The surviving rows keep their original index labels.
    """
    is_positive = df[quantity_col].gt(0)
    return df[is_positive]

# Clean turnover data function
def clean_turnover_data(df, turnover_col):
    """Keep only the rows of ``df`` whose ``turnover_col`` value is zero or positive.

    Rows where the value is negative or missing (NaN) are dropped.
    The surviving rows keep their original index labels.
    """
    is_non_negative = df[turnover_col].ge(0)
    return df[is_non_negative]

def z_score(column, threshold=2):
    """Return a boolean mask marking the values of ``column`` that are not outliers.

    A value is kept (``True``) when the absolute value of its z-score,
    ``(value - mean) / std``, is strictly below ``threshold``.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to test.
    threshold : float, default 2
        Maximum absolute z-score (exclusive) for a value to be kept.

    Returns
    -------
    pandas.Series of bool
        Same index as ``column``. Missing values are always ``False``.
    """
    spread = column.std()
    # With zero spread (constant column) or undefined spread (fewer than two
    # values) every z-score is 0/0 or x/NaN, i.e. NaN, and NaN < threshold is
    # False. That would flag the whole column as outliers, so in that case no
    # value is an outlier and only missing entries are rejected.
    if np.isscalar(spread) and (pd.isna(spread) or spread == 0):
        return column.notna()
    z_scores = (column - column.mean()) / spread
    return np.abs(z_scores) < threshold

# Columns passed to clean_sales_data (their values are replaced by absolute values).
sales_data = [
    # Rolling sales windows
    'Sales Last Month',
    'Sales Last 3 Months',
    'Sales Last 6 Months',
    'Sales Last 9 Months',
    'Sales Last 12 Months',
    'Sales Last 2 Years',
    'Sales Last 3 Years',
    'Sales Last 4 Years',
    'Sales Last 5 Years',
    'Sales Last 10 Years',
    # Other columns cleaned the same way
    'Months No Sale',
    'Reorder Point',
    # Per-month sales
    'Sales - Jan',
    'Sales - Feb',
    'Sales - Mar',
    'Sales - Apr',
    'Sales - May',
    'Sales - Jun',
    'Sales - Jul',
    'Sales - Aug',
    'Sales - Sep',
    'Sales - Oct',
    'Sales - Nov',
    'Sales - Dec',
    # Per-quarter sales
    'Sales - 1st Qtr',
    'Sales - 2nd Qtr',
    'Sales - 3rd Qtr',
    'Sales - 4th Qtr',
    # Yearly totals
    'Sales - This Year',
    'Sales - Last Year',
]

# Columns used by the row filters.
quantity_col = 'Quantity'
turnover_col = 'Turnover'

# Names of every numeric column in the loaded frame (captured before any cleaning).
num_cols = parts_data.select_dtypes(include='number').columns

# Apply the cleaning steps in order: absolute-value the sales columns, then
# keep only rows with positive quantity and non-negative turnover.
parts_data = (
    parts_data
    .pipe(clean_sales_data, sales_data)
    .pipe(clean_quantity_data, quantity_col)
    .pipe(clean_turnover_data, turnover_col)
)

# For each numeric column, flag the values z_score accepts; indexing the frame
# with that boolean frame turns every rejected value into NaN.
within_threshold = parts_data[num_cols].apply(z_score)
parts_data[num_cols] = parts_data[within_threshold][num_cols]

# Columns that don't have data yet end up as NaN when they shouldn't, so every
# NaN in the numeric columns (including the values blanked just above) becomes 0.
is_missing = parts_data[num_cols].isna()
parts_data[num_cols] = np.where(is_missing, 0, parts_data[num_cols])



In [64]:
# Feature matrix: every numeric column of the cleaned frame.
X = parts_data.select_dtypes(include='number')

# Shuffled 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test = train_test_split(
    X,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Randomized SVD Transformer
class RandomizedSVDTransformer(BaseEstimator, TransformerMixin):
    """Project data onto the leading right-singular vectors found by randomized SVD.

    ``fit`` runs ``randomized_svd`` on the training matrix and stores the
    factors; ``transform`` multiplies its input by the transposed
    right-singular vectors. No centering or scaling is applied here.

    Parameters
    ----------
    n_components : int
        Number of singular values/vectors to compute; also the number of
        output columns produced by ``transform``.
    random_state : int, RandomState instance or None, default=0
        Seed forwarded to ``randomized_svd`` so repeated fits on the same data
        give the same factors. Pass ``None`` for an unseeded decomposition.

    Attributes
    ----------
    U : ndarray
        Left singular vectors returned by ``randomized_svd`` (set by ``fit``).
    sigma : ndarray
        Singular values returned by ``randomized_svd`` (set by ``fit``).
    VT : ndarray
        Right singular vectors, one per row (set by ``fit``).
    """

    def __init__(self, n_components, random_state=0):
        self.n_components = n_components
        self.random_state = random_state

    def fit(self, X, y=None):
        """Compute the truncated SVD of ``X`` and store its factors.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # randomized_svd is documented for ndarray / sparse-matrix input, so a
        # pandas object is converted first; anything else is passed through
        # untouched (sparse matrices must not go through np.asarray).
        matrix = X.to_numpy() if hasattr(X, 'to_numpy') else X
        U, sigma, VT = randomized_svd(
            matrix,
            n_components=self.n_components,
            random_state=self.random_state,
        )
        self.U = U
        self.sigma = sigma
        self.VT = VT
        return self

    def transform(self, X, y=None):
        """Return ``X`` projected onto the fitted right-singular vectors.

        The result has ``n_components`` columns and is whatever ``X.dot``
        returns for the input type (e.g. a DataFrame for DataFrame input).
        ``fit`` must have been called first, otherwise ``self.VT`` is missing
        and an ``AttributeError`` is raised.
        """
        transformed_data = X.dot(self.VT.T)
        return transformed_data

# Sizes of the two feature views that are combined below.
svd_components = 5  # singular vectors kept by the SVD branch
k = 10  # columns kept by SelectKBest

# Create FeatureUnion combining SVD and SelectKBest: the SVD projection and the
# k best-scoring original columns are concatenated side by side.
# NOTE(review): SelectKBest with f_regression scores each feature against a
# target, so fitting this union needs a `y`; no target is defined in this
# section, and X is built from all numeric columns - confirm before fitting.
features = FeatureUnion(transformer_list=[
    ('svd', RandomizedSVDTransformer(n_components=svd_components)),
    ('select_k_best', SelectKBest(score_func=f_regression, k=k)),
])

# Create the pipeline from the features FeatureUnion: feature extraction
# followed by robust scaling of the combined feature matrix.
pipeline = Pipeline(steps=[
    ('features', features),
    ('scaler', RobustScaler()),
    # ('model', YourModelHere())  # Replace with the appropriate model
])

# Fit and transform on training data
#X_train_scaled = pipeline.fit_transform(X_train)

# Transform on testing data
#X_test_scaled = pipeline.transform(X_test)
