# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class DataAnalyzer:
    """Load a CSV file into a DataFrame and prepare it for model training.

    Attributes:
        data: The working DataFrame. ``preprocess`` replaces it with the
            cleaned and scaled frame.
        preprocessed_data: ``None`` until ``preprocess`` has run, then the
            same frame as ``data``.
        scaler: The ``StandardScaler`` fitted by ``preprocess``.
    """

    def __init__(self, data_path):
        """Read the CSV at *data_path* and create an unfitted scaler.

        Args:
            data_path: Anything ``pandas.read_csv`` accepts as its first
                argument.
        """
        self.data = pd.read_csv(data_path)
        self.preprocessed_data = None
        self.scaler = StandardScaler()

    def preprocess(self):
        """Impute, de-duplicate and standardize ``self.data``.

        Steps, in order:

        1. Missing values in numeric columns are replaced by that column's
           mean. Non-numeric columns are left untouched.
        2. Fully duplicated rows are dropped (the first occurrence is kept).
        3. Every numeric column is scaled with ``self.scaler``. Note that
           this includes a numeric target column, if one is present.

        The result is stored in both ``self.data`` and
        ``self.preprocessed_data``.

        Returns:
            The preprocessed DataFrame.
        """
        frame = self.data
        numerical_cols = frame.select_dtypes(include=[np.number]).columns
        has_numeric = len(numerical_cols) > 0

        # Average only the numeric columns: DataFrame.mean() over text
        # columns raises TypeError on pandas >= 2.0.
        if has_numeric:
            frame = frame.fillna(frame[numerical_cols].mean())

        frame = frame.drop_duplicates()

        # StandardScaler rejects input with zero features or zero samples,
        # so there is nothing to scale in either case.
        if has_numeric and not frame.empty:
            frame[numerical_cols] = self.scaler.fit_transform(
                frame[numerical_cols])

        self.data = frame
        self.preprocessed_data = frame
        return frame

    def split_data(self, target_column, test_size=0.2):
        """Split ``self.data`` into train and test features and target.

        Args:
            target_column: Name of the column to predict. It is removed
                from the feature frame.
            test_size: Passed through to ``train_test_split``; defaults
                to 0.2.

        Returns:
            The result of ``train_test_split(X, y, ...)``, i.e.
            ``X_train, X_test, y_train, y_test``. The split uses a fixed
            ``random_state`` of 42, so it is reproducible.

        Raises:
            KeyError: If *target_column* is not a column of ``self.data``.
        """
        X = self.data.drop(target_column, axis=1)
        y = self.data[target_column]
        return train_test_split(X, y, test_size=test_size, random_state=42)
