In [14]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load your dataset
data = pd.read_csv("train.csv")
data = data.drop("UID", axis=1)
# Define columns by data type: numeric, 4-category, 3-category, >50-category
numeric_cols = ['col_4']
four_cat_cols = ['col_0', 'col_1']
three_cat_cols = ['col_6']
high_card_cols = ['col_2', 'col_3', 'col_5']

In [15]:
# Define preprocessing steps for different column types
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2))  # Adjust the number of components as needed
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with the most frequent category
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Handle unknown categories gracefully
])

In [16]:
# Combine preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('4cat', categorical_transformer, four_cat_cols + three_cat_cols),
        ('high_card', 'passthrough', high_card_cols)  # No preprocessing for high cardinality columns
    ])

In [17]:
# Create a pipeline that includes preprocessing and the linear regression model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', LinearRegression())])

In [18]:
# Split the data into training and testing sets
X = data.drop(['y'], axis=1)
y = data['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
X_train

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6
1208,A1,B0,C53,D48,50,E1,F0
2355,A0,B0,C7,D1,0,E1,F2
482,A0,BO,C27,D1,0,E1,F2
1199,A0,B0,C11,D1,100,E1,F0
438,A 0,B0,C2,D1,0,E1,F2
...,...,...,...,...,...,...,...
1638,A0,B0,C19,D1,100,E1,F2
1095,A0,B0,C4,D1,0,E1,F2
1130,A2,B0,C11,D1,0,E1,F2
1294,A3,B0,C11,D1,100,E1,F2


In [20]:
# Fit the model
pipeline.fit(X_train, y_train)

ValueError: n_components=2 must be between 0 and min(n_samples, n_features)=1 with svd_solver='full'

In [None]:

# Make predictions
y_pred = pipeline.predict(X_test)

# Calculate RMSE (Root Mean Squared Error) as an example evaluation metric
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score

# Load your dataset
# Replace 'your_dataset.csv' with your dataset file
df = pd.read_csv('train.csv')

# Split the data into features (X) and target (y)
X = df.drop(['UID', 'y'], axis=1)  # Replace 'TargetColumn' with your target variable
y = df['y']

# Define categorical and numerical columns
categorical_cols = ['col_0', 'col_1', 'col_2', 'col_3', 'col_5','col_6']
numerical_cols = ['col_4']

# Preprocessing pipeline for numerical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # You can choose a different strategy
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # You can choose a different strategy
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Column Transformer to apply the appropriate preprocessing to each column
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create and train the linear regression model within a pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

Mean Squared Error: 2756790759.53
R-squared: 0.39


In [22]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature Engineering (e.g., adding polynomial features)
X_train_poly = np.column_stack([X_train_scaled, X_train_scaled**2])  # Example: adding squared features
X_test_poly = np.column_stack([X_test_scaled, X_test_scaled**2])

# Re-fit the model with the engineered features
model.fit(X_train_poly, y_train)

# Make predictions with the updated model
y_pred_poly = model.predict(X_test_poly)

# Calculate R-squared and MSE after feature engineering
r2_poly = r2_score(y_test, y_pred_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)

print(f"Improved R-squared (with feature engineering): {r2_poly:.2f}")
print(f"Improved MSE (with feature engineering): {mse_poly:.2f}")

ValueError: could not convert string to float: 'A1'

In [42]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

def preprocess_data(df, high_cardinality_cols, low_cardinality_cols, target_col):
    """
    Preprocesses the data for linear regression.

    Parameters:
    - df: DataFrame containing the data.
    - high_cardinality_cols: List of column names with high cardinality (more than 50 classes).
    - low_cardinality_cols: List of column names with low cardinality (3-4 classes).
    - target_col: Name of the target column.

    Returns:
    - Preprocessed DataFrame.
    """

    # Separate features and target
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # Preprocessing pipelines for high-cardinality and low-cardinality features
    high_cardinality_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    low_cardinality_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine transformers using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('high_cardinality', high_cardinality_transformer, high_cardinality_cols),
            ('low_cardinality', low_cardinality_transformer, low_cardinality_cols)
        ],
        remainder='passthrough'  # Pass through numerical features
    )

    # Create a pipeline that includes preprocessing and scaling
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler(with_mean=False))  # Scale numerical features
    ])

    # Fit and transform the data
    X_processed = pipeline.fit_transform(X)
    print(X_processed.shape)
    print(pipeline.named_steps['preprocessor'].get_feature_names_out(X.columns))
    # Create a DataFrame with the processed data and column names
    processed_df = pd.DataFrame(X_processed, columns=X.columns)

    # Concatenate target column back to the DataFrame
    processed_df[target_col] = y

    return processed_df

In [43]:
df['y']

0       237000
1        86193
2       169200
3        58000
4       235000
         ...  
2623    102100
2624    129300
2625    275300
2626    150000
2627    191475
Name: y, Length: 2628, dtype: int64

In [41]:
high_cardinality_cols = ['col_2', 'col_3', 'col_5']
low_cardinality_cols = ['col_0', 'col_1', 'col_4', 'col_6']
target_col = 'y'
df_preprocessed = preprocess_data(df, high_cardinality_cols, low_cardinality_cols, target_col)

(2628, 268)


ValueError: Shape of passed values is (2628, 1), indices imply (2628, 7)