In [1]:
""" Get the number of unique values of each category """
import pandas as pd
# Load the training split whose columns are profiled below.
df = pd.read_csv('my_train.csv')

# Column name -> number of distinct values reported by Series.nunique()
# (which, with its default arguments, does not count NaN).
unique_counts = {column: df[column].nunique() for column in df.columns}

print("Unique value counts for each field:")
for field, count in unique_counts.items():
    print(f"{field}: {count}")


Unique value counts for each field:
Id: 1314
MSSubClass: 15
MSZoning: 5
LotFrontage: 107
LotArea: 989
Street: 2
Alley: 2
LotShape: 4
LandContour: 4
Utilities: 2
LotConfig: 5
LandSlope: 3
Neighborhood: 25
Condition1: 9
Condition2: 8
BldgType: 5
HouseStyle: 8
OverallQual: 10
OverallCond: 9
YearBuilt: 110
YearRemodAdd: 61
RoofStyle: 6
RoofMatl: 8
Exterior1st: 15
Exterior2nd: 16
MasVnrType: 3
MasVnrArea: 304
ExterQual: 4
ExterCond: 5
Foundation: 6
BsmtQual: 4
BsmtCond: 4
BsmtExposure: 4
BsmtFinType1: 6
BsmtFinSF1: 601
BsmtFinType2: 6
BsmtFinSF2: 131
BsmtUnfSF: 730
TotalBsmtSF: 686
Heating: 6
HeatingQC: 4
CentralAir: 2
Electrical: 5
1stFlrSF: 721
2ndFlrSF: 390
LowQualFinSF: 21
GrLivArea: 810
BsmtFullBath: 4
BsmtHalfBath: 3
FullBath: 4
HalfBath: 3
BedroomAbvGr: 8
KitchenAbvGr: 4
KitchenQual: 4
TotRmsAbvGrd: 12
Functional: 7
Fireplaces: 4
FireplaceQu: 5
GarageType: 6
GarageYrBlt: 96
GarageFinish: 3
GarageCars: 5
GarageArea: 422
GarageQual: 5
GarageCond: 5
PavedDrive: 3
WoodDeckSF: 253
OpenPor

In [33]:
# Linear-regression baseline on the raw SalePrice scale, used to inspect which
# features receive the largest positive / negative coefficients.
#
# The cell imports everything it uses and loads its own data so that it runs
# on a fresh kernel instead of relying on names left behind by other cells.
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Same train/dev files that the other cells of this notebook read.
# NOTE(review): assumed to be the data this cell was originally run on -- confirm.
train_data = pd.read_csv('my_train.csv')
dev_data = pd.read_csv('my_dev.csv')

# Identify categorical and numerical columns
categorical_cols = train_data.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('SalePrice')  # Exclude the target variable from the features
# NOTE(review): 'Id' is not removed here, so it is fed to the model as a numeric
# feature, unlike the other cells which drop it. Left unchanged so the recorded
# coefficients below stay valid -- consider dropping it.

# Numerical columns: mean-impute, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())  # Standardize numerical features
])

# Categorical columns: fill NaN with the literal 'missing', then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Processor: numerical block first, categorical block second. The feature-name
# list built further down relies on this order.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

X_train = preprocessor.fit_transform(train_data.drop('SalePrice', axis=1))
X_dev = preprocessor.transform(dev_data.drop('SalePrice', axis=1))

# target (raw prices, no log transform in this cell)
y_train = train_data['SalePrice']
y_dev = dev_data['SalePrice']

model = LinearRegression()
model.fit(X_train, y_train)

# Pair each coefficient with its feature name, in the transformer's output order.
coefficients = model.coef_
onehot_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
all_feature_names = np.concatenate([numerical_cols, onehot_feature_names])

feature_importance = pd.DataFrame({
    'Feature': all_feature_names,
    'Coefficient': coefficients
})

# Sort the features by their coefficients (largest first)
sorted_features = feature_importance.sort_values(by='Coefficient', ascending=False)

# Extract the top 10 most positive and negative features
top_10_positive = sorted_features.head(10)
top_10_negative = sorted_features.tail(10)

print("Top 10 Most Positive Features is:\n", top_10_positive)
print("\nTop 10 Most Negative Features is:\n", top_10_negative)

Top 10 Most Positive Features:
                   Feature    Coefficient
277        PoolQC_missing  265865.782104
128      RoofMatl_Membran  137717.033597
133      RoofMatl_WdShngl  114277.343147
129        RoofMatl_Metal  102092.066192
102       Condition2_PosA   98741.957380
259         GarageQual_Ex   87534.061440
125        RoofStyle_Shed   67446.969683
99      Condition2_Artery   52054.724610
131      RoofMatl_Tar&Grv   45775.149628
87   Neighborhood_StoneBr   44101.310195

Top 10 Most Negative Features:
               Feature    Coefficient
64      LandSlope_Sev  -27170.191823
262     GarageQual_Po  -33800.980324
240    Functional_Sev  -33826.682802
228    Electrical_Mix  -34426.156709
104   Condition2_RRAe  -75258.088116
265     GarageCond_Ex  -82418.172324
276         PoolQC_Gd -126232.159564
275         PoolQC_Fa -153639.689603
103   Condition2_PosN -191625.940796
126  RoofMatl_ClyTile -520189.208076


In [38]:
""" naive approach """
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
# mean_squared_log_error is the metric actually called below; it was previously
# not imported in this cell, which raises NameError on a fresh kernel.
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.pipeline import Pipeline
import numpy as np

train_data_path = 'my_train.csv'
dev_data_path = 'my_dev.csv'
train_data = pd.read_csv(train_data_path)
dev_data = pd.read_csv(dev_data_path)

# Convert all fields to strings so that every column, numeric ones included,
# is treated as categorical and one-hot encoded.
train_data = train_data.astype(str)
dev_data = dev_data.astype(str)

# The model is trained on log(SalePrice); the dev target stays on the price
# scale because the predictions are exponentiated before scoring.
y_train = np.log(train_data['SalePrice'].astype(float))
y_dev = dev_data['SalePrice'].astype(float)

# Dropping the 'Id' column and the target variable from the datasets
X_train = train_data.drop(['SalePrice', 'Id'], axis=1)
X_dev = dev_data.drop(['SalePrice', 'Id'], axis=1)

# Values not seen during fit are encoded as all-zero rows instead of raising.
cat_processor = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

categorical_features = list(X_train.columns)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_processor, categorical_features)
    ])

# Creating a pipeline that first transforms the data and then applies linear regression
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('regressor', LinearRegression())])

model_pipeline.fit(X_train, y_train)
y_dev_pred_log = model_pipeline.predict(X_dev)
y_dev_pred = np.exp(y_dev_pred_log)  # back to the price scale

# Compute the Root Mean Squared Log Error (RMSLE)
rmsle = np.sqrt(mean_squared_log_error(y_dev, y_dev_pred))

print(f"Root Mean Squared Log Error (RMSLE): {rmsle}")


Root Mean Squared Log Error (RMSLE): 0.15233137894779697


In [39]:
# Pull the fitted regressor and the fitted one-hot encoder out of the pipeline.
fitted_steps = model_pipeline.named_steps
coefficients = fitted_steps['regressor'].coef_
feature_names = (
    fitted_steps['preprocessor']
    .named_transformers_['cat']
    .get_feature_names_out(input_features=categorical_features)
)

# Encoded feature name -> regression coefficient.
feature_importance = {name: weight for name, weight in zip(feature_names, coefficients)}

# Ascending by coefficient: most negative first, most positive last.
sorted_features = sorted(feature_importance.items(), key=lambda item: item[1])

# The two ends of the ranking.
top_10_positive = sorted_features[-10:]
top_10_negative = sorted_features[:10]

print("Top 10 most positive features:")
for feature, coeff in top_10_positive[::-1]:  # largest coefficient first
    print(f"{feature}: {coeff}")

print("\nTop 10 most negative features:")
for feature, coeff in top_10_negative:
    print(f"{feature}: {coeff}")


Top 10 most positive features:
FullBath_3: 0.1384868955128779
OverallQual_9: 0.13777419218234832
Neighborhood_StoneBr: 0.12192484382630553
2ndFlrSF_472: 0.1117291881713578
OverallQual_8: 0.10845606473475708
RoofMatl_WdShngl: 0.09399030509045228
GrLivArea_1192: 0.08998259228027972
Neighborhood_NoRidge: 0.08973706153848256
LotArea_8029: 0.08741275605842641
GarageCars_3: 0.08656765942192202

Top 10 most negative features:
MSZoning_C (all): -0.2008037480372122
GrLivArea_968: -0.1275052412263517
EnclosedPorch_236: -0.12296424688865087
OverallQual_3: -0.11659502734479932
LotArea_8281: -0.10903583425995617
BsmtFinSF2_311: -0.10793872586682125
OverallCond_3: -0.10047231859640178
GarageCars_1: -0.0947984976593521
OverallQual_1: -0.08972090034376787
YearRemodAdd_1958: -0.08911884159630307


In [27]:
# Score the Kaggle test file with the already-fitted `model_pipeline` and write
# the submission CSV.
test_data_path = 'test.csv'

# Every field is cast to str, mirroring what the "naive approach" cell does to
# its training data.
test_data = pd.read_csv(test_data_path).astype(str)

# 'Id' is only an identifier, not a feature.
X_test = test_data.drop(columns=['Id'])

# The prediction is exponentiated to get back to the SalePrice scale
# (assumes `model_pipeline` was fitted on log(SalePrice), as the cells above do).
y_test_pred_log = model_pipeline.predict(X_test)
y_test_pred = np.exp(y_test_pred_log)

submission = pd.DataFrame({
    'Id': test_data['Id'].astype(int),
    'SalePrice': y_test_pred,
})

submission_file_path = 'my_submission_new.csv'
submission.to_csv(submission_file_path, index=False)

In [58]:
""" Smarter binarization """

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import Pipeline
import numpy as np

train_data_path = 'my_train.csv'
dev_data_path = 'my_dev.csv'
train_data = pd.read_csv(train_data_path)
dev_data = pd.read_csv(dev_data_path)

# Train on log(SalePrice); keep the dev target on the price scale for scoring.
y_train = np.log(train_data['SalePrice'].astype(float))
y_dev = dev_data['SalePrice'].astype(float)

# Features: everything except the identifier and the target.
X_train = train_data.drop(columns=['SalePrice', 'Id'])
X_dev = dev_data.drop(columns=['SalePrice', 'Id'])

# Split the numeric training columns by whether they contain any NaN.
numeric_part = X_train.select_dtypes(include=[np.number])
has_na = numeric_part.isna().any()
numerical_no_na = numeric_part.columns[~has_na].tolist()
numerical_with_na = numeric_part.columns[has_na].tolist()

# String-typed (object) columns are treated as categorical.
categorical = X_train.select_dtypes(include=[object]).columns.tolist()

# Complete numeric columns: scaling only.
num_processor_no_na = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns: NaN becomes the literal 'missing', then one-hot encoding.
cat_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns with gaps: mean imputation, then scaling.
num_processor_with_na = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Output column order: categorical, complete numeric, imputed numeric.
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_processor, categorical),
    ('num_no_na', num_processor_no_na, numerical_no_na),
    ('num_with_na', num_processor_with_na, numerical_with_na),
])

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on train, predict dev (log scale), then undo the log for scoring.
y_dev_pred_log = model_pipeline.fit(X_train, y_train).predict(X_dev)
y_dev_pred = np.exp(y_dev_pred_log)

rmsle = np.sqrt(mean_squared_log_error(y_dev, y_dev_pred))

print(f"Root Mean Squared Log Error (RMSLE): {rmsle}")



Root Mean Squared Log Error (RMSLE): 0.12409284941384163


In [59]:
# Re-fit the column transformer on the training features and report how many
# columns it produces.
X_train_transformed = preprocessor.fit_transform(X_train)
n_features_out = X_train_transformed.shape[1]
print(f"Total number of features after preprocessing: {n_features_out}")


Total number of features after preprocessing: 302


In [61]:
# Re-fit the pipeline, then read the regressor's coefficients.
model_pipeline.fit(X_train, y_train)
fitted_steps = model_pipeline.named_steps
coefficients = fitted_steps['regressor'].coef_

# Names follow the ColumnTransformer's output order: one-hot encoded
# categoricals first, then the two numeric groups.
cat_feature_names = (
    fitted_steps['preprocessor']
    .named_transformers_['cat']['onehot']
    .get_feature_names_out(categorical)
)
feature_names = np.concatenate([cat_feature_names, numerical_no_na, numerical_with_na])

# (coefficient, name) pairs, largest coefficient first.
feature_importance = list(zip(coefficients, feature_names))
feature_importance.sort(reverse=True)

# Head of the ranking: most positive coefficients.
top_10_positive = feature_importance[:10]
print("Top 10 most positive features:")
for coef, name in top_10_positive:
    print(f"{name}: {coef}")

# Tail of the ranking: most negative coefficients.
top_10_negative = feature_importance[-10:]
print("\nTop 10 most negative features:")
for coef, name in top_10_negative:
    print(f"{name}: {coef}")


Top 10 most positive features:
PoolQC_missing: 0.6484730654621055
RoofMatl_Membran: 0.6156025139091502
RoofMatl_Metal: 0.4778672079085623
Condition2_PosA: 0.441465476230923
RoofStyle_Shed: 0.3280193561429262
RoofMatl_Roll: 0.2876307282108068
GarageQual_Ex: 0.2864310168436186
RoofMatl_WdShngl: 0.2820118951349435
RoofMatl_Tar&Grv: 0.244268997579583
RoofMatl_CompShg: 0.22494321463375203

Top 10 most negative features:
PoolQC_Ex: -0.1865461330682581
Exterior1st_BrkComm: -0.18924145797255412
Functional_Maj2: -0.20583117343247515
Functional_Sev: -0.20882843799231643
GarageCond_Ex: -0.2553870578691189
PoolQC_Fa: -0.3162664812072812
MSZoning_C (all): -0.3337389127677652
Condition2_RRAe: -0.47314646828927226
Condition2_PosN: -0.7333125045673314
RoofMatl_ClyTile: -2.3178679573931675


In [52]:
""" Generate smart binarization submission file to Kaggle"""

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.pipeline import Pipeline

my_train_data_path = 'my_train.csv'
test_data_path = 'test.csv'
train_data = pd.read_csv(my_train_data_path)
test_data = pd.read_csv(test_data_path)

# The model is trained on log(SalePrice); predictions are exponentiated below.
y_train = np.log(train_data['SalePrice'].astype(float))

# Dropping the 'Id' column and the target variable from the training dataset
X_train = train_data.drop(['SalePrice', 'Id'], axis=1)

# Identifying columns: numeric without NaN, numeric with NaN, and string-typed.
numerical_no_na = X_train.select_dtypes(include=[np.number]).dropna(axis=1).columns.tolist()
numerical_with_na = X_train.select_dtypes(include=[np.number]).columns[X_train.select_dtypes(include=[np.number]).isna().any()].tolist()
categorical = X_train.select_dtypes(include=[object]).columns.tolist()

# Complete numeric columns are only scaled; this branch has no imputer.
num_processor_no_na = Pipeline([('scaler', StandardScaler())])
cat_processor = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
num_processor_with_na = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_processor, categorical),
        ('num_no_na', num_processor_no_na, numerical_no_na),
        ('num_with_na', num_processor_with_na, numerical_with_na)
    ])

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

model_pipeline.fit(X_train, y_train)
X_test = test_data.drop(['Id'], axis=1)

# A categorical column that is absent from the test file gets the same
# placeholder value the categorical imputer uses.
for col in categorical:
    if col not in X_test.columns:
        X_test[col] = 'missing'

# Numeric clean-up of the test set: turn +/-inf into NaN, then fill every NaN
# in a numeric column with that column's mean from the training data. This is
# done by hand because the 'num_no_na' branch above does not impute.
# The result is assigned back rather than using chained
# `X_test[col].fillna(..., inplace=True)`, which pandas deprecates and which
# does not modify X_test when Copy-on-Write is enabled.
train_means = X_train.select_dtypes(include=[np.number]).mean()
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(train_means)

# Present the columns in the same order as the training frame.
X_test = X_test[X_train.columns]
y_test_pred_log = model_pipeline.predict(X_test)

# Back from log scale to SalePrice.
y_test_pred = np.exp(y_test_pred_log)

submission = pd.DataFrame({
    'Id': test_data['Id'].astype(int),
    'SalePrice': y_test_pred
})
submission_file_path = 'my_submission_my_train.csv'
submission.to_csv(submission_file_path, index=False)

# Last expression of the cell: displays the path that was written.
submission_file_path


'my_submission_my_train.csv'

In [53]:
""" 4.1 - sb """
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import Pipeline
import numpy as np

train_data_path = 'my_train.csv'
dev_data_path = 'my_dev.csv'
train_data = pd.read_csv(train_data_path)
dev_data = pd.read_csv(dev_data_path)

# Train on log(SalePrice); the dev target stays on the price scale.
y_train = np.log(train_data['SalePrice'].astype(float))
y_dev = dev_data['SalePrice'].astype(float)

X_train = train_data.drop(columns=['SalePrice', 'Id'])
X_dev = dev_data.drop(columns=['SalePrice', 'Id'])

# Numeric training columns, split by whether they contain any NaN.
numeric_part = X_train.select_dtypes(include=[np.number])
has_na = numeric_part.isna().any()
numerical_no_na = numeric_part.columns[~has_na].tolist()
numerical_with_na = numeric_part.columns[has_na].tolist()

# String-typed (object) columns are the categorical features.
categorical = X_train.select_dtypes(include=[object]).columns.tolist()

# Complete numeric columns: scaling only.
num_processor_no_na = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns: 'missing' placeholder for NaN, then one-hot encoding.
cat_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns with gaps: mean imputation, then scaling.
num_processor_with_na = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_processor, categorical),
    ('num_no_na', num_processor_no_na, numerical_no_na),
    ('num_with_na', num_processor_with_na, numerical_with_na),
])

# The regressor step is a placeholder; it is replaced for every alpha below.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge()),
])

# Regularisation strengths to compare on the dev set.
alpha_values = [0.01, 0.1, 1, 10, 100]

best_alpha = None
best_rmsle = float('inf')

for alpha in alpha_values:
    # Swap in a fresh Ridge with this alpha, refit, and score on dev.
    model_pipeline.set_params(regressor=Ridge(alpha=alpha))

    y_dev_pred_log = model_pipeline.fit(X_train, y_train).predict(X_dev)
    y_dev_pred = np.exp(y_dev_pred_log)

    rmsle = np.sqrt(mean_squared_log_error(y_dev, y_dev_pred))

    print(f"Alpha: {alpha}, RMSLE: {rmsle}")
    # Strict '<' keeps the earliest alpha on ties.
    if rmsle < best_rmsle:
        best_alpha, best_rmsle = alpha, rmsle

print(f"Best Alpha: {best_alpha}, Best RMSLE: {best_rmsle}")


Alpha: 0.01, RMSLE: 0.12425552294130057
Alpha: 0.1, RMSLE: 0.12501976868293904
Alpha: 1, RMSLE: 0.12808332371263367
Alpha: 10, RMSLE: 0.1275838451122021
Alpha: 100, RMSLE: 0.12810048975884905
Best Alpha: 0.01, Best RMSLE: 0.12425552294130057


In [54]:
""" 4.1 naive """
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import Pipeline
import numpy as np

train_data_path = 'my_train.csv'
dev_data_path = 'my_dev.csv'

# Read both splits and cast every field to str so that all columns are
# one-hot encoded as categories.
train_data = pd.read_csv(train_data_path).astype(str)
dev_data = pd.read_csv(dev_data_path).astype(str)

# Train on log(SalePrice); the dev target stays on the price scale.
y_train = np.log(train_data['SalePrice'].astype(float))
y_dev = dev_data['SalePrice'].astype(float)

X_train = train_data.drop(columns=['SalePrice', 'Id'])
X_dev = dev_data.drop(columns=['SalePrice', 'Id'])

# Dense one-hot encoding; values unseen during fit encode as all zeros.
cat_processor = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Every remaining column is treated as categorical.
categorical_features = list(X_train.columns)

preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_processor, categorical_features),
])

# Encode, then regress. The regressor step is replaced for every alpha below.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge()),
])

# Regularisation strengths to compare on the dev set.
alpha_values = [0.01, 0.1, 1, 10, 100]

best_alpha = None
best_rmsle = float('inf')

for alpha in alpha_values:
    # Swap in a fresh Ridge with this alpha, refit, and predict on dev.
    model_pipeline.set_params(regressor=Ridge(alpha=alpha))

    y_dev_pred_log = model_pipeline.fit(X_train, y_train).predict(X_dev)
    y_dev_pred = np.exp(y_dev_pred_log)

    rmsle = np.sqrt(mean_squared_log_error(y_dev, y_dev_pred))

    print(f"Alpha: {alpha}, RMSLE: {rmsle}")

    # Strict '<' keeps the earliest alpha on ties.
    if rmsle < best_rmsle:
        best_alpha, best_rmsle = alpha, rmsle

print(f"Best Alpha: {best_alpha}, Best RMSLE: {best_rmsle}")


Alpha: 0.01, RMSLE: 0.15011200318735915
Alpha: 0.1, RMSLE: 0.14472911780389117
Alpha: 1, RMSLE: 0.14060827537615866
Alpha: 10, RMSLE: 0.13946014128405673
Alpha: 100, RMSLE: 0.15430911822131044
Best Alpha: 10, Best RMSLE: 0.13946014128405673


In [84]:
""" PolynomialFeatures """
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

# Load the datasets
train_data_path = 'my_train.csv'
dev_data_path = 'my_dev.csv'
train_data = pd.read_csv(train_data_path)
dev_data = pd.read_csv(dev_data_path)

# Both targets are kept on the natural-log scale in this cell; they are
# exponentiated again right before scoring.
y_train = np.log(train_data['SalePrice'].astype(float))
y_dev = np.log(dev_data['SalePrice'].astype(float))

X_train = train_data.drop(columns=['SalePrice', 'Id'])
X_dev = dev_data.drop(columns=['SalePrice', 'Id'])

numerical_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Numeric columns: mean imputation, then standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: 'missing' placeholder for NaN, then one-hot encoding.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols),
])

# Preprocess -> degree-2 polynomial expansion (no bias column) -> least squares.
polynomial_regression = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('regressor', LinearRegression()),
])

y_dev_pred_log = polynomial_regression.fit(X_train, y_train).predict(X_dev)

# Score on the price scale: undo the log on both the target and the prediction.
rmsle = np.sqrt(mean_squared_log_error(np.exp(y_dev), np.exp(y_dev_pred_log)))

print(f"Root Mean Squared Log Error (RMSLE): {rmsle}")


Root Mean Squared Log Error (RMSLE): 0.14956303869696996


In [69]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.pipeline import Pipeline

my_train_data_path = 'my_train.csv'
test_data_path = 'test.csv'
train_data = pd.read_csv(my_train_data_path)
test_data = pd.read_csv(test_data_path)

# The model is trained on log(SalePrice); predictions are exponentiated below.
y_train = np.log(train_data['SalePrice'].astype(float))

X_train = train_data.drop(['SalePrice', 'Id'], axis=1)

# Identifying columns: numeric without NaN, numeric with NaN, and string-typed.
numerical_no_na = X_train.select_dtypes(include=[np.number]).dropna(axis=1).columns.tolist()
numerical_with_na = X_train.select_dtypes(include=[np.number]).columns[X_train.select_dtypes(include=[np.number]).isna().any()].tolist()
categorical = X_train.select_dtypes(include=[object]).columns.tolist()

# Creating pipelines for data preprocessing.
# Complete numeric columns are only scaled; this branch has no imputer.
num_processor_no_na = Pipeline([('scaler', StandardScaler())])
cat_processor = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
num_processor_with_na = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Creating the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_processor, categorical),
        ('num_no_na', num_processor_no_na, numerical_no_na),
        ('num_with_na', num_processor_with_na, numerical_with_na)
    ])

# Creating a pipeline with PolynomialFeatures (degree 2, no bias column)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('regressor', LinearRegression())
])

model_pipeline.fit(X_train, y_train)
X_test = test_data.drop(['Id'], axis=1)

# A categorical column that is absent from the test file gets the same
# placeholder value the categorical imputer uses.
for col in categorical:
    if col not in X_test.columns:
        X_test[col] = 'missing'

# Numeric clean-up of the test set: turn +/-inf into NaN, then fill every NaN
# in a numeric column with that column's mean from the training data. This is
# done by hand because the 'num_no_na' branch above does not impute.
# The result is assigned back rather than using chained
# `X_test[col].fillna(..., inplace=True)`, which pandas deprecates and which
# does not modify X_test when Copy-on-Write is enabled.
train_means = X_train.select_dtypes(include=[np.number]).mean()
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(train_means)

# Predict on the test set (in logarithmic scale)
y_test_pred_log = model_pipeline.predict(X_test)

# Convert predictions back to SalePrice scale
y_test_pred = np.exp(y_test_pred_log)

# Create a DataFrame for the submission
submission = pd.DataFrame({
    'Id': test_data['Id'].astype(int),
    'SalePrice': y_test_pred
})

submission_file_path = 'my_submission_non_linear.csv'
submission.to_csv(submission_file_path, index=False)

print(f"Submission file saved to {submission_file_path}")


Submission file saved to my_submission_non_linear.csv


In [103]:
""" more:GradientBoostingRegressor """
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import Pipeline
import numpy as np

# Hyper-parameter set that was explored for the booster. It is kept for
# reference but is NOT applied below: the pipeline that is fitted and scored in
# this cell uses the estimator's default hyper-parameters. To try these
# settings, pass **gbr_params to GradientBoostingRegressor.
gbr_params = {
    'n_estimators': 300,  # 50, 100, 200
    'learning_rate': 0.01,  # 0.01, 0.05, 0.1, 0.2
    'max_depth': 50,
    'min_samples_split': 3,
    'min_samples_leaf': 2,
    'max_features': 'log2',  # Can be a fraction (e.g., 0.3) or 'sqrt', 'log2'
    'subsample': 0.9  # Values slightly less than 1 can help with overfitting
}

# Fixed seed so that re-running the cell reproduces the same score.
RANDOM_STATE = 42

# Load the datasets
train_data_path = 'my_train.csv'
dev_data_path = 'my_dev.csv'
train_data = pd.read_csv(train_data_path)
dev_data = pd.read_csv(dev_data_path)

# Extracting the target variable 'SalePrice', converting to float, and taking the logarithm
y_train = np.log(train_data['SalePrice'].astype(float))
y_dev = dev_data['SalePrice'].astype(float)  # Keeping original SalePrice for evaluation

# Dropping the 'Id' column and the target variable from the datasets
X_train = train_data.drop(['SalePrice', 'Id'], axis=1)
X_dev = dev_data.drop(['SalePrice', 'Id'], axis=1)

# Columns with only numerical data and no missing values
numerical_no_na = X_train.select_dtypes(include=[np.number]).dropna(axis=1).columns.tolist()

# Columns with numerical data and NaN values
numerical_with_na = X_train.select_dtypes(include=[np.number]).columns[X_train.select_dtypes(include=[np.number]).isna().any()].tolist()

# Columns with strings (categorical data)
categorical = X_train.select_dtypes(include=[object]).columns.tolist()

# Pipeline for numerical features without missing values
num_processor_no_na = Pipeline([
    ('scaler', StandardScaler())
])

# Pipeline for categorical features
cat_processor = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Pipeline for numerical features with missing values
num_processor_with_na = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Creating the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_processor, categorical),
        ('num_no_na', num_processor_no_na, numerical_no_na),
        ('num_with_na', num_processor_with_na, numerical_with_na)
    ])

# Creating the pipeline once, after `preprocessor` has been defined in this
# cell. An earlier construction at the top of the cell used whatever
# `preprocessor` object was left in the kernel and was immediately overwritten.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=RANDOM_STATE))
])

# Fit the model on the training data
model_pipeline.fit(X_train, y_train)

# Predict on the dev set (log scale) and map back to prices
y_dev_pred_log = model_pipeline.predict(X_dev)
y_dev_pred = np.exp(y_dev_pred_log)

# Compute the RMSLE
rmsle = np.sqrt(mean_squared_log_error(y_dev, y_dev_pred))

print(f"RMSLE: {rmsle}")


RMSLE: 0.13009862823472093


In [108]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import Pipeline

train_data_path = 'my_train.csv'
dev_data_path = 'my_dev.csv'
test_data_path = 'test.csv'  # Replace with the correct path to your test dataset

# Fixed seed so that re-running the cell reproduces the same score and file.
RANDOM_STATE = 42

train_data = pd.read_csv(train_data_path)
dev_data = pd.read_csv(dev_data_path)
test_data = pd.read_csv(test_data_path)

# Extracting the target variable 'SalePrice', converting to float, and taking the logarithm
y_train = np.log(train_data['SalePrice'].astype(float))
y_dev = np.log(dev_data['SalePrice'].astype(float))  # Use log for consistent error metric

# Dropping the 'Id' column and the target variable from the datasets
X_train = train_data.drop(['SalePrice', 'Id'], axis=1)
X_dev = dev_data.drop(['SalePrice', 'Id'], axis=1)

# Identify columns: numeric without NaN, numeric with NaN, and string-typed.
numerical_no_na = X_train.select_dtypes(include=[np.number]).dropna(axis=1).columns.tolist()
numerical_with_na = X_train.select_dtypes(include=[np.number]).columns[X_train.select_dtypes(include=[np.number]).isna().any()].tolist()
categorical = X_train.select_dtypes(include=[object]).columns.tolist()

# Complete numeric columns are only scaled; this branch has no imputer.
num_processor_no_na = Pipeline([
    ('scaler', StandardScaler())
])

cat_processor = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
num_processor_with_na = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Creating the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_processor, categorical),
        ('num_no_na', num_processor_no_na, numerical_no_na),
        ('num_with_na', num_processor_with_na, numerical_with_na)
    ])

# Creating a pipeline with GradientBoostingRegressor (default hyper-parameters)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=RANDOM_STATE))
])

model_pipeline.fit(X_train, y_train)

y_dev_pred_log = model_pipeline.predict(X_dev)
y_dev_pred = np.exp(y_dev_pred_log)

# Compute the RMSLE on the price scale (y_dev is stored as a log, so undo it)
rmsle = np.sqrt(mean_squared_log_error(np.exp(y_dev), y_dev_pred))
print(f"RMSLE on dev set: {rmsle}")

X_test = test_data.drop(['Id'], axis=1)

# A categorical column that is absent from the test file gets the same
# placeholder value the categorical imputer uses.
for col in categorical:
    if col not in X_test.columns:
        X_test[col] = 'missing'

# Numeric clean-up of the test set: turn +/-inf into NaN, then fill every NaN
# in a numeric column with that column's mean from the training data. This is
# done by hand because the 'num_no_na' branch above does not impute.
# The result is assigned back rather than using chained
# `X_test[col].fillna(..., inplace=True)`, which pandas deprecates and which
# does not modify X_test when Copy-on-Write is enabled.
train_means = X_train.select_dtypes(include=[np.number]).mean()
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(train_means)

# Predict on log scale, then map back to SalePrice.
y_test_pred_log = model_pipeline.predict(X_test)
y_test_pred = np.exp(y_test_pred_log)

test_ids = test_data['Id']
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_test_pred
})

# Save the DataFrame to a CSV file
submission_file_path = 'p2_submission.csv'
submission.to_csv(submission_file_path, index=False)

print(f"Submission file saved as {submission_file_path}")


RMSLE on dev set: 0.1305113109020391
Submission file saved as p2_submission.csv
