In [1]:
import pandas as pd

# Load my dataset
df = pd.read_csv('my_train.csv')

# Initialize an empty dictionary to store the count of unique values for each field
unique_counts = {}

# Iterate over each column in the DataFrame
for column in df.columns:
    # Count the number of unique values in each column
    unique_counts[column] = df[column].nunique()

# Print the unique value counts for each field
print("Unique value counts for each field:")
for field, count in unique_counts.items():
    print(f"{field}: {count}")


Unique value counts for each field:
Id: 1314
MSSubClass: 15
MSZoning: 5
LotFrontage: 107
LotArea: 989
Street: 2
Alley: 2
LotShape: 4
LandContour: 4
Utilities: 2
LotConfig: 5
LandSlope: 3
Neighborhood: 25
Condition1: 9
Condition2: 8
BldgType: 5
HouseStyle: 8
OverallQual: 10
OverallCond: 9
YearBuilt: 110
YearRemodAdd: 61
RoofStyle: 6
RoofMatl: 8
Exterior1st: 15
Exterior2nd: 16
MasVnrType: 3
MasVnrArea: 304
ExterQual: 4
ExterCond: 5
Foundation: 6
BsmtQual: 4
BsmtCond: 4
BsmtExposure: 4
BsmtFinType1: 6
BsmtFinSF1: 601
BsmtFinType2: 6
BsmtFinSF2: 131
BsmtUnfSF: 730
TotalBsmtSF: 686
Heating: 6
HeatingQC: 4
CentralAir: 2
Electrical: 5
1stFlrSF: 721
2ndFlrSF: 390
LowQualFinSF: 21
GrLivArea: 810
BsmtFullBath: 4
BsmtHalfBath: 3
FullBath: 4
HalfBath: 3
BedroomAbvGr: 8
KitchenAbvGr: 4
KitchenQual: 4
TotRmsAbvGrd: 12
Functional: 7
Fireplaces: 4
FireplaceQu: 5
GarageType: 6
GarageYrBlt: 96
GarageFinish: 3
GarageCars: 5
GarageArea: 422
GarageQual: 5
GarageCond: 5
PavedDrive: 3
WoodDeckSF: 253
OpenPor

In [11]:
# Identify categorical and numerical columns
categorical_cols = train_data.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('SalePrice')  # Exclude the target variable from the features

# Define the preprocessing for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())  # Standardize numerical features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Apply the transformations to the training and development datasets
X_train = preprocessor.fit_transform(train_data.drop('SalePrice', axis=1))
X_dev = preprocessor.transform(dev_data.drop('SalePrice', axis=1))

# Extract the target variable
y_train = train_data['SalePrice']
y_dev = dev_data['SalePrice']

# Train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Get the model coefficients
coefficients = model.coef_

# Manually construct feature names for the one-hot encoded categorical features
onehot_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
all_feature_names = np.concatenate([numerical_cols, onehot_feature_names])

# Create a DataFrame with feature names and coefficients
feature_importance = pd.DataFrame({
    'Feature': all_feature_names,
    'Coefficient': coefficients
})

# Sort the features by their coefficients
sorted_features = feature_importance.sort_values(by='Coefficient', ascending=False)

# Extract the top 10 most positive and negative features
top_10_positive = sorted_features.head(10)
top_10_negative = sorted_features.tail(10)

print("Top 10 Most Positive Features:\n", top_10_positive)
print("\nTop 10 Most Negative Features:\n", top_10_negative)

Top 10 Most Positive Features:
                   Feature    Coefficient
277        PoolQC_missing  265865.782104
128      RoofMatl_Membran  137717.033597
133      RoofMatl_WdShngl  114277.343147
129        RoofMatl_Metal  102092.066192
102       Condition2_PosA   98741.957380
259         GarageQual_Ex   87534.061440
125        RoofStyle_Shed   67446.969683
99      Condition2_Artery   52054.724610
131      RoofMatl_Tar&Grv   45775.149628
87   Neighborhood_StoneBr   44101.310195

Top 10 Most Negative Features:
               Feature    Coefficient
64      LandSlope_Sev  -27170.191823
262     GarageQual_Po  -33800.980324
240    Functional_Sev  -33826.682802
228    Electrical_Mix  -34426.156709
104   Condition2_RRAe  -75258.088116
265     GarageCond_Ex  -82418.172324
276         PoolQC_Gd -126232.159564
275         PoolQC_Fa -153639.689603
103   Condition2_PosN -191625.940796
126  RoofMatl_ClyTile -520189.208076


In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import Pipeline
import numpy as np

# Load the datasets
train_data_path = 'my_train.csv'
dev_data_path = 'my_dev.csv'
train_data = pd.read_csv(train_data_path)
dev_data = pd.read_csv(dev_data_path)

# Convert all fields to strings
train_data = train_data.astype(str)
dev_data = dev_data.astype(str)

# Extracting the target variable 'SalePrice' and converting to float
y_train = train_data['SalePrice'].astype(float)
y_dev = dev_data['SalePrice'].astype(float)

# Dropping the 'Id' column and the target variable from the datasets
X_train = train_data.drop(['SalePrice', 'Id'], axis=1)
X_dev = dev_data.drop(['SalePrice', 'Id'], axis=1)

# Define the OneHotEncoder for categorical features
cat_processor = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Identifying all columns as categorical
categorical_features = list(X_train.columns)

# Creating the column transformer to apply OneHotEncoder to all features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_processor, categorical_features)
    ])

# Creating a pipeline that first transforms the data and then applies linear regression
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('regressor', LinearRegression())])

# Fit the model on the training data
model_pipeline.fit(X_train, y_train)

# Predict on the dev set
y_dev_pred = model_pipeline.predict(X_dev)

# Compute the Root Mean Squared Log Error (RMSLE)
rmsle = np.sqrt(mean_squared_log_error(y_dev, y_dev_pred))

print(f"Root Mean Squared Log Error (RMSLE): {rmsle}")




Root Mean Squared Log Error (RMSLE): 0.1578699386146292


In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np

# Load the training data
train_data_path = 'my_train.csv'
train_data = pd.read_csv(train_data_path)

# Convert all fields to strings
train_data = train_data.astype(str)

# Extracting the target variable 'SalePrice' and converting to float
y_train = train_data['SalePrice'].astype(float)

# Dropping the 'Id' column and the target variable from the training dataset
X_train = train_data.drop(['SalePrice', 'Id'], axis=1)

# Define the OneHotEncoder for categorical features
cat_processor = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Identifying all columns as categorical
categorical_features = list(X_train.columns)

# Creating the column transformer to apply OneHotEncoder to all features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_processor, categorical_features)
    ])

# Creating a pipeline that first transforms the data and then applies linear regression
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('regressor', LinearRegression())])

# Fit the model on the training data
model_pipeline.fit(X_train, y_train)

# Load the test data
test_data_path = 'test.csv'  # Replace with the correct path to your test data
test_data = pd.read_csv(test_data_path)

# Convert all fields to strings
test_data = test_data.astype(str)

# Store the Id column for the submission file
test_ids = test_data['Id']

# Dropping the 'Id' column from the test dataset
X_test = test_data.drop(['Id'], axis=1)

# Predict on the test set using the trained model
y_test_pred = model_pipeline.predict(X_test)

# Create the submission DataFrame
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_test_pred
})

# Path to save the submission file
submission_file_path = 'submission.csv'

# Save the submission file
submission.to_csv(submission_file_path, index=False)

# Displaying the first few rows of the submission DataFrame
print(submission.head())




KeyError: "['Id'] not in index"

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np

# Load the training data
train_data_path = 'train.csv'
train_data = pd.read_csv(train_data_path)

# Convert all fields to strings
train_data = train_data.astype(str)

# Extracting the target variable 'SalePrice' and converting to float
y_train = train_data['SalePrice'].astype(float)

# Dropping the target variable 'SalePrice' from the training dataset
X_train = train_data.drop(['SalePrice'], axis=1)

# Define the OneHotEncoder for categorical features
cat_processor = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Identifying all columns as categorical
categorical_features = list(X_train.columns)

# Creating the column transformer to apply OneHotEncoder to all features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_processor, categorical_features)
    ])

# Creating a pipeline that first transforms the data and then applies linear regression
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('regressor', LinearRegression())])

# Fit the model on the training data
model_pipeline.fit(X_train, y_train)

# Load the test data
test_data_path = 'test.csv'  # Replace with the correct path to your test data
test_data = pd.read_csv(test_data_path)

# Convert all fields to strings
test_data = test_data.astype(str)

# Predict on the test set using the trained model
y_test_pred = model_pipeline.predict(test_data)

# Create the submission DataFrame
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': y_test_pred
})

# Path to save the submission file
submission_file_path = 'submission.csv'

# Save the submission file
submission.to_csv(submission_file_path, index=False)

# Displaying the first few rows of the submission DataFrame
print(submission.head())




     Id      SalePrice
0  1461  148233.962311
1  1462  170419.163582
2  1463  200189.724848
3  1464  208479.092619
4  1465  217482.952381
