<a href="https://colab.research.google.com/github/sb200004/summerGit/blob/main/week5assignmentLMS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell is expected to fail elsewhere — confirm before reuse.
from google.colab import files
# Ask for files to be uploaded into the session and keep the call's result in
# `uploaded`. A later cell reads 'train.csv' and 'test.csv' by relative path, so
# presumably those are the two files to provide here — TODO confirm.
uploaded = files.upload()


In [None]:
import pandas as pd

# Load datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the target aside and remove it from the training features so that train
# and test have the same columns for preprocessing.
y = train['SalePrice']
train = train.drop(columns=['SalePrice'])

# Combine train and test for uniform preprocessing.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it the
# row labels of `test` repeat those of `train`, and any label-aligned operation
# on the combined frame becomes ambiguous (or raises on duplicate labels).
# Train rows still come first, so a positional split at len(train) recovers them.
all_data = pd.concat([train, test], axis=0, sort=False, ignore_index=True)
print("Shape of all_data:", all_data.shape)


In [None]:
# Overview.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
all_data.info()

# Check missing values: per-column NaN counts, keeping only columns that have
# at least one, largest count first.
missing = all_data.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)
print("Missing values:\n", missing)

# Preview data (last expression of the cell, so it is rendered as a table)
all_data.head()


In [None]:
# Drop 'Id' and features with very high missing values or no variance.
# The result is reassigned (no inplace=True) and errors='ignore' skips columns
# that are already gone, so re-running this cell is a no-op instead of a
# KeyError.
cols_to_drop = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Utilities']
all_data = all_data.drop(columns=cols_to_drop, errors='ignore')


In [None]:
# Columns whose missing entries get a fixed replacement:
#   - categorical columns receive the literal label 'None'
#   - numerical columns receive 0
none_filled = (
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType',
)
zero_filled = (
    'GarageYrBlt', 'GarageArea', 'GarageCars',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea',
)
fixed_fill = {
    **dict.fromkeys(none_filled, 'None'),
    **dict.fromkeys(zero_filled, 0),
}
for column, replacement in fixed_fill.items():
    all_data[column] = all_data[column].fillna(replacement)


def _fill_with_own_median(values):
    """Replace NaN in `values` with the median of `values`."""
    return values.fillna(values.median())


# LotFrontage: missing entries take the median LotFrontage of the rows that
# share the same Neighborhood.
all_data['LotFrontage'] = (
    all_data.groupby('Neighborhood')['LotFrontage'].transform(_fill_with_own_median)
)

# Anything still missing, in any column, is replaced by that column's most
# frequent value (the first row of mode()).
column_modes = all_data.mode().iloc[0]
all_data = all_data.fillna(column_modes)


In [None]:
# Total square footage: basement plus first and second floor.
# Plain '+' is used on purpose so a NaN in any part propagates to the total.
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# Total bathrooms: half baths count as 0.5, above ground and in the basement.
all_data['TotalBath'] = (all_data['FullBath'] + 0.5 * all_data['HalfBath'] +
                         all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath'])

# Has garage, pool, etc.: 1 when the source area is strictly positive, else 0.
# A vectorised comparison replaces the row-by-row .apply(lambda ...); the
# result is the same (NaN compares as not > 0 and therefore maps to 0).
presence_flags = {
    'HasGarage': 'GarageArea',
    'HasPool': 'PoolArea',
    'Has2ndFloor': '2ndFlrSF',
}
for flag_name, area_column in presence_flags.items():
    all_data[flag_name] = (all_data[area_column] > 0).astype(int)


In [None]:
# These columns are stored as numbers but are cast to strings here so that the
# encoding step below treats them as categories rather than quantities.
code_columns = ['MSSubClass', 'OverallCond', 'YrSold', 'MoSold']
for code_column in code_columns:
    all_data[code_column] = all_data[code_column].astype(str)

# One-hot encode every remaining categorical column in a single pass.
all_data = pd.get_dummies(all_data)
print("Data shape after encoding:", all_data.shape)


In [None]:
from scipy.stats import skew
import numpy as np

# Absolute skewness above this value marks a feature for the log transform.
SKEW_THRESHOLD = 0.75

# Identify skewed numeric features.
# This runs after one-hot encoding, so most non-object columns are 0/1
# indicators. Those must not be log-transformed: a rare indicator looks highly
# "skewed", and log1p would turn its 1s into 0.693 (as float16 when the column
# is bool/uint8). Only columns with more than two distinct values are treated
# as real numeric features.
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
has_numeric_range = (all_data[numeric_feats].nunique() > 2).to_numpy()
numeric_feats = numeric_feats[has_numeric_range]

skewed_feats = (
    all_data[numeric_feats]
    .apply(lambda col: skew(col.dropna().astype(float)))
    .sort_values(ascending=False)
)
skewness = skewed_feats[abs(skewed_feats) > SKEW_THRESHOLD]

# Apply log1p; the explicit float cast keeps the result in float64 regardless
# of the column's integer width.
for feat in skewness.index:
    all_data[feat] = np.log1p(all_data[feat].astype(float))

# Log transform target too.
# NOTE: this cell is not idempotent. Running it a second time without
# re-running the cells above applies log1p again to `y` and to the features.
y = np.log1p(y)


In [None]:
# Undo the earlier stacking: the first len(train) rows of all_data are the
# training features, everything after them belongs to the test set.
n_train_rows = train.shape[0]
X_train = all_data.iloc[:n_train_rows]
X_test = all_data.iloc[n_train_rows:]
