In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import IsolationForest

In [2]:
train_data = pd.read_csv('S:/COLLEGE/INTERN/PRODIGY-Machine Learning/train.csv')
test_data = pd.read_csv('S:/COLLEGE/INTERN/PRODIGY-Machine Learning/test.csv')

In [3]:
# Display information about the training data
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
# Step 2: Data Preprocessing
# Assuming 'GrLivArea', 'BedroomAbvGr', and 'FullBath' are the relevant features
X_train = train_data[['GrLivArea', 'BedroomAbvGr', 'FullBath']]
y_train = train_data['SalePrice']

X_test = test_data[['GrLivArea', 'BedroomAbvGr', 'FullBath']]


In [5]:
# Handle missing values and scale features
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [6]:
# Feature Engineering
train_data['TotalBathrooms'] = train_data['FullBath'] + 0.5 * train_data['HalfBath'] + train_data['BsmtFullBath'] + 0.5 * train_data['BsmtHalfBath']
train_data['BedBathRatio'] = train_data['BedroomAbvGr'] / train_data['TotalBathrooms']

test_data['TotalBathrooms'] = test_data['FullBath'] + 0.5 * test_data['HalfBath'] + test_data['BsmtFullBath'] + 0.5 * test_data['BsmtHalfBath']
test_data['BedBathRatio'] = test_data['BedroomAbvGr'] / test_data['TotalBathrooms']

In [7]:
# Outlier Removal
iso_forest = IsolationForest(contamination=0.02, random_state=42)
outliers = iso_forest.fit_predict(train_data[['GrLivArea', 'BedroomAbvGr', 'FullBath', 'TotalBathrooms', 'BedBathRatio']])
train_data = train_data.loc[outliers == 1]



In [8]:
# Step 3: Train the Linear Regression Model with Feature Engineering and Regularization
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly_features', PolynomialFeatures(degree=2)),
    ('regressor', Ridge(alpha=1.0))  # You can adjust the alpha parameter for Ridge regularization
])

model.fit(X_train_preprocessed, y_train)

In [9]:
# Step 4: Model Evaluation with Cross-Validation
y_pred = model.predict(X_train_preprocessed)
mse = mean_squared_error(y_train, y_pred)
r2 = r2_score(y_train, y_pred)

cv_scores = cross_val_score(model, X_train_preprocessed, y_train, scoring='neg_mean_squared_error', cv=5)
cv_rmse_scores = np.sqrt(-cv_scores)

print(f'Training Mean Squared Error: {mse}')
print(f'Training R-squared: {r2}')
print(f'Cross-Validation RMSE: {np.mean(cv_rmse_scores)}')

Training Mean Squared Error: 2407872998.5539765
Training R-squared: 0.6182093365001203
Cross-Validation RMSE: 51213.59871141924


In [10]:
# Step 5: Make Predictions on the Test Set
predicted_prices = model.predict(X_test_preprocessed)

In [11]:
# Step 6: Create Submission File
submission_df = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': predicted_prices})
submission_df.to_csv('submission.csv', index=False)

In [12]:
# Display the first few rows of the submission file
print(submission_df.head())

     Id      SalePrice
0  1461  116778.571170
1  1462  146441.289975
2  1463  203960.231025
3  1464  200271.915320
4  1465  181542.424503


In [13]:
from IPython.display import FileLink

# Create a download link for the CSV file
FileLink('submission.csv')