<a href="https://colab.research.google.com/github/rohitarer/PRODIGY_ML_01/blob/main/PRODIGY_ML_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [94]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [95]:
# Load the dataset
data = pd.read_csv('./train.csv')

In [96]:
# Display the first few rows of the dataset
print(data.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [97]:
# Display the first few rows of the dataset
print(data.head())

# Display the column names to identify relevant features
print(data.columns)

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [98]:
# Define the features and target variable based on the dataset structure
features = ['GrLivArea', 'BedroomAbvGr', 'FullBath', 'OverallQual', 'YearBuilt', 'TotalBsmtSF',
            '1stFlrSF', '2ndFlrSF', 'GarageCars', 'GarageArea']
target = 'SalePrice'

In [99]:
# Prepare the data
X = data[features]
y = data[target]

In [100]:
# Handle missing values, scaling, and polynomial features
numeric_features = features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)])

In [101]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [102]:
# Apply preprocessing
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [103]:
# Create the Ridge regression model
model = Ridge(alpha=1.0)

In [104]:
# Create the linear regression model
model = LinearRegression()

In [105]:
# Train the model
model.fit(X_train, y_train)

In [106]:
# Make predictions
y_pred = model.predict(X_test)
y_pred

array([138237.54056608, 310078.65374836, 123751.43684079, 170281.28912559,
       318621.69266116,  72503.99484775, 206130.2963572 , 157382.02692873,
        72052.86554833, 133352.01835247, 152301.57323386, 114250.16298649,
       155635.61370719, 219733.5748637 , 185573.99823037, 127508.95491096,
       202187.22255877, 133563.96101806, 125552.88945481, 211908.00109445,
       165520.59311325, 204825.36515573, 169130.0998332 , 123173.32630087,
       205784.30272529, 169204.04319039, 196311.45891756, 104442.38712564,
       178704.44062498, 206321.12338606, 104008.28674356, 268728.33940631,
        98506.22046208, 117408.76594631, 268401.38911763, 142291.46754495,
       152946.67944282, 219136.1455298 , 305727.93070232,  92426.23194028,
        87069.35254177, 230762.48308779, 123944.51836123, 345882.30151646,
       127021.05604007, 187121.67064268, 123273.86786517, 122116.38856083,
       399975.25518292, 142443.48901071, 117635.31858695, 203431.94639005,
       117647.98642061, 2

In [107]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1188213284.7381086
R-squared: 0.845089602105221


In [108]:
# # Display the coefficients
# coefficients = pd.DataFrame(model.coef_, features, columns=['Coefficient'])
# print(coefficients)

In [109]:
score = model.score(X_test, y_test)
score

0.845089602105221