<a href="https://colab.research.google.com/github/sudhamsalagar123/Data-science-Project/blob/main/Data_science_end_to_end_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# Create synthetic housing data
np.random.seed(42)  # For reproducibility
n_samples = 1000

# Generate features
area = np.random.normal(1500, 500, n_samples)  # Square footage
bedrooms = np.random.randint(1, 6, n_samples)  # Number of bedrooms
bathrooms = np.random.randint(1, 4, n_samples)  # Number of bathrooms
age = np.random.randint(0, 50, n_samples)  # Age of house in years
distance_to_city = np.random.uniform(1, 30, n_samples)  # Distance to city center

# Generate target (house price) with some relationship to features
price = (
    100000 +  # Base price
    150 * area +  # Area impact
    20000 * bedrooms +  # Bedroom impact
    25000 * bathrooms +  # Bathroom impact
    -2000 * age +  # Age impact (older houses cost less)
    -3000 * distance_to_city  # Location impact
)

# Add some noise
price += np.random.normal(0, 50000, n_samples)

# Create DataFrame
housing_data = pd.DataFrame({
    'area': area,
    'bedrooms': bedrooms,
    'bathrooms': bathrooms,
    'age': age,
    'distance_to_city': distance_to_city,
    'price': price
})

# Save to CSV
housing_data.to_csv('housing_data.csv', index=False)

print("Dataset created and saved as housing_data.csv")

Dataset created and saved as housing_data.csv


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data
data = pd.read_csv('housing_data.csv')

# Explore the data
print("Dataset shape:", data.shape)
print("\nFirst 5 rows:")
print(data.head())
print("\nData summary:")
print(data.describe())

# Check for missing values
print("\nMissing values:")
print(data.isnull().sum())

# Visualize the data
plt.figure(figsize=(15, 10))

# Distribution of target variable
plt.subplot(2, 3, 1)
sns.histplot(data['price'])
plt.title('Price Distribution')

# Relationships between features and target
plt.subplot(2, 3, 2)
sns.scatterplot(x='area', y='price', data=data)
plt.title('Price vs Area')

plt.subplot(2, 3, 3)
sns.boxplot(x='bedrooms', y='price', data=data)
plt.title('Price vs Bedrooms')

plt.subplot(2, 3, 4)
sns.boxplot(x='bathrooms', y='price', data=data)
plt.title('Price vs Bathrooms')

plt.subplot(2, 3, 5)
sns.scatterplot(x='age', y='price', data=data)
plt.title('Price vs Age')

plt.subplot(2, 3, 6)
sns.scatterplot(x='distance_to_city', y='price', data=data)
plt.title('Price vs Distance to City')

plt.tight_layout()
plt.savefig('exploratory_analysis.png')
plt.close()

# Split the data into features and target
X = data.drop('price', axis=1)
y = data['price']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save preprocessed data
import joblib
joblib.dump(scaler, 'scaler.pkl')

print("\nData preprocessing complete.")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Dataset shape: (1000, 6)

First 5 rows:
          area  bedrooms  bathrooms  age  distance_to_city          price
0  1748.357077         4          3   32          1.417062  454521.857623
1  1430.867849         1          3   49         23.166248  267502.479184
2  1823.844269         3          1   37         19.088549  281514.755611
3  2261.514928         5          3   26         23.106776  549621.650593
4  1382.923313         3          3   33          2.129202  300359.337253

Data summary:
              area     bedrooms    bathrooms          age  distance_to_city  \
count  1000.000000  1000.000000  1000.000000  1000.000000       1000.000000   
mean   1509.666028     3.045000     1.985000    24.583000         15.187403   
std     489.607969     1.424431     0.835148    14.630077          8.322703   
min    -120.633670     1.000000     1.000000     0.000000          1.000891   
25%    1176.204847     2.000000     1.000000    12.000000          7.793900   
50%    1512.650306     3.00

In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

# Load the preprocessed data
X_train = pd.read_csv('housing_data.csv').drop('price', axis=1)
y_train = pd.read_csv('housing_data.csv')['price']
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

scaler = joblib.load('scaler.pkl')
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models to evaluate
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Function to evaluate model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R²': r2
    }

# Evaluate all models
results = {}
for name, model in models.items():
    print(f"Evaluating {name}...")
    results[name] = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)

# Display results
results_df = pd.DataFrame(results).T
print("\nModel Evaluation Results:")
print(results_df)

# Find the best model
best_model_name = results_df['R²'].idxmax()
print(f"\nBest model based on R² score: {best_model_name}")

# Fine-tune the best model
print(f"\nFine-tuning {best_model_name}...")

if best_model_name == 'Random Forest':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    best_model = RandomForestRegressor(random_state=42)

elif best_model_name == 'Gradient Boosting':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
    best_model = GradientBoostingRegressor(random_state=42)

elif best_model_name == 'Ridge Regression':
    param_grid = {
        'alpha': [0.1, 1.0, 10.0, 100.0]
    }
    best_model = Ridge()

elif best_model_name == 'Lasso Regression':
    param_grid = {
        'alpha': [0.01, 0.1, 1.0, 10.0]
    }
    best_model = Lasso()

else:  # Linear Regression
    param_grid = {}
    best_model = LinearRegression()

if param_grid:
    grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='r2')
    grid_search.fit(X_train_scaled, y_train)

    print("Best parameters:", grid_search.best_params_)
    final_model = grid_search.best_estimator_
else:
    final_model = best_model
    final_model.fit(X_train_scaled, y_train)

# Final evaluation
final_metrics = evaluate_model(final_model, X_train_scaled, X_test_scaled, y_train, y_test)
print("\nFinal Model Performance:")
for metric, value in final_metrics.items():
    print(f"{metric}: {value:.4f}")

# Save the final model
joblib.dump(final_model, 'house_price_model.pkl')
print("\nFinal model saved as 'house_price_model.pkl'")

# Create a feature importance plot for tree-based models
if best_model_name in ['Random Forest', 'Gradient Boosting']:
    plt.figure(figsize=(10, 6))
    importances = final_model.feature_importances_
    indices = np.argsort(importances)

    plt.barh(range(len(indices)), importances[indices], align='center')
    plt.yticks(range(len(indices)), [X_train.columns[i] for i in indices])
    plt.xlabel('Feature Importance')
    plt.title('Feature Importance for House Price Prediction')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    print("Feature importance plot created.")

Evaluating Linear Regression...
Evaluating Ridge Regression...
Evaluating Lasso Regression...
Evaluating Random Forest...
Evaluating Gradient Boosting...

Model Evaluation Results:
                            MSE          RMSE           MAE        R²
Linear Regression  2.348206e+09  48458.289890  38568.437936  0.771804
Ridge Regression   2.348280e+09  48459.053222  38578.262901  0.771797
Lasso Regression   2.348187e+09  48458.097879  38568.431994  0.771806
Random Forest      2.746556e+09  52407.590945  42463.471210  0.733093
Gradient Boosting  2.791043e+09  52830.319768  42508.776827  0.728770

Best model based on R² score: Lasso Regression

Fine-tuning Lasso Regression...
Best parameters: {'alpha': 10.0}

Final Model Performance:
MSE: 2348021788.2442
RMSE: 48456.3906
MAE: 38568.4015
R²: 0.7718

Final model saved as 'house_price_model.pkl'


In [5]:
!pip install fastapi


Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading starlette-0.46.2-py3-none-any.whl (72 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: starlette, fastapi
Successfully installed fastapi-0.115.12 starlette-0.46.2


In [7]:
!pip install uvicorn


Collecting uvicorn
  Downloading uvicorn-0.34.1-py3-none-any.whl.metadata (6.5 kB)
Downloading uvicorn-0.34.1-py3-none-any.whl (62 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uvicorn
Successfully installed uvicorn-0.34.1


In [8]:
# app.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import joblib
import numpy as np
import uvicorn
import pandas as pd
from typing import Optional

# Initialize FastAPI app
app = FastAPI(
    title="House Price Prediction API",
    description="A simple API for predicting house prices based on features",
    version="1.0.0"
)

# Load the trained model and scaler
model = joblib.load('house_price_model.pkl')
scaler = joblib.load('scaler.pkl')

# Define input data model
class HouseFeatures(BaseModel):
    area: float = Field(..., gt=0, description="Area of the house in square feet")
    bedrooms: int = Field(..., ge=1, le=10, description="Number of bedrooms")
    bathrooms: int = Field(..., ge=1, le=10, description="Number of bathrooms")
    age: int = Field(..., ge=0, description="Age of the house in years")
    distance_to_city: float = Field(..., ge=0, description="Distance to city center in miles")

# Define output data model
class PredictionResult(BaseModel):
    predicted_price: float
    confidence_interval: Optional[dict] = None

@app.get("/")
async def root():
    return {"message": "Welcome to the House Price Prediction API! Use /predict endpoint to make predictions."}

@app.post("/predict", response_model=PredictionResult)
async def predict_price(house: HouseFeatures):
    try:
        # Convert input to DataFrame
        features = pd.DataFrame({
            'area': [house.area],
            'bedrooms': [house.bedrooms],
            'bathrooms': [house.bathrooms],
            'age': [house.age],
            'distance_to_city': [house.distance_to_city]
        })

        # Scale features
        scaled_features = scaler.transform(features)

        # Make prediction
        prediction = model.predict(scaled_features)[0]

        # For some models, we can provide confidence intervals
        confidence_interval = None
        if hasattr(model, 'predict_proba') or hasattr(model, 'estimators_'):
            # This is a simple approximation for tree-based models
            # In a real application, you might want to use more sophisticated methods
            try:
                if hasattr(model, 'estimators_'):
                    # For ensemble models like Random Forest
                    predictions = []
                    for estimator in model.estimators_:
                        predictions.append(estimator.predict(scaled_features)[0])
                    std_dev = np.std(predictions)
                    confidence_interval = {
                        "lower_bound": prediction - 1.96 * std_dev,
                        "upper_bound": prediction + 1.96 * std_dev,
                    }
            except:
                pass

        return PredictionResult(
            predicted_price=float(prediction),
            confidence_interval=confidence_interval
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/health")
async def health_check():
    return {"status": "healthy"}

# Run the application
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)

INFO:     Will watch for changes in these directories: ['/content']
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
INFO:     Started reloader process [231] using StatReload
INFO:     Stopping reloader process [231]
