<a href="https://colab.research.google.com/github/sagunkayastha/CAI_Workshop/blob/main/Workshop_1/DS_intro1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
# IPython introspection: a trailing `?` displays the object's docstring.
pd.DataFrame?

In [None]:
import seaborn as sns
# IPython wildcard search: lists the names in the seaborn namespace that match `*`.
sns.*?

## The Boston Housing Dataset   
We explore the Boston housing dataset, which contains US census data concerning houses in various areas around the city of Boston.

Boston Housing Data: This dataset was taken from the StatLib library, which is maintained by Carnegie Mellon University. It concerns housing prices in the city of Boston and has 506 instances with 13 features.

---

In [None]:
# Common standard libraries

import datetime
import time
import os

In [None]:
# Common external libraries

import pandas as pd
import numpy as np
import sklearn # scikit-learn
import requests
from bs4 import BeautifulSoup

In [None]:
# Visualization libraries

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Setting plot appearance
# See here for more options: https://matplotlib.org/users/customizing.html

# Render inline figures in 'retina' (high-resolution) format
%config InlineBackend.figure_format='retina'
sns.set() # Apply seaborn's default theme (this does NOT revert to matplotlib defaults)
plt.rcParams['figure.figsize'] = (9, 6)
plt.rcParams['axes.labelpad'] = 10
sns.set_style("darkgrid")

### Loading the data
---

In [None]:
# Uncomment following or upload the file
# !wget https://raw.githubusercontent.com/sagunkayastha/CAI_Workshop/main/Workshop_1/data/BostonHousing.csv


In [None]:

# Read the CSV from the current working directory (download or upload it first)
boston = pd.read_csv('BostonHousing.csv')

![Alt text](https://raw.githubusercontent.com/sagunkayastha/CAI_Workshop/main/Workshop_1/images/image-1.png)

Note: `lstat` (the percentage of lower-status population) is commonly read as a proxy for socio-economic status.

In [None]:
boston

Question: What do we want to do with this data? What is our goal?

In [None]:
# What columns are in the DataFrame? (for a DataFrame, .keys() returns the column labels)
boston.keys()

We want to predict the housing price (`medv`) using the other 13 features.

## Basic EDA

In [None]:
# Work on a copy under a shorter name so that later changes to `df`
# do not modify the originally loaded `boston` frame.
df = boston.copy()

In [None]:
summary_stats = df.describe()
summary_stats

In [None]:

# Set the aesthetic style of the plots.
# Note: sns.set() changes the global seaborn theme, so "whitegrid" also applies
# to plots drawn after this cell.
sns.set(style="whitegrid")

# Draw one histogram per column of df
df.hist(figsize=(16, 14), bins=30)
plt.suptitle('Feature Distributions', fontsize=20)
plt.show()


In [None]:
df.dtypes

In [None]:
# Identify any NaNs: count the missing values in each column
df.isnull().sum()

In [None]:
# Calculate the correlation matrix.
# Only numeric columns are used, so this cell keeps working if it is re-run
# after a non-numeric column (such as 'age_category', created further down)
# has been added to df.
correlation_matrix = df.select_dtypes(include='number').corr()

# Generate a heatmap
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap', fontsize=20)
plt.show()

In [None]:
# Focus on these columns
# (chosen using domain expertise)
cols = ['rm', 'age', 'tax', 'lstat', 'medv']

# The pairwise correlations between the selected columns
df[cols].corr()

In [None]:
# Pairwise correlation heatmap for the columns listed in `cols`

ax = sns.heatmap(
    df[cols].corr(),annot=True,
    cmap=sns.cubehelix_palette(20, light=0.95, dark=0.15),
)
ax.xaxis.tick_top() # move labels to the top

# Optional: uncomment to save the figure to disk
# plt.savefig(
#     'boston-housing-corr.png',
#     bbox_inches='tight',
#     dpi=300,
# )

In [None]:
# Grid of pairwise plots for the columns listed in `cols`.
# plot_kws is forwarded to the off-diagonal plots, diag_kws to the diagonal ones.
sns.pairplot(
    df[cols],
    plot_kws={'alpha': 0.6},
    diag_kws={'bins': 30},
)

In [None]:
# Categorize AGE into 3 bins

def get_age_category(x):
    """Map a numeric age value to one of three coarse labels.

    Returns 'Relatively New' when x < 50, 'Relatively Old' when
    50 <= x < 85, and 'Very Old' for everything else (any value that
    fails both comparisons, NaN included, falls through to 'Very Old').
    """
    if x < 50:
        return 'Relatively New'
    if x < 85:
        return 'Relatively Old'
    return 'Very Old'

# Store the label as a new column on df, computed element-wise from df.age
df['age_category'] = df.age.apply(get_age_category)

In [None]:
# Check the segmented counts
df.groupby('age_category').size()

In [None]:
# Box plot of medv for each age category; `order` fixes the category order
sns.boxplot(
    x='medv',
    y='age_category',
    data=df,
    order=['Relatively New', 'Relatively Old', 'Very Old'],
)

In [None]:
# Violin plot of medv for each age category, in the same category order
sns.violinplot(
    x='medv',
    y='age_category',
    data=df,
    order=['Relatively New', 'Relatively Old', 'Very Old'],
)

In [None]:
# `cols` is re-defined here to include 'age_category', which is used as the hue
cols = ['rm', 'age', 'tax', 'lstat', 'medv', 'age_category']
sns.pairplot(
    df[cols],
    hue='age_category',
    hue_order=['Relatively New', 'Relatively Old', 'Very Old'],
    plot_kws={'alpha': 0.5},
)

In [None]:
df

In [None]:
# Target: the medv column as a NumPy array
y = df['medv'].values
# Single predictor: the lstat column, reshaped to one column (n_samples, 1)
x = df['lstat'].values.reshape(-1,1)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Expand x into degree-3 polynomial terms; together with the default constant
# column this gives four features per sample: 1, x, x^2, x^3
poly = PolynomialFeatures(degree=3)
x_poly = poly.fit_transform(x)

In [None]:
from sklearn.linear_model import LinearRegression
# fit_intercept=False: the constant column already present in x_poly
# plays the role of the intercept term
clf = LinearRegression(fit_intercept=False)
clf.fit(x_poly, y)
# Despite the x_* names, these are the fitted coefficients, lowest degree first
x_0, x_1, x_2, x_3 = clf.coef_
msg = (
    'model: y = {:.3f} + {:.3f}x + {:.3f}x^2 + {:.3f}x^3'
    .format(x_0, x_1, x_2, x_3)
)
print(msg)

In [None]:
# Predict from x_poly, the same matrix passed to clf.fit, i.e. in-sample predictions
y_pred = clf.predict(x_poly)
resid_MEDV = y - y_pred  # residuals (not used again in this cell)

from sklearn.metrics import mean_squared_error
# MSE between y and those in-sample predictions, not a held-out estimate
error = mean_squared_error(y, y_pred)
print('mse = {:.2f}'.format(error))

In [None]:
fig, ax = plt.subplots()

# Plot the samples
ax.scatter(x.flatten(), y, alpha=0.6)

# Plot the polynomial model on an evenly spaced grid of LSTAT values.
# The grid features get their own name so the training matrix `x_poly`
# (used by the fit / predict cells) is not overwritten, and the already-fitted
# transformer is applied with transform() instead of being re-fitted.
x_ = np.linspace(2, 38, 50).reshape(-1, 1)
x_grid_poly = poly.transform(x_)
y_ = clf.predict(x_grid_poly)
ax.plot(x_, y_, color='red', alpha=0.8)

ax.set_xlabel('LSTAT')
ax.set_ylabel('MEDV')



Sklearn


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Separate predictors from the target: every column except the target and the
# derived categorical column becomes a feature
X = df.drop(columns=['medv', 'age_category'])
y = df['medv']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: the scaling statistics are learned from the
# training split only and then reused unchanged on the test split
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a linear regression on the standardized training data
lr_model_scaled = LinearRegression().fit(X_train_scaled, y_train)

# Evaluate on the held-out (standardized) test split
y_pred_scaled = lr_model_scaled.predict(X_test_scaled)
mse_scaled = mean_squared_error(y_test, y_pred_scaled)
rmse_scaled = np.sqrt(mse_scaled)
r2_scaled = r2_score(y_test, y_pred_scaled)

print(f"Mean Squared Error (MSE) for LR: {mse_scaled :.2f}")
print(f"Root Mean Squared Error (RMSE) for LR: {rmse_scaled:.2f}")
print(f"R-squared (R2) for LR: {r2_scaled:.2f}")


In [None]:
# Pull the fitted parameters off the linear model
intercept = lr_model_scaled.intercept_
coefficients = lr_model_scaled.coef_

# Pair each feature name with its coefficient (same order as the columns of X)
feature_coef_dict = dict(zip(X.columns, coefficients))

intercept, feature_coef_dict

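Fitted linear model (the coefficients apply to the standardized features, because the model was trained on `X_train_scaled`):

medv = 22.80 − 1.00×crim + 0.70×zn + 0.28×indus + 0.72×chas − 2.02×nox + 3.15×rm − 0.18×age − 3.08×dis + 2.25×rad − 1.77×tax − 2.04×ptratio + 1.13×b − 3.61×lstat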

In [None]:
from sklearn.svm import SVR

# Support Vector Regressor with default settings, trained on the
# standardized training data
svr_model = SVR().fit(X_train_scaled, y_train)

# Evaluate on the held-out (standardized) test split
y_pred_svr = svr_model.predict(X_test_scaled)
mse_svr = mean_squared_error(y_test, y_pred_svr)
rmse_svr = np.sqrt(mse_svr)
r2_svr = r2_score(y_test, y_pred_svr)

print(f"Mean Squared Error (MSE) for SVR: {mse_svr:.2f}")
print(f"Root Mean Squared Error (RMSE) for SVR: {rmse_svr:.2f}")
print(f"R-squared (R2) for SVR: {r2_svr:.2f}")

In [None]:
# Extract the selected features and the target variable from the dataset.
# Only the hand-picked columns are used here. The variable names from the
# all-features run (X, scaler, lr_model_scaled, ...) are reused, so those
# earlier objects are replaced by this cell.
cols = ['rm', 'age', 'tax', 'lstat']
X = df[cols]
y = df['medv']

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both the training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the Linear Regression model
lr_model_scaled = LinearRegression()

# Fit the model to the scaled training data
lr_model_scaled.fit(X_train_scaled, y_train)

# Make predictions on the scaled test set
y_pred_scaled = lr_model_scaled.predict(X_test_scaled)

# Calculate performance metrics for the scaled data
mse_scaled = mean_squared_error(y_test, y_pred_scaled)
rmse_scaled = np.sqrt(mse_scaled)
r2_scaled = r2_score(y_test, y_pred_scaled)

print(f"Mean Squared Error (MSE) for LR with selected cols: {mse_scaled :.2f}")
print(f"Root Mean Squared Error (RMSE) for LR with selected cols: {rmse_scaled:.2f}")
print(f"R-squared (R2) for LR with selected cols: {r2_scaled :.2f}")


In [None]:
from sklearn.svm import SVR

# Support Vector Regressor with default settings, trained on the
# standardized training data
svr_model = SVR().fit(X_train_scaled, y_train)

# Evaluate on the held-out (standardized) test split
y_pred_svr = svr_model.predict(X_test_scaled)
mse_svr = mean_squared_error(y_test, y_pred_svr)
rmse_svr = np.sqrt(mse_svr)
r2_svr = r2_score(y_test, y_pred_svr)

print(f"Mean Squared Error (MSE) for SVR with selected cols: {mse_svr:.2f}")
print(f"Root Mean Squared Error (RMSE) for SVR with selected cols: {rmse_svr:.2f}")
print(f"R-squared (R2) for SVR with selected cols: {r2_svr:.2f}")

To try other datasets

https://github.com/selva86/datasets/tree/master