# Multivariate Regression Model using Feature Engineering

# # 1. Using Polynomial Features and ElasticNet Regression Model Selecting all the features

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Load the training and testing datasets
train_data = pd.read_csv('cancer_us_county-training.csv')
test_data = pd.read_csv('cancer_us_county-testing.csv')

# Drop non-numeric columns if necessary
train_data = train_data.select_dtypes(include=[np.number])
test_data = test_data.select_dtypes(include=[np.number])

# Data preprocessing (e.g., handling missing values)
train_data.fillna(train_data.mean(), inplace=True)
test_data.fillna(test_data.mean(), inplace=True)

# Get all numeric features except the target variable
features = train_data.columns.drop('TARGET_deathRate')

# Separate features and target variables for training and testing sets
X_train = train_data[features]
y_train = train_data['TARGET_deathRate']
X_test = test_data[features]
y_test = test_data['TARGET_deathRate']

# Feature scaling using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply feature engineering using PolynomialFeatures (interaction_only=True)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Train the ElasticNet regression model with the interaction features
elastic_net = ElasticNet(alpha=1, l1_ratio=0.75)
elastic_net.fit(X_train_poly, y_train)

# Make predictions on the testing set
y_pred = elastic_net.predict(X_test_poly)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Calculate R-squared score
r2 = r2_score(y_test, y_pred)
print(f'R-squared Score: {r2}')


Mean Squared Error: 406.2590594829085
R-squared Score: 0.5025022212179722


## 2. Using Feature selection using Recursive Feature Elimination (RFE) and training model using Multivariate Linear Regression Model for top 10 selected features only

In [13]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the training and testing datasets
train_data = pd.read_csv('cancer_us_county-training.csv')
test_data = pd.read_csv('cancer_us_county-testing.csv')

#dropping this column as it has more null values
train_data.drop(columns=['PctSomeCol18_24'],axis=1,inplace=True)
test_data.drop(columns=['PctSomeCol18_24'],axis=1,inplace=True)

#checking the value counts of two categorical columns in train dataset 
for i in train_data.select_dtypes(include=object):
    print(train_data[i].value_counts(ascending=False))
    print("-------------------------------------------------------------------------------")
    
    
#checking the value counts of two categorical columns in test dataset
for i in test_data.select_dtypes(include=object):
    print(test_data[i].value_counts(ascending=False))
    print("-------------------------------------------------------------------------------")
    
#dropping Geography column as it as more only one count in each unique value
#dropping the binnedInc column as we already have Medianincome of a country column
train_data.drop(columns=['binnedInc', 'Geography'],axis=1,inplace=True)
test_data.drop(columns=['binnedInc', 'Geography'],axis=1,inplace=True)

# Drop non-numeric columns if necessary
train_data = train_data.select_dtypes(include=[np.number])
test_data = test_data.select_dtypes(include=[np.number])

# Data preprocessing (e.g., handling missing values)
train_data.fillna(train_data.mean(), inplace=True)
test_data.fillna(test_data.mean(), inplace=True)

# Get all numeric features except the target variable
features = train_data.columns.drop('TARGET_deathRate')

# Separate features and target variables for training and testing sets
X_train = train_data[features]
y_train = train_data['TARGET_deathRate']
X_test = test_data[features]
y_test = test_data['TARGET_deathRate']

# Feature scaling using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Feature selection using Recursive Feature Elimination (RFE)
lr = LinearRegression()
rfe = RFE(estimator=lr, n_features_to_select=10)
rfe.fit(X_train, y_train)

# Get the selected features
selected_features = features[rfe.support_]
print(f"Selected features: {selected_features.tolist()}")

# Train the multivariate linear regression model with the selected features
X_train_selected = train_data[selected_features]
X_test_selected = test_data[selected_features]

# Scale the selected features
X_train_selected = scaler.fit_transform(X_train_selected)
X_test_selected = scaler.transform(X_test_selected)

# Train the model
lr.fit(X_train_selected, y_train)

# Make predictions on the testing set
y_pred = lr.predict(X_test_selected)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Calculate R-squared score
r2 = r2_score(y_test, y_pred)
print(f'R-squared Score: {r2}')


(40362.7, 42724.4]    254
(54545.6, 61494.5]    252
(34218.1, 37413.8]    252
(45201, 48021.6]      249
(37413.8, 40362.7]    242
(61494.5, 125635]     240
(48021.6, 51046.4]    240
(42724.4, 45201]      238
(51046.4, 54545.6]    236
[22640, 34218.1]      235
Name: binnedInc, dtype: int64
-------------------------------------------------------------------------------
Jackson County, Kentucky         1
Cascade County, Montana          1
Morris County, Texas             1
Dallas County, Arkansas          1
Berrien County, Michigan         1
                                ..
Magoffin County, Kentucky        1
Emmons County, North Dakota      1
Monroe County, New York          1
Hall County, Nebraska            1
Yadkin County, North Carolina    1
Name: Geography, Length: 2438, dtype: int64
-------------------------------------------------------------------------------
[22640, 34218.1]      71
(51046.4, 54545.6]    69
(42724.4, 45201]      67
(48021.6, 51046.4]    65
(61494.5, 125635]    

## 3. Applying Log transformation on Numeric values 

In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Read the CSV files
train_data = pd.read_csv('cancer_us_county-training.csv')
test_data = pd.read_csv('cancer_us_county-testing.csv')

# Choose features for multivariate linear regression
numeric_features = train_data.select_dtypes(include=[np.number]).columns.tolist()
numeric_features.remove('TARGET_deathRate')

# Data preparation: handle missing values
train_data = train_data.dropna(subset=['TARGET_deathRate'] + numeric_features)
test_data = test_data.dropna(subset=['TARGET_deathRate'] + numeric_features)

# Feature engineering: apply log transformation to numeric features
for feature in numeric_features:
    train_data[f'log_{feature}'] = np.log(train_data[feature] + 1)
    test_data[f'log_{feature}'] = np.log(test_data[feature] + 1)

# Update selected features to include engineered features
selected_features = [f'log_{feature}' for feature in numeric_features]

# Prepare the training data
X_train = train_data[selected_features]
y_train = train_data['TARGET_deathRate']

# Prepare the testing data
X_test = test_data[selected_features]
y_test = test_data['TARGET_deathRate']

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the linear regression model
regressor = LinearRegression()
regressor.fit(X_train_scaled, y_train)

# Make predictions on the testing data
y_pred = regressor.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


Mean Squared Error: 259.905417623331
R-squared: 0.5674975513002033
