In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
# Read the assignment data from the CSV file in the working directory
df = pd.read_csv('GA_5_dataset.csv')

# Report (rows, columns) so the load can be sanity-checked at a glance
print("Dataset shape:", df.shape)

# Preview the first five rows; as the cell's last expression it is rendered richly
df.head(n=5)

Dataset shape: (6000, 17)


Unnamed: 0,Marital_Status_Married,Marital_Status_Single,Attrition_Flag_Existing Customer,Gender_M,Education_Level,Income_Category,Card_Category,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Trans_Amt,Total_Trans_Ct,Credit_Limit
0,1.069045,-0.921798,0.439814,-0.95119,0.834712,-0.893941,-0.246607,0.491018,-0.257859,0.02221,-1.164956,0.669239,-0.415532,-1.44691,0.120064,0.941898,1912.0
1,1.069045,-0.921798,0.439814,-0.95119,-0.499271,-0.893941,-0.246607,-0.424834,-1.872632,-0.10321,-1.805996,-1.359398,-2.222191,-0.12163,0.94332,1.406924,8401.0
2,-0.935414,1.084837,0.439814,-0.95119,1.501703,-0.893941,-0.246607,0.0,-0.257859,0.02221,1.399203,-0.34508,-0.415532,1.583523,0.173868,0.815073,2759.0
3,-0.935414,1.084837,0.439814,-0.95119,-1.166262,-0.893941,-0.246607,0.0,-1.065246,0.02221,-0.523916,0.669239,-0.415532,0.258243,-0.007792,0.434597,7075.0
4,-0.935414,1.084837,0.439814,1.051315,-0.499271,0.602194,2.750132,-1.079013,0.549527,-0.35405,1.399203,-1.359398,-0.415532,0.743161,0.028077,0.561422,34516.0


In [3]:
# Separate features (X) and target variable (y).
# The target is selected by name rather than by position so that a change in
# the CSV's column order cannot silently turn a different column into the
# target; a missing column raises a KeyError instead of producing wrong data.
TARGET_COLUMN = 'Credit_Limit'
X = df.drop(columns=TARGET_COLUMN)  # every column except the target, original order kept
y = df[TARGET_COLUMN]               # the Credit_Limit column as a Series

print("Feature matrix shape:", X.shape)
print("Target variable shape:", y.shape)

Feature matrix shape: (6000, 16)
Target variable shape: (6000,)


In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Row counts of the two partitions
print("Training set size:", len(X_train))
print("Test set size:", len(X_test))

Training set size: 4200
Test set size: 1800


## Question 1 & 2: Linear Regression with fit_intercept=False

In [5]:
# Least-squares fit on the training split with the intercept term disabled
# (fit() returns the fitted estimator, so construction and fitting are chained)
lr_model = LinearRegression(fit_intercept=False).fit(X_train, y_train)

# Score the model on the held-out rows
y_pred = lr_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"R2 Score on test dataset: {r2:.3f}")
print(f"\nQuestion 1 Answer: {r2:.3f}")

R2 Score on test dataset: -0.411

Question 1 Answer: -0.411


In [6]:
# Coefficients learned by the no-intercept linear model, one per feature column
coefficients = lr_model.coef_

# Rank by magnitude: argmax gives the position of the largest absolute value
abs_coefficients = np.abs(coefficients)
max_coef_index = abs_coefficients.argmax()

# List every coefficient next to its magnitude
print("Coefficients:")
for i, (coef, magnitude) in enumerate(zip(coefficients, abs_coefficients)):
    print(f"Feature {i}: {coef:.4f} (Absolute: {magnitude:.4f})")

# Summarise the winning feature (reported by positional index into X's columns)
print(f"\nFeature with highest absolute coefficient:")
print(f"Index: {max_coef_index}")
print(f"Coefficient value: {coefficients[max_coef_index]:.4f}")
print(f"Absolute coefficient value: {abs_coefficients[max_coef_index]:.4f}")

print(f"\nQuestion 2 Answer: {max_coef_index}")

Coefficients:
Feature 0: -809.6172 (Absolute: 809.6172)
Feature 1: -533.7746 (Absolute: 533.7746)
Feature 2: 200.0798 (Absolute: 200.0798)
Feature 3: -170.5218 (Absolute: 170.5218)
Feature 4: 4.6205 (Absolute: 4.6205)
Feature 5: 4462.9321 (Absolute: 4462.9321)
Feature 6: 3877.5317 (Absolute: 3877.5317)
Feature 7: 132.2398 (Absolute: 132.2398)
Feature 8: 192.6857 (Absolute: 192.6857)
Feature 9: -1.3186 (Absolute: 1.3186)
Feature 10: 22.3471 (Absolute: 22.3471)
Feature 11: -86.5910 (Absolute: 86.5910)
Feature 12: 259.8347 (Absolute: 259.8347)
Feature 13: 1.6462 (Absolute: 1.6462)
Feature 14: 1360.9905 (Absolute: 1360.9905)
Feature 15: -794.0213 (Absolute: 794.0213)

Feature with highest absolute coefficient:
Index: 5
Coefficient value: 4462.9321
Absolute coefficient value: 4462.9321

Question 2 Answer: 5


## Question 3 & 4: Ridge Regression

In [7]:
# Import Ridge from sklearn
from sklearn.linear_model import Ridge

In [8]:
# Ridge regression using the 'sag' solver; the seed makes the stochastic
# solver's result repeatable
ridge_model = Ridge(
    solver='sag',
    tol=0.0005,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred_ridge = ridge_model.predict(X_test)
r2_ridge = r2_score(y_true=y_test, y_pred=y_pred_ridge)

print(f"R2 Score on test dataset: {r2_ridge:.3f}")
print(f"\nQuestion 3 Answer: {r2_ridge:.3f}")

R2 Score on test dataset: 0.503

Question 3 Answer: 0.503


In [9]:
# Intercept term learned by the fitted Ridge model
intercept = ridge_model.intercept_

# Both report lines are emitted by one print call, separated by a newline
print(
    f"Intercept value: {intercept:.3f}",
    f"\nQuestion 4 Answer: {intercept:.3f}",
    sep="\n",
)

Intercept value: 8638.308

Question 4 Answer: 8638.308


## Question 5 & 6: Lasso Regression

In [10]:
# Import Lasso from sklearn
from sklearn.linear_model import Lasso

In [11]:
# Lasso regression with a regularisation strength of alpha=100
lasso_model = Lasso(
    alpha=100,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred_lasso = lasso_model.predict(X_test)
r2_lasso = r2_score(y_true=y_test, y_pred=y_pred_lasso)

print(f"R2 Score on test dataset: {r2_lasso:.3f}")
print(f"\nQuestion 5 Answer: {r2_lasso:.3f}")

R2 Score on test dataset: 0.501

Question 5 Answer: 0.501


In [12]:
# Coefficients of the fitted Lasso model, one per feature column
lasso_coefficients = lasso_model.coef_

# A coefficient lies in [-1, 1] exactly when its absolute value is at most 1
count_in_range = (np.abs(lasso_coefficients) <= 1).sum()

# List every coefficient so the count can be checked by eye
print("Lasso Coefficients:")
for i, coef in enumerate(lasso_coefficients):
    print(f"Feature {i}: {coef:.4f}")

print(f"\nNumber of coefficients in range [-1, 1]: {count_in_range}")
print(f"\nQuestion 6 Answer: {count_in_range}")

Lasso Coefficients:
Feature 0: -197.6313
Feature 1: -0.0000
Feature 2: 0.0000
Feature 3: 0.0000
Feature 4: -0.0000
Feature 5: 4289.9714
Feature 6: 3753.3467
Feature 7: 0.0000
Feature 8: 62.8468
Feature 9: 0.0000
Feature 10: -0.0000
Feature 11: -0.0000
Feature 12: 107.5511
Feature 13: 0.0000
Feature 14: 694.9363
Feature 15: -133.8558

Number of coefficients in range [-1, 1]: 9

Question 6 Answer: 9


## Question 7: KNeighborsRegressor

In [13]:
# Import KNeighborsRegressor and mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [14]:
# k-nearest-neighbours regressor: 10 neighbours, Minkowski power parameter p=1
knn_model = KNeighborsRegressor(n_neighbors=10, p=1).fit(X_train, y_train)

# Predict the held-out rows
y_pred_knn = knn_model.predict(X_test)

# RMSE is the square root of the mean squared error
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)

print(f"RMSE on test dataset: {rmse_knn:.3f}")
print(f"\nQuestion 7 Answer: {rmse_knn:.3f}")

RMSE on test dataset: 6707.056

Question 7 Answer: 6707.056


## Question 8: Decision Tree Regressor

In [15]:
# Import DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor

In [16]:
# Hyperparameters of the regression tree, gathered in one place
dt_params = {
    'max_depth': 10,
    'min_samples_split': 6,
    'min_samples_leaf': 6,
    'random_state': 42,
}

# Build the tree with those settings and fit it on the training split
dt_model = DecisionTreeRegressor(**dt_params).fit(X_train, y_train)

# Predict the held-out rows
y_pred_dt = dt_model.predict(X_test)

# RMSE is the square root of the mean squared error
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)

print(f"RMSE on test dataset: {rmse_dt:.3f}")
print(f"\nQuestion 8 Answer: {rmse_dt:.3f}")

RMSE on test dataset: 6740.834

Question 8 Answer: 6740.834


## Question 9 & 10: AdaBoost with GridSearchCV

In [17]:
# Import AdaBoostRegressor and GridSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

In [18]:
# Candidate hyperparameter values: every combination of the two lists is tried
param_grid = dict(
    n_estimators=[10, 50, 100, 200, 500],
    learning_rate=[0.1, 0.5, 1, 2],
)

# Base AdaBoost regressor; the seed keeps each candidate fit repeatable
ada_model = AdaBoostRegressor(random_state=42)

# Exhaustive search scored by R2 with 4-fold cross-validation;
# n_jobs=-1 lets scikit-learn run the fits in parallel
grid_search = GridSearchCV(
    ada_model,
    param_grid,
    scoring='r2',
    cv=4,
    n_jobs=-1,
)

# Run the search on the training split, bracketing it with progress messages
print("Training GridSearchCV... This may take a few minutes.")
grid_search.fit(X_train, y_train)
print("GridSearchCV training completed!")

Training GridSearchCV... This may take a few minutes.
GridSearchCV training completed!


In [19]:
# Estimator exposed by the finished grid search for the best-scoring
# parameter combination (available because GridSearchCV refits by default)
best_ada_model = grid_search.best_estimator_

# Show the winning hyperparameter combination under its heading
print("Best Parameters:", grid_search.best_params_, sep="\n")

Best Parameters:
{'learning_rate': 0.1, 'n_estimators': 10}


In [20]:
# Predict the held-out rows with the tuned AdaBoost model
y_pred_ada = best_ada_model.predict(X_test)

# R2 of those predictions against the true test targets
r2_ada = r2_score(y_true=y_test, y_pred=y_pred_ada)

print(f"R2 Score on test dataset: {r2_ada:.3f}")
print(f"\nQuestion 9 Answer: {r2_ada:.3f}")

R2 Score on test dataset: 0.540

Question 9 Answer: 0.540


In [21]:
# n_estimators value from the best parameter combination found by the search
best_n_estimators = grid_search.best_params_['n_estimators']

# Both report lines are emitted by one print call, separated by a newline
print(
    f"Best n_estimators: {best_n_estimators}",
    f"\nQuestion 10 Answer: {best_n_estimators}",
    sep="\n",
)

Best n_estimators: 10

Question 10 Answer: 10
