In [344]:
import pandas as pd
credit = pd.read_csv('data/Credit.csv',index_col=[0])

In [345]:
def adj_r2(r2,n,p):
    return (1 - (1 - r2) * ((n - 1) / (n-p - 1)))

In [346]:
credit.columns.values

array(['Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education', 'Gender',
       'Student', 'Married', 'Ethnicity', 'Balance'], dtype=object)

## Preprocessing

In [347]:
data = pd.get_dummies(credit, columns = ['Gender', 'Student','Married','Ethnicity'])
data.head(3)

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Balance,Gender_Female,Gender_Male,Student_No,Student_Yes,Married_No,Married_Yes,Ethnicity_African American,Ethnicity_Asian,Ethnicity_Caucasian
1,14.891,3606,283,2,34,11,333,0,1,1,0,0,1,0,0,1
2,106.025,6645,483,3,82,15,903,1,0,0,1,0,1,0,1,0
3,104.593,7075,514,4,71,11,580,0,1,1,0,1,0,0,1,0


In [348]:
y= data["Balance"]
X = data.loc[:, data.columns != 'Balance']

In [349]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [350]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

## Linear Regression

In [351]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [352]:
from sklearn.metrics import r2_score
rs= r2_score(y_test, y_pred)
print(f"R2 Score:{r2:.4f} Adj R2 Score:{adj_r2(r2,X_test.shape[0],X_test.shape[1]):.4f}")

R2 Score:0.9382 Adj R2 Score:0.9237


## K-Fold Cross Validation

In [353]:
from sklearn.model_selection import cross_val_score,cross_val_predict
score = cross_val_score(LinearRegression(),X=X_train,y=y_train,cv=5,scoring ='neg_mean_squared_error')
y_pred = cross_val_predict(LinearRegression(), X_test, y_test, cv=5)
from sklearn.metrics import r2_score
r2 =r2_score(y_test, y_pred)
print(f"Cross-Validation Error:{abs(score.mean()):.4f} R2 Score:{r2:.4f} Adj R2 Score:{adj_r2(r2,X_test.shape[0],X_test.shape[1]):.4f}")

Cross-Validation Error:9987.4705 R2 Score:0.9130 Adj R2 Score:0.8927


## Leave One Out Cross Validation

In [354]:
from sklearn.model_selection import cross_val_score,cross_val_predict
loo = LeaveOneOut()
score = cross_val_score(LinearRegression(),X=X_train,y=y_train,cv=loo,scoring='neg_mean_squared_error')
y_pred = cross_val_predict(LinearRegression(), X_test, y_test, cv=loo)
from sklearn.metrics import r2_score
r2 =r2_score(y_test, y_pred)
print(f"Cross-Validation Error:{abs(score.mean()):.4f} R2 Score:{r2:.4f} Adj R2 Score:{adj_r2(r2,X_test.shape[0],X_test.shape[1]):.4f}")

Cross-Validation Error:9807.4042 R2 Score:0.9264 Adj R2 Score:0.9092


## Forward Subset Selection

In [355]:
def calculate_score(X_train,y_train):
    from sklearn.model_selection import cross_val_score,cross_val_predict
    score = cross_val_score(LinearRegression(),X=X_train,y=y_train,cv=5,scoring ='neg_mean_squared_error')
    return abs(score.mean())

In [356]:
import itertools
k= X.shape[1]
remaining_features = list(range(0,k))
features = []
RSS_list = [np.inf]
features_list = dict()
for i in range(1,k+1):
    best_RSS = np.inf
    for combo in itertools.combinations(remaining_features,1):
        score = 0       
        features.append(combo[0])
        score = calculate_score(X_train[:,features],y_train)
        features.remove(combo[0])
        
        if  score < best_RSS:
            best_RSS = score
            best_feature = combo[0]
    
    features.append(best_feature)
    remaining_features.remove(best_feature)
            
    features_list[i] = features.copy()
    RSS_list.append(best_RSS)

In [357]:
result = features_list[RSS_list.index(min(RSS_list))]
data.columns[result]

Index(['Rating', 'Income', 'Student_No', 'Limit', 'Cards', 'Age'], dtype='object')

In [358]:
y_pred = cross_val_predict(LinearRegression(), X_test[:,result], y_test, cv=5)
from sklearn.metrics import r2_score
r2 =r2_score(y_test, y_pred)
print(f"Cross-Validation Error:{abs(min(RSS_list)):.4f} R2 Score:{r2:.4f} Adj R2 Score:{adj_r2(r2,X_test.shape[0],X_test.shape[1]):.4f}")

Cross-Validation Error:9804.6953 R2 Score:0.9231 Adj R2 Score:0.9050


## Ridge regression

In [359]:
from sklearn import linear_model
reg = linear_model.Ridge(alpha=.5)
score = cross_val_score(reg,X=X_train,y=y_train,cv=5,scoring='neg_mean_squared_error')
y_pred = cross_val_predict(reg, X_test, y_test, cv=5)
from sklearn.metrics import r2_score
r2 =r2_score(y_test, y_pred)
print(f"Cross-Validation Error:{abs(score.mean()):.4f} R2 Score:{r2:.4f} Adj R2 Score:{adj_r2(r2,X_test.shape[0],X_test.shape[1]):.4f}")

Cross-Validation Error:9945.2057 R2 Score:0.9088 Adj R2 Score:0.8875


## Lasso regression

In [360]:
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)
score = cross_val_score(reg,X=X_train,y=y_train,cv=5,scoring='neg_mean_squared_error')
y_pred = cross_val_predict(reg, X_test, y_test, cv=5)
from sklearn.metrics import r2_score
r2 =r2_score(y_test, y_pred)
print(f"Cross-Validation Error:{abs(score.mean()):.4f} R2 Score:{r2:.4f} Adj R2 Score:{adj_r2(r2,X_test.shape[0],X_test.shape[1]):.4f}")

Cross-Validation Error:10006.4138 R2 Score:0.9132 Adj R2 Score:0.8928
