In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error

# another way of ignoring warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
### Importing data
X = pd.read_csv("case1Data_Xnew.csv")

# The first 95 columns are treated as continuous features and every
# remaining column as categorical; the split is purely positional.
n_continuous = 95
Xcont, Xcat = X.iloc[:, :n_continuous], X.iloc[:, n_continuous:]

# Preview the first ten rows of the raw frame
X.head(10)

Unnamed: 0,x_01,x_02,x_03,x_04,x_05,x_06,x_07,x_08,x_09,x_10,...,x_91,x_92,x_93,x_94,x_95,C_01,C_02,C_03,C_04,C_05
0,-0.843969,-9.104918,-5.076919,-4.222152,3.606609,-4.505494,-11.481997,16.201722,15.93947,,...,-13.884702,7.465161,-4.667464,3.949705,-10.715577,73.0,72.0,73.0,75.0,73.0
1,0.802093,-10.196678,-4.50037,-7.827837,5.199002,,-15.928708,20.151309,13.707194,-8.517576,...,-14.937164,5.229448,-6.92797,3.271193,-12.420893,73.0,72.0,73.0,71.0,75.0
2,4.234883,-10.798261,-0.465914,-6.05485,,,-16.182312,16.419564,12.152861,-6.418069,...,-11.058964,4.692879,-0.929818,,-14.551448,,72.0,73.0,71.0,73.0
3,7.041336,-5.169413,-4.158334,-4.270638,14.939894,0.008338,-10.556799,,14.18083,,...,,7.460901,-2.484389,8.149697,-11.598544,72.0,72.0,74.0,75.0,75.0
4,1.135564,-12.048088,-4.828939,-6.565217,7.4931,-2.789944,-15.859234,21.560086,14.147759,-4.848519,...,-9.607803,5.654679,-3.020357,3.030958,-13.320599,72.0,72.0,71.0,74.0,73.0
5,2.70094,-10.946685,-6.465114,-4.881989,4.638099,-6.987986,,21.61587,19.024904,-2.53236,...,-8.42064,9.082611,-6.250724,6.314635,-9.204663,74.0,72.0,,71.0,75.0
6,5.390682,,-4.149893,-2.791737,8.812586,-3.873233,-17.031808,,11.510675,-7.52057,...,-14.226802,9.110132,-4.711318,3.122558,-10.525525,74.0,72.0,,71.0,71.0
7,2.455633,-6.247203,-1.761414,,12.467152,-1.965168,-15.475678,,,4.462509,...,-14.501091,9.869636,-7.113231,2.301097,-9.705442,71.0,72.0,72.0,,74.0
8,-4.55122,-10.161342,-5.896081,-8.75356,7.2466,-3.280027,-11.823492,22.51582,,-8.37778,...,-14.616431,5.733823,-4.996681,2.051505,-9.497427,75.0,72.0,74.0,72.0,71.0
9,,-12.226706,-5.68976,-7.563683,8.756352,-6.947685,-13.352728,18.347187,12.665331,-7.265669,...,-14.291779,4.274889,-4.248666,2.483933,-13.065352,74.0,,74.0,74.0,75.0


In [3]:
def center_dataframe(data):
    """Center `data` by subtracting its mean.

    Parameters
    ----------
    data : object supporting ``.mean()`` and subtraction
        For a pandas DataFrame, ``.mean()`` yields one mean per column,
        so each column is shifted by its own mean.

    Returns
    -------
    tuple
        ``(centered, means)``: the shifted data and the means that were
        subtracted. The input object itself is not modified.
    """
    means = data.mean()
    centered = data - means
    return centered, means

In [4]:
# --- Continuous variables: standardise, then KNN-impute ---
# NOTE(review): both transformers are fitted on this new data set itself;
# confirm that this matches how the training features were preprocessed.
standard_scaler = StandardScaler()
knn_imput = KNNImputer(n_neighbors=5)

Xcont_scaled = standard_scaler.fit_transform(Xcont)
# Wrap the imputed array back into a DataFrame carrying the original column names
Xcont_imputed = pd.DataFrame(
    knn_imput.fit_transform(Xcont_scaled),
    columns=Xcont.columns,
)

# --- Categorical variables: fill gaps with the most frequent value ---
simple_imput = SimpleImputer(strategy="most_frequent")
Xcat_imputed = pd.DataFrame(
    simple_imput.fit_transform(Xcat),
    columns=Xcat.columns,
)

# --- One-hot encoding: one 0/1 integer column per (variable, level) pair ---
Xcat_imputed1HOT = pd.get_dummies(
    Xcat_imputed,
    columns=Xcat_imputed.columns,
    drop_first=False,
).astype(int)

# --- Final feature matrix for the regression model ---
# Continuous block first, followed by the one-hot encoded categorical block.
Xfinal = pd.concat([Xcont_imputed, Xcat_imputed1HOT], axis=1)

In [5]:
# Display the assembled feature matrix (imputed continuous columns followed by the one-hot columns)
Xfinal

Unnamed: 0,x_01,x_02,x_03,x_04,x_05,x_06,x_07,x_08,x_09,x_10,...,C_04_71.0,C_04_72.0,C_04_73.0,C_04_74.0,C_04_75.0,C_05_71.0,C_05_72.0,C_05_73.0,C_05_74.0,C_05_75.0
0,-1.312986,0.304054,-0.353141,0.516533,-2.110364,0.003217,0.783808,-1.961867,0.648898,0.138268,...,0,0,0,0,1,0,0,1,0,0
1,-0.742718,-0.118817,-0.138252,-0.680070,-1.529121,-0.876841,-0.829762,-0.625927,-0.081544,-1.235373,...,1,0,0,0,0,0,0,0,0,1
2,0.446550,-0.351827,1.365447,-0.091676,-0.146191,-0.005985,-0.921787,-1.888183,-0.590151,-0.538341,...,1,0,0,0,0,0,0,1,0,0
3,1.418827,1.828389,-0.010771,0.500443,2.026424,1.517136,1.119533,0.756996,0.073438,1.028576,...,0,0,0,0,1,0,0,0,0,1
4,-0.627189,-0.835921,-0.260715,-0.261049,-0.691747,0.578605,-0.804552,-0.149411,0.062617,-0.017253,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.192307,-0.171961,-0.280824,0.257951,1.953734,-0.959454,-0.227832,0.905917,1.632096,0.425086,...,0,0,0,1,0,0,0,0,0,1
996,-1.125068,0.238058,-0.228679,-1.395794,-1.439822,-0.163711,0.160452,0.138119,-0.403542,-0.294754,...,0,0,0,1,0,0,0,0,0,1
997,0.251414,1.300081,1.356533,0.210650,1.596421,-1.382502,0.425142,1.005117,-0.124200,0.466988,...,0,0,0,0,1,0,0,0,0,1
998,-0.204275,2.215726,0.691514,1.093195,0.439034,2.203680,1.392429,-2.048087,-0.950014,0.268800,...,1,0,0,0,0,0,0,1,0,0


In [6]:
# Load the fitted model.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
Lasso_trained = joblib.load("LassoTrained_model.pkl")

# The one-hot columns in Xfinal are derived from the new data alone, so a
# category level that is absent here (or one that was never seen when the
# model was fitted) changes the column set, and predict() would then fail or
# misalign. When the model recorded the feature names it was fitted on, align
# Xfinal to exactly those columns, in that order. Absent columns are filled
# with 0 and unknown columns are dropped. When the columns already match,
# this is a no-op.
expected_columns = getattr(Lasso_trained, "feature_names_in_", None)
if expected_columns is None:
    # Model carries no feature names: fall back to the frame as built.
    X_for_prediction = Xfinal
else:
    expected_set = set(expected_columns)
    missing_columns = [col for col in expected_columns if col not in Xfinal.columns]
    extra_columns = [col for col in Xfinal.columns if col not in expected_set]
    if missing_columns or extra_columns:
        # Report the adjustment so a genuine schema mismatch is not hidden.
        print(
            "Aligning features to the trained model: "
            f"filling {missing_columns} with 0, dropping {extra_columns}"
        )
    X_for_prediction = Xfinal.reindex(columns=expected_columns, fill_value=0)

# Make predictions
y_pred1 = Lasso_trained.predict(X_for_prediction)

# Collect the predictions in a single-column DataFrame for export
y_pred_df = pd.DataFrame({"y_pred1": y_pred1})

In [7]:
# Load the original training data: the first column is taken as the target,
# all remaining columns as the features.
data = pd.read_csv('Xytrain_regression.csv')
X_train = data.iloc[:, 1:]
Y_train = data.iloc[:, 0]

# Estimate RMSE with shuffled 10-fold cross-validation (fixed seed)
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores1 = cross_val_score(
    Lasso_trained,
    X_train,
    Y_train,
    scoring='neg_mean_squared_error',
    cv=kf,
)
# The scores are negated MSEs, so negate their mean before taking the square root
rmse_estimate1 = np.sqrt(-cv_scores1.mean())

# Save the predictions (values only: no index column, no header row)
y_pred_df.to_csv('predictions_s243598_s232958.csv', index=False, header=False)

# Save the estimated RMSE as the sole content of its own file
with open('estimatedRMSE_s243598_s232958.csv', 'w') as rmse_file:
    rmse_file.write(f"{rmse_estimate1}")

print(f"Estimated RMSE for model 1: {rmse_estimate1}")

Estimated RMSE for model 1: 29.665189996848447
