In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt

#A2:
def calculate_regression_metrics(y_true_train, y_pred_train, y_true_test, y_pred_test):
    
    mse_train = mean_squared_error(y_true_train, y_pred_train)
    mse_test = mean_squared_error(y_true_test, y_pred_test)

    rmse_train = np.sqrt(mse_train)
    rmse_test = np.sqrt(mse_test)

    mape_train = mean_absolute_percentage_error(y_true_train, y_pred_train)
    mape_test = mean_absolute_percentage_error(y_true_test, y_pred_test)

    r2_train = r2_score(y_true_train, y_pred_train)
    r2_test = r2_score(y_true_test, y_pred_test)


    return {
        'mse_train': mse_train,
        'mse_test': mse_test,
        'rmse_train': rmse_train,
        'rmse_test': rmse_test,
        'mape_train': mape_train,
        'mape_test': mape_test,
        'r2_train': r2_train,
        'r2_test': r2_test
    }


data=pd.read_csv('DCT_withoutduplicate 1.csv')
y_target = data['0']
X_features = data.drop(['0', 'LABEL'], axis=1)  # Use other features except the target and LABEL

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=42)

# Create and train the Linear Regression model
reg = LinearRegression().fit(X_train, y_train)

# Predict on training and test data
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)


metrics = calculate_regression_metrics(y_train, y_train_pred, y_test, y_test_pred)

print("Training MSE:", metrics['mse_train'])
print("Test MSE:", metrics['mse_test'])
print("Training RMSE:", metrics['rmse_train'])
print("Test RMSE:", metrics['rmse_test'])
print("Training MAPE:", metrics['mape_train'])
print("Test MAPE:", metrics['mape_test'])
print("Training R2:", metrics['r2_train'])
print("Test R2:", metrics['r2_test'])

#A4:
X_clustering = data.drop(['0', 'LABEL'], axis=1)
kmeans = KMeans(n_clusters=2, random_state=42, n_init="auto").fit(X_clustering)

labels = kmeans.labels_
centers = kmeans.cluster_centers_

# Print cluster centers
print("Cluster Centers:", centers)

#A5:
silhouette = silhouette_score(X_train, kmeans.labels_)
ch_score = calinski_harabasz_score(X_train, kmeans.labels_)
db_index = davies_bouldin_score(X_train, kmeans.labels_)

# Print the scores
print("Silhouette Score:", silhouette)
print("Calinski-Harabasz Score:", ch_score)
print("Davies-Bouldin Index:", db_index)




Training MSE: 1762247.968776816
Test MSE: 23175093.438325547
Training RMSE: 1327.4968808915582
Test RMSE: 4814.05166552308
Training MAPE: 7.044027690047087e+16
Test MAPE: 1.127044430407534e+18
Training R2: 0.8976196817858509
Test R2: 0.20454218492521592
Cluster Centers: [[ 1.57246670e+01  7.32773995e+01  2.06867985e+03  7.97787526e+01
   2.45891440e+03 -5.63332318e+01 -5.18085212e+01  2.62667160e+01
  -1.13489621e+02 -1.57074493e+02 -1.29778592e+02 -2.25483117e+03
  -1.69936116e+02 -9.59643998e+02  7.98568644e+01 -4.47518377e+01
  -4.32408023e+01  6.25270247e+01 -9.60547189e+01  1.31552109e+01
  -5.66912232e+02  3.89302115e+01  3.45328562e+01  2.88714689e+02
   7.55182496e+02  1.31766800e+02 -1.56896338e+02 -4.26480021e+01
   9.22007957e+01  1.55628143e+01  1.19795237e+02  2.13901637e+02
   3.17606540e+01  7.48629745e+01  5.08425768e+01 -2.47814441e+01
   4.79487216e+01  6.33612670e+02 -7.04073576e+01  1.84754713e+02
  -2.09740929e+02  2.56620607e+02 -1.50920234e+01  3.89023809e+02
  -

ValueError: Found input variables with inconsistent numbers of samples: [1800, 2250]