<a href="https://colab.research.google.com/github/redroced/SAI-ai-/blob/main/coms4771_spring_2022_regression_competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 3단원 캐글 문제 (Lasso 포함)

라이브러리

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

데이터 불러오기

In [None]:
# Mount Google Drive so the competition CSVs stored there can be read.
from google.colab import drive

drive.mount('/content/drive')

# Drive folder that holds the train/test CSV files.
file_path = '/content/drive/MyDrive/학교/세종대/동아리/SAI/2025 1학기/3단원/'

train_X = pd.read_csv(file_path + 'train_examples.csv')
test_X = pd.read_csv(file_path + 'test_examples.csv')

# Load the target as a 1-D Series. `.squeeze()` only collapses a
# single-column frame; if the label file carries any extra column the result
# stays a DataFrame and every column would be fitted as a regression target.
# Select the `duration` column explicitly when it exists and fall back to the
# original squeeze otherwise.
train_labels = pd.read_csv(file_path + 'train_labels.csv')
if 'duration' in train_labels.columns:
    train_y = train_labels['duration']
else:
    train_y = train_labels.squeeze()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


전처리

In [None]:
# Make sure the target is a 1-D Series. A multi-column label frame is not
# collapsed by `.squeeze()`, so pick the `duration` column when it is present.
if isinstance(train_y, pd.DataFrame):
    if 'duration' in train_y.columns:
        train_y = train_y['duration']
    else:
        train_y = train_y.squeeze()

# Build calendar features from the string (object-dtype) columns of the
# training frame; the same columns are processed in the test frame.
non_numeric_cols = train_X.select_dtypes(include=['object']).columns
print(f"시간 문자열 컬럼: {list(non_numeric_cols)}")

for col in non_numeric_cols:
    for frame in (train_X, test_X):
        # errors='coerce' turns unparseable strings into NaT, so the derived
        # columns become NaN there and are handled by the NaN steps below.
        parsed = pd.to_datetime(frame[col], errors='coerce')
        frame[f'{col}_month'] = parsed.dt.month
        frame[f'{col}_day'] = parsed.dt.day
        frame[f'{col}_hour'] = parsed.dt.hour

# Drop the original string columns now that the numeric parts are extracted.
train_X = train_X.drop(columns=non_numeric_cols)
test_X = test_X.drop(columns=non_numeric_cols)

# Drop columns whose share of missing values in the training data exceeds
# the threshold (30%).
NAN_DROP_THRESHOLD = 0.3
nan_ratio = train_X.isnull().mean()
high_nan_cols = nan_ratio[nan_ratio > NAN_DROP_THRESHOLD].index.tolist()

train_X = train_X.drop(columns=high_nan_cols)
test_X = test_X.drop(columns=high_nan_cols)

# Impute the remaining NaNs with medians learned on the TRAINING data only and
# apply those same values to the test data, so both frames go through an
# identical transformation (the original filled test NaNs with test medians).
train_median = train_X.median()
train_X = train_X.fillna(train_median)
test_X = test_X.fillna(train_median)

# Add degree-2 polynomial / interaction features (fit on train, reuse on test).
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_train_X = poly.fit_transform(train_X)
poly_test_X = poly.transform(test_X)

# Standardise features (fit on train, reuse on test).
scaler = StandardScaler()
scaled_train_X = scaler.fit_transform(poly_train_X)
scaled_test_X = scaler.transform(poly_test_X)

시간 문자열 컬럼: []


데이터 분할

In [None]:
# Hold out 20% of the scaled training matrix for validation; the fixed
# random_state keeps the split reproducible across runs.
split_parts = train_test_split(
    scaled_train_X,
    train_y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts

하이퍼파라미터 튜닝

In [None]:
# 20 candidate regularisation strengths, log-spaced from 1e-4 to 1e4.
alphas = np.logspace(start=-4, stop=4, num=20)

릿지

In [None]:
# Regularisation strengths to sweep for Ridge.
alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]

# Scores returned by `.score()` on the training and validation splits,
# one entry per alpha (same order as alpha_list).
ridge_train_score, ridge_val_score = [], []

for alpha in alpha_list:
    # fit() returns the estimator itself, so construction and fitting chain.
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    ridge_train_score += [ridge.score(X_train, y_train)]
    ridge_val_score += [ridge.score(X_val, y_val)]

# Collect the sweep into one table for side-by-side inspection.
ridge_df = pd.DataFrame(dict(
    alpha=alpha_list,
    train_score=ridge_train_score,
    val_score=ridge_val_score,
))

print("Ridge 성능")
print(ridge_df)

Ridge 성능
     alpha  train_score    val_score
0    0.001     0.512299   -26.052226
1    0.010     0.512291 -2382.117832
2    0.100     0.512250  -849.961311
3    1.000     0.512158   -23.977983
4   10.000     0.512116     0.491222
5  100.000     0.512042     0.005529


In [None]:
# 3-fold cross-validated search over alpha_list, scored by negative MSE.
ridge_grid = GridSearchCV(
    Ridge(),
    param_grid={'alpha': alpha_list},
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1
)
ridge_grid.fit(X_train, y_train)

# RMSE of the refitted best model on the held-out validation split.
ridge_val_pred = ridge_grid.predict(X_val)
ridge_rmse = np.sqrt(mean_squared_error(y_val, ridge_val_pred))

best_ridge = ridge_grid.best_estimator_

# Predict on the PREPROCESSED test matrix. The model was trained on
# polynomial + standardised features, so feeding it the raw CSV (which still
# contains the datetime strings) fails with
# "could not convert string to float". The raw re-read is therefore removed.
test_predictions = best_ridge.predict(scaled_test_X)
test_predictions = test_predictions.ravel()
submission = pd.DataFrame({
    'id': np.arange(len(test_predictions)),
    'duration': test_predictions
})

# Save next to the input data and report the file name that was actually
# written (the old message said 'submission.csv').
submission_name = 'submission1.csv'
submission.to_csv(file_path + submission_name, index=False)
print(f"'{submission_name}' 저장 완료!")



ValueError: could not convert string to float: '02-26 22:37:30'

라쏘

In [None]:
# Scores returned by `.score()` on the training and validation splits,
# one entry per alpha (same order as alpha_list).
lasso_train_score, lasso_val_score = [], []

for alpha in alpha_list:
    # A high max_iter gives the solver room to converge; fit() returns the
    # estimator itself, so construction and fitting chain.
    lasso = Lasso(alpha=alpha, max_iter=10000).fit(X_train, y_train)
    lasso_train_score += [lasso.score(X_train, y_train)]
    lasso_val_score += [lasso.score(X_val, y_val)]

# Collect the sweep into one table for side-by-side inspection.
lasso_df = pd.DataFrame(dict(
    alpha=alpha_list,
    train_score=lasso_train_score,
    val_score=lasso_val_score,
))

print("Lasso 성능")
print(lasso_df)

KeyboardInterrupt: 

성능비교

In [None]:
# Line up the validation scores of both models for each alpha.
comparison_columns = dict(
    alpha=alpha_list,
    ridge_val_score=ridge_val_score,
    lasso_val_score=lasso_val_score,
)
comparison_df = pd.DataFrame(comparison_columns)

print("Validation 성능 비교 (Ridge vs Lasso)")
print(comparison_df)

In [None]:
# Cross-validated Lasso search. `lasso_grid` was referenced below without ever
# being created, so it is defined here with the same grid, scoring and CV
# settings as `ridge_grid`, keeping the two models comparable.
lasso_grid = GridSearchCV(
    Lasso(max_iter=10000),
    param_grid={'alpha': alpha_list},
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1
)
lasso_grid.fit(X_train, y_train)

# Predictions of each tuned model on the held-out validation split.
ridge_val_pred = ridge_grid.predict(X_val)
lasso_val_pred = lasso_grid.predict(X_val)

# Compare the models by validation RMSE (lower is better).
ridge_rmse = np.sqrt(mean_squared_error(y_val, ridge_val_pred))
lasso_rmse = np.sqrt(mean_squared_error(y_val, lasso_val_pred))

print(f"Validation RMSE (Ridge): {ridge_rmse:.4f}")
print(f"Validation RMSE (Lasso): {lasso_rmse:.4f}")

# Pick the model with the lower validation RMSE; ties go to Ridge.
ridge_wins = ridge_rmse <= lasso_rmse
final_model = ridge_grid.best_estimator_ if ridge_wins else lasso_grid.best_estimator_
final_model_name = "Ridge" if ridge_wins else "Lasso"

print(f"Selected Final Model: {final_model_name}")

# Predict on the preprocessed (polynomial + scaled) test matrix.
final_predictions = final_model.predict(scaled_test_X)

# Show the predictions.
print("Test Set Predictions:")
print(final_predictions)

파일 저장

In [None]:
# Build the submission table from the final model's test predictions. The
# value column is named 'duration' to match the target column and the other
# submission written by this notebook.
submission = pd.DataFrame({
    'id': np.arange(len(final_predictions)),
    'duration': final_predictions
})
# Write into the Drive folder the data was loaded from; the previous
# '/mnt/data/' target is not a directory this notebook mounts or creates.
submission.to_csv(file_path + 'submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")

### 릿지코드

In [None]:
# 1. Imports (repeated so this section can be run on its own).
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# 2. Load the data from Google Drive.
from google.colab import drive

drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/학교/세종대/동아리/SAI/2025 1학기/3단원/'

train_X = pd.read_csv(file_path + 'train_examples.csv')
train_y = pd.read_csv(file_path + 'train_labels.csv')
train_y = train_y['duration']  # keep only the target column as a 1-D Series
test_X = pd.read_csv(file_path + 'test_examples.csv')

# 3. Keep the numeric columns present in BOTH frames, in sorted order, so
#    train and test share identical feature columns.
train_numeric_cols = train_X.select_dtypes(include=[np.number]).columns
test_numeric_cols = test_X.select_dtypes(include=[np.number]).columns
common_cols = sorted(set(train_numeric_cols).intersection(set(test_numeric_cols)))

train_X_fixed = train_X[common_cols]
test_X_fixed = test_X[common_cols]

# 4. Standardise (fit on train, reuse on test).
scaler = StandardScaler()
scaled_train = scaler.fit_transform(train_X_fixed)
scaled_test = scaler.transform(test_X_fixed)

# 5. Train Ridge (alpha=0.001) and predict on the test set.
ridge = Ridge(alpha=0.001)
ridge.fit(scaled_train, train_y)
test_predictions = ridge.predict(scaled_test).ravel()

# 6. Build the submission from THIS model's predictions. This step was
#    missing, so the save below wrote whatever `submission` an earlier cell
#    had left behind (or raised NameError on a fresh kernel).
submission = pd.DataFrame({
    'id': np.arange(len(test_predictions)),
    'duration': test_predictions
})

# 7. Save.
submission.to_csv(file_path + 'submission_ridge_fixed.csv', index=False)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Diagnostic cell: print the array shape at each stage of the Ridge pipeline.
print("🟢 Step-by-step Shape 확인")
print("train_X_fixed.shape:", train_X_fixed.shape)
print("test_X_fixed.shape :", test_X_fixed.shape)

scaled_train = scaler.fit_transform(train_X_fixed)
scaled_test = scaler.transform(test_X_fixed)

print("scaled_train.shape:", scaled_train.shape)
print("scaled_test.shape :", scaled_test.shape)

# Fit on a 1-D target. If `train_y` is still a multi-column DataFrame, Ridge
# fits one output per column and predict() returns a 2-D array; ravel() then
# interleaves the columns into a vector longer than the test set. Use only
# the `duration` column in that case.
if isinstance(train_y, pd.DataFrame) and 'duration' in train_y.columns:
    ridge_target = train_y['duration']
else:
    ridge_target = train_y

ridge = Ridge(alpha=0.001)
ridge.fit(scaled_train, ridge_target)

test_predictions = ridge.predict(scaled_test)
print("test_predictions.shape (before ravel):", test_predictions.shape)

test_predictions = test_predictions.ravel()
print("test_predictions.shape (after ravel):", test_predictions.shape)


🟢 Step-by-step Shape 확인
train_X_fixed.shape: (400000, 11)
test_X_fixed.shape : (100000, 11)
scaled_train.shape: (400000, 11)
scaled_test.shape : (100000, 11)
test_predictions.shape (before ravel): (100000, 2)
test_predictions.shape (after ravel): (200000,)
