In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

file_path = '/content/drive/MyDrive/weather.csv'
df = pd.read_csv(file_path)

# 1. 독립변수/종속변수 분리
X = df.drop("RainTomorrow", axis=1)
y = df["RainTomorrow"]

# 2. 범주형 변수 처리 (Label Encoding)
for col in X.select_dtypes(include=["object"]).columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

y = LabelEncoder().fit_transform(y.astype(str))  # Yes/No → 1/0

# 3. 결측치 처리 (평균 대체)
imputer = SimpleImputer(strategy="mean")
X = imputer.fit_transform(X)

# 4. 표준화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5. train/test 분리
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# 6. K-NN 모델 학습
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# 7. 예측 및 평가
y_pred = knn.predict(X_test)

print("✅ 혼동행렬")
print(confusion_matrix(y_test, y_pred))

print("\n✅ 분류 리포트")
print(classification_report(y_test, y_pred))




✅ 혼동행렬
[[56  2]
 [10  6]]

✅ 분류 리포트
              precision    recall  f1-score   support

           0       0.85      0.97      0.90        58
           1       0.75      0.38      0.50        16

    accuracy                           0.84        74
   macro avg       0.80      0.67      0.70        74
weighted avg       0.83      0.84      0.82        74



In [13]:
from sklearn.metrics import accuracy_score

# Explore different k values
k_values = range(1, 21)
accuracy_scores = []

for k in k_values:
    # Train KNN model with current k
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

# Display the accuracy scores for each k
for k, accuracy in zip(k_values, accuracy_scores):
    print(f"k = {k}: Accuracy = {accuracy:.4f}")

k = 1: Accuracy = 0.7838
k = 2: Accuracy = 0.8108
k = 3: Accuracy = 0.8108
k = 4: Accuracy = 0.8378
k = 5: Accuracy = 0.8378
k = 6: Accuracy = 0.8378
k = 7: Accuracy = 0.8378
k = 8: Accuracy = 0.8378
k = 9: Accuracy = 0.8514
k = 10: Accuracy = 0.8514
k = 11: Accuracy = 0.8649
k = 12: Accuracy = 0.8378
k = 13: Accuracy = 0.8514
k = 14: Accuracy = 0.8378
k = 15: Accuracy = 0.8378
k = 16: Accuracy = 0.8378
k = 17: Accuracy = 0.8378
k = 18: Accuracy = 0.8108
k = 19: Accuracy = 0.8243
k = 20: Accuracy = 0.8243


In [14]:
# Train KNN model with the best k (k=11)
best_k = 11
knn_best = KNeighborsClassifier(n_neighbors=best_k)
knn_best.fit(X_train, y_train)

# Predict and evaluate
y_pred_best = knn_best.predict(X_test)

print(f"✅ 혼동행렬 (k={best_k})")
print(confusion_matrix(y_test, y_pred_best))

print(f"\n✅ 분류 리포트 (k={best_k})")
print(classification_report(y_test, y_pred_best))

✅ 혼동행렬 (k=11)
[[58  0]
 [10  6]]

✅ 분류 리포트 (k=11)
              precision    recall  f1-score   support

           0       0.85      1.00      0.92        58
           1       1.00      0.38      0.55        16

    accuracy                           0.86        74
   macro avg       0.93      0.69      0.73        74
weighted avg       0.88      0.86      0.84        74



In [16]:
file_path = '/content/drive/MyDrive/Raisin_Dataset.xlsx'

# Task
Perform k-NN analysis on the data in "/content/drive/MyDrive/Raisin_Dataset.xlsx" using Python.

## Load data

### Subtask:
Load the data from the specified Excel file (`/content/drive/MyDrive/Raisin_Dataset.xlsx`) into a pandas DataFrame.


**Reasoning**:
Import the pandas library and load the Excel file into a DataFrame.



In [18]:
df_raisin = pd.read_excel(file_path)

## Explore data

### Subtask:
Briefly examine the data to understand its structure, columns, and data types. Identify the target variable and features.


**Reasoning**:
To understand the structure, columns, and data types of the DataFrame, I will display the first few rows, print the column information, and get a summary of the DataFrame.



In [19]:
display(df_raisin.head())
df_raisin.info()
display(df_raisin.describe())

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,Kecimen
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,Kecimen
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,Kecimen
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,Kecimen
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,Kecimen


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             900 non-null    int64  
 1   MajorAxisLength  900 non-null    float64
 2   MinorAxisLength  900 non-null    float64
 3   Eccentricity     900 non-null    float64
 4   ConvexArea       900 non-null    int64  
 5   Extent           900 non-null    float64
 6   Perimeter        900 non-null    float64
 7   Class            900 non-null    object 
dtypes: float64(5), int64(2), object(1)
memory usage: 56.4+ KB


Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter
count,900.0,900.0,900.0,900.0,900.0,900.0,900.0
mean,87804.127778,430.92995,254.488133,0.781542,91186.09,0.699508,1165.906636
std,39002.11139,116.035121,49.988902,0.090318,40769.290132,0.053468,273.764315
min,25387.0,225.629541,143.710872,0.34873,26139.0,0.379856,619.074
25%,59348.0,345.442898,219.111126,0.741766,61513.25,0.670869,966.41075
50%,78902.0,407.803951,247.848409,0.798846,81651.0,0.707367,1119.509
75%,105028.25,494.187014,279.888575,0.842571,108375.75,0.734991,1308.38975
max,235047.0,997.291941,492.275279,0.962124,278217.0,0.835455,2697.753


## Preprocess data

### Subtask:
Handle missing values, encode categorical variables (if any), and scale numerical features.


**Reasoning**:
Separate features and target, encode the target variable, and scale the numerical features as per the instructions.



In [20]:
# 1. Separate features (X) and target (y)
X_raisin = df_raisin.drop("Class", axis=1)
y_raisin = df_raisin["Class"]

# 2. Encode the categorical target variable
label_encoder_raisin = LabelEncoder()
y_raisin_encoded = label_encoder_raisin.fit_transform(y_raisin)

# 3. Identify numerical features and scale them
numerical_features = X_raisin.select_dtypes(include=np.number).columns
scaler_raisin = StandardScaler()
X_raisin_scaled = scaler_raisin.fit_transform(X_raisin[numerical_features])

## Split data

### Subtask:
Split the preprocessed data into training and testing sets.


**Reasoning**:
Split the preprocessed data into training and testing sets using train_test_split with a test size of 0.2 and a random state for reproducibility.



In [21]:
X_train_raisin, X_test_raisin, y_train_raisin, y_test_raisin = train_test_split(
    X_raisin_scaled, y_raisin_encoded, test_size=0.2, random_state=42
)

## Train knn model

### Subtask:
Train a K-NN model on the training data.


**Reasoning**:
Instantiate and train a K-NN model on the training data.



In [22]:
# Instantiate a KNeighborsClassifier object
knn_raisin = KNeighborsClassifier(n_neighbors=5)

# Fit the K-NN model to the training data
knn_raisin.fit(X_train_raisin, y_train_raisin)

## Evaluate model

### Subtask:
Evaluate the performance of the trained model on the test data using appropriate metrics (e.g., confusion matrix, classification report).


**Reasoning**:
Evaluate the performance of the trained model on the test data using appropriate metrics.



In [23]:
# 1. Make predictions on the test data
y_pred_raisin = knn_raisin.predict(X_test_raisin)

# 2. Generate and print the confusion matrix
print("✅ 혼동행렬")
print(confusion_matrix(y_test_raisin, y_pred_raisin))

# 3. Generate and print the classification report
print("\n✅ 분류 리포트")
print(classification_report(y_test_raisin, y_pred_raisin))

✅ 혼동행렬
[[71 15]
 [16 78]]

✅ 분류 리포트
              precision    recall  f1-score   support

           0       0.82      0.83      0.82        86
           1       0.84      0.83      0.83        94

    accuracy                           0.83       180
   macro avg       0.83      0.83      0.83       180
weighted avg       0.83      0.83      0.83       180



## Find optimal k (optional but recommended)

### Subtask:
Explore different k values to find the optimal one for this dataset.


**Reasoning**:
Iterate through different k values, train the KNN model for each k, make predictions, calculate accuracy, and store the accuracy scores.



In [24]:
# 1. Create a list of k values to explore
k_values_raisin = range(1, 21)

# 2. Initialize an empty list to store accuracy scores for each k
accuracy_scores_raisin = []

# 3. Iterate through the list of k values
for k in k_values_raisin:
    # 4. Inside the loop, train a KNeighborsClassifier model for the current k value
    knn_raisin_k = KNeighborsClassifier(n_neighbors=k)
    knn_raisin_k.fit(X_train_raisin, y_train_raisin)

    # 5. Make predictions on the test data
    y_pred_raisin_k = knn_raisin_k.predict(X_test_raisin)

    # 6. Calculate the accuracy score of the predictions
    accuracy_raisin_k = accuracy_score(y_test_raisin, y_pred_raisin_k)

    # 7. Append the calculated accuracy score to the list of accuracy scores
    accuracy_scores_raisin.append(accuracy_raisin_k)

# 8. After the loop, print the accuracy scores for each k value
for k, accuracy in zip(k_values_raisin, accuracy_scores_raisin):
    print(f"k = {k}: Accuracy = {accuracy:.4f}")

k = 1: Accuracy = 0.8222
k = 2: Accuracy = 0.8000
k = 3: Accuracy = 0.8333
k = 4: Accuracy = 0.8000
k = 5: Accuracy = 0.8278
k = 6: Accuracy = 0.8222
k = 7: Accuracy = 0.8444
k = 8: Accuracy = 0.8222
k = 9: Accuracy = 0.8500
k = 10: Accuracy = 0.8500
k = 11: Accuracy = 0.8444
k = 12: Accuracy = 0.8500
k = 13: Accuracy = 0.8389
k = 14: Accuracy = 0.8444
k = 15: Accuracy = 0.8389
k = 16: Accuracy = 0.8444
k = 17: Accuracy = 0.8444
k = 18: Accuracy = 0.8500
k = 19: Accuracy = 0.8500
k = 20: Accuracy = 0.8500


## Train final model (with optimal k)

### Subtask:
Train the K-NN model again using the optimal k value found in the previous step.


**Reasoning**:
Instantiate and train a new K-NN model with the optimal k value (k=9).



In [25]:
# Instantiate a new KNeighborsClassifier object with the optimal k value (k=9)
knn_optimal_raisin = KNeighborsClassifier(n_neighbors=9)

# Train this new K-NN model using the entire training dataset
knn_optimal_raisin.fit(X_train_raisin, y_train_raisin)

## Evaluate final model

### Subtask:
Evaluate the performance of the final model.


**Reasoning**:
Evaluate the performance of the final model by making predictions on the test data and printing the confusion matrix and classification report.



In [26]:
# 1. Make predictions on the test data using the knn_optimal_raisin model.
y_pred_optimal_raisin = knn_optimal_raisin.predict(X_test_raisin)

# 2. Calculate and print the confusion matrix using y_test_raisin and the predictions.
print("✅ 혼동행렬 (Optimal k=9)")
print(confusion_matrix(y_test_raisin, y_pred_optimal_raisin))

# 3. Calculate and print the classification report using y_test_raisin and the predictions.
print("\n✅ 분류 리포트 (Optimal k=9)")
print(classification_report(y_test_raisin, y_pred_optimal_raisin))

✅ 혼동행렬 (Optimal k=9)
[[71 15]
 [12 82]]

✅ 분류 리포트 (Optimal k=9)
              precision    recall  f1-score   support

           0       0.86      0.83      0.84        86
           1       0.85      0.87      0.86        94

    accuracy                           0.85       180
   macro avg       0.85      0.85      0.85       180
weighted avg       0.85      0.85      0.85       180



## 데이터 로드

### 하위 작업:
지정된 엑셀 파일(`/content/drive/MyDrive/Raisin_Dataset.xlsx`)에서 데이터를 pandas DataFrame으로 로드합니다.

**설명**:
pandas 라이브러리를 가져와 엑셀 파일을 DataFrame으로 로드합니다.

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

file_path_raisin = '/content/drive/MyDrive/Raisin_Dataset.xlsx'
df_raisin = pd.read_excel(file_path_raisin)

## 데이터 탐색

### 하위 작업:
데이터의 구조, 열, 데이터 유형을 간략하게 살펴봅니다. 목표 변수와 특성을 식별합니다.

**설명**:
DataFrame의 구조, 열 이름, 데이터 유형을 이해하기 위해 처음 몇 행을 표시하고, 열 정보를 출력하고, DataFrame의 요약을 가져옵니다.

In [28]:
display(df_raisin.head())
df_raisin.info()
display(df_raisin.describe())

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,Kecimen
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,Kecimen
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,Kecimen
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,Kecimen
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,Kecimen


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             900 non-null    int64  
 1   MajorAxisLength  900 non-null    float64
 2   MinorAxisLength  900 non-null    float64
 3   Eccentricity     900 non-null    float64
 4   ConvexArea       900 non-null    int64  
 5   Extent           900 non-null    float64
 6   Perimeter        900 non-null    float64
 7   Class            900 non-null    object 
dtypes: float64(5), int64(2), object(1)
memory usage: 56.4+ KB


Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter
count,900.0,900.0,900.0,900.0,900.0,900.0,900.0
mean,87804.127778,430.92995,254.488133,0.781542,91186.09,0.699508,1165.906636
std,39002.11139,116.035121,49.988902,0.090318,40769.290132,0.053468,273.764315
min,25387.0,225.629541,143.710872,0.34873,26139.0,0.379856,619.074
25%,59348.0,345.442898,219.111126,0.741766,61513.25,0.670869,966.41075
50%,78902.0,407.803951,247.848409,0.798846,81651.0,0.707367,1119.509
75%,105028.25,494.187014,279.888575,0.842571,108375.75,0.734991,1308.38975
max,235047.0,997.291941,492.275279,0.962124,278217.0,0.835455,2697.753


## 데이터 전처리

### 하위 작업:
결측값을 처리하고, 범주형 변수(있는 경우)를 인코딩하고, 숫자 특성을 스케일링합니다.

**설명**:
특성과 목표 변수를 분리하고, 목표 변수를 인코딩하고, 지침에 따라 숫자 특성을 스케일링합니다.

In [30]:
# 1. 특성(X)과 목표 변수(y) 분리
X_raisin = df_raisin.drop("Class", axis=1)
y_raisin = df_raisin["Class"]

# 2. 범주형 목표 변수 인코딩
label_encoder_raisin = LabelEncoder()
y_raisin_encoded = label_encoder_raisin.fit_transform(y_raisin)

# 3. 숫자 특성 식별 및 스케일링
# 이 데이터셋에는 결측치가 없으므로 Imputer 단계는 생략합니다.
numerical_features_raisin = X_raisin.select_dtypes(include=np.number).columns
scaler_raisin = StandardScaler()
X_raisin_scaled = scaler_raisin.fit_transform(X_raisin[numerical_features_raisin])

## 데이터 분할

### 하위 작업:
전처리된 데이터를 훈련 세트와 테스트 세트로 분할합니다.

**설명**:
재현성을 위해 테스트 크기 0.2와 random state를 사용하여 전처리된 데이터를 훈련 세트와 테스트 세트로 분할합니다.

In [31]:
X_train_raisin, X_test_raisin, y_train_raisin, y_test_raisin = train_test_split(
    X_raisin_scaled, y_raisin_encoded, test_size=0.2, random_state=42
)

## KNN 모델 학습 (초기 k=5)

### 하위 작업:
훈련 데이터로 K-NN 모델을 학습시킵니다.

**설명**:
K-NN 모델 객체를 인스턴스화하고 훈련 데이터에 모델을 학습시킵니다. 초기 k 값은 5로 설정합니다.

In [32]:
# KNeighborsClassifier 객체 인스턴스화
knn_raisin = KNeighborsClassifier(n_neighbors=5)

# 훈련 데이터에 K-NN 모델 학습
knn_raisin.fit(X_train_raisin, y_train_raisin)

## 모델 평가 (초기 k=5)

### 하위 작업:
적절한 지표(예: 혼동 행렬, 분류 리포트)를 사용하여 테스트 데이터에서 학습된 모델의 성능을 평가합니다.

**설명**:
테스트 데이터에서 학습된 모델의 성능을 적절한 지표를 사용하여 평가합니다.

In [33]:
# 1. 테스트 데이터에 대한 예측 수행
y_pred_raisin = knn_raisin.predict(X_test_raisin)

# 2. 혼동 행렬 생성 및 출력
print("✅ 혼동행렬 (초기 k=5)")
print(confusion_matrix(y_test_raisin, y_pred_raisin))

# 3. 분류 리포트 생성 및 출력
print("\n✅ 분류 리포트 (초기 k=5)")
print(classification_report(y_test_raisin, y_pred_raisin))

✅ 혼동행렬 (초기 k=5)
[[71 15]
 [16 78]]

✅ 분류 리포트 (초기 k=5)
              precision    recall  f1-score   support

           0       0.82      0.83      0.82        86
           1       0.84      0.83      0.83        94

    accuracy                           0.83       180
   macro avg       0.83      0.83      0.83       180
weighted avg       0.83      0.83      0.83       180



## 최적의 k 찾기

### 하위 작업:
이 데이터 세트에 대한 최적의 k 값을 찾기 위해 다양한 k 값을 탐색합니다.

**설명**:
다양한 k 값을 반복하여 각 k에 대한 KNN 모델을 학습시키고, 예측을 수행하고, 정확도를 계산하고, 정확도 점수를 저장합니다.

In [34]:
# 1. 탐색할 k 값 목록 생성
k_values_raisin = range(1, 21)

# 2. 각 k에 대한 정확도 점수를 저장할 빈 목록 초기화
accuracy_scores_raisin = []

# 3. k 값 목록 반복
for k in k_values_raisin:
    # 4. 루프 내에서 현재 k 값에 대한 KNeighborsClassifier 모델 학습
    knn_raisin_k = KNeighborsClassifier(n_neighbors=k)
    knn_raisin_k.fit(X_train_raisin, y_train_raisin)

    # 5. 테스트 데이터에 대한 예측 수행
    y_pred_raisin_k = knn_raisin_k.predict(X_test_raisin)

    # 6. 예측 정확도 점수 계산
    accuracy_raisin_k = accuracy_score(y_test_raisin, y_pred_raisin_k)

    # 7. 계산된 정확도 점수를 정확도 점수 목록에 추가
    accuracy_scores_raisin.append(accuracy_raisin_k)

# 8. 루프 후 각 k 값에 대한 정확도 점수 출력
for k, accuracy in zip(k_values_raisin, accuracy_scores_raisin):
    print(f"k = {k}: Accuracy = {accuracy:.4f}")

k = 1: Accuracy = 0.8222
k = 2: Accuracy = 0.8000
k = 3: Accuracy = 0.8333
k = 4: Accuracy = 0.8000
k = 5: Accuracy = 0.8278
k = 6: Accuracy = 0.8222
k = 7: Accuracy = 0.8444
k = 8: Accuracy = 0.8222
k = 9: Accuracy = 0.8500
k = 10: Accuracy = 0.8500
k = 11: Accuracy = 0.8444
k = 12: Accuracy = 0.8500
k = 13: Accuracy = 0.8389
k = 14: Accuracy = 0.8444
k = 15: Accuracy = 0.8389
k = 16: Accuracy = 0.8444
k = 17: Accuracy = 0.8444
k = 18: Accuracy = 0.8500
k = 19: Accuracy = 0.8500
k = 20: Accuracy = 0.8500


## 최적의 k로 최종 모델 학습

### 하위 작업:
이전 단계에서 찾은 최적의 k 값을 사용하여 K-NN 모델을 다시 학습시킵니다.

**설명**:
최적의 k 값으로 새로운 K-NN 모델을 인스턴스화하고 학습시킵니다. 위의 결과에서 정확도가 가장 높은 k 값을 선택합니다. 예를 들어 k=9가 가장 높다면 9를 사용합니다.

In [35]:
# 최적의 k 값을 가진 새로운 KNeighborsClassifier 객체 인스턴스화 (예: k=9)
# NOTE: 이전 단계에서 찾은 실제 최적 k 값으로 9를 대체하세요.
best_k_raisin = 9 # 예시 값, 이전 단계 결과에 따라 변경
knn_optimal_raisin = KNeighborsClassifier(n_neighbors=best_k_raisin)

# 이 새로운 K-NN 모델을 전체 훈련 데이터셋으로 학습
knn_optimal_raisin.fit(X_train_raisin, y_train_raisin)

## 최종 모델 평가

### 하위 작업:
최종 모델의 성능을 평가합니다.

**설명**:
테스트 데이터에 대한 예측을 수행하고 혼동 행렬과 분류 리포트를 출력하여 최종 모델의 성능을 평가합니다.

In [36]:
# 1. knn_optimal_raisin 모델을 사용하여 테스트 데이터에 대한 예측 수행.
y_pred_optimal_raisin = knn_optimal_raisin.predict(X_test_raisin)

# 2. y_test_raisin 및 예측을 사용하여 혼동 행렬 계산 및 출력.
print(f"✅ 혼동행렬 (최적 k={best_k_raisin})")
print(confusion_matrix(y_test_raisin, y_pred_optimal_raisin))

# 3. y_test_raisin 및 예측을 사용하여 분류 리포트 계산 및 출력.
print(f"\n✅ 분류 리포트 (최적 k={best_k_raisin})")
print(classification_report(y_test_raisin, y_pred_optimal_raisin))

✅ 혼동행렬 (최적 k=9)
[[71 15]
 [12 82]]

✅ 분류 리포트 (최적 k=9)
              precision    recall  f1-score   support

           0       0.86      0.83      0.84        86
           1       0.85      0.87      0.86        94

    accuracy                           0.85       180
   macro avg       0.85      0.85      0.85       180
weighted avg       0.85      0.85      0.85       180

