# 데이터 분석

In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Plot appearance: matplotlib's default style first, then seaborn's "deep" palette
plt.style.use('default')
sns.set_palette("deep")

# Read the training data from the working directory
df = pd.read_csv('train.csv')

# Structural overview: dimensions, column names, dtypes and per-column null counts.
# Each entry is a (heading, value) pair printed in order.
overview_items = [
    ("Dataset Shape:", df.shape),
    ("\nColumn Names:", df.columns.tolist()),
    ("\nData Types:\n", df.dtypes),
    ("\nMissing Values:\n", df.isnull().sum()),
]
for heading, value in overview_items:
    print(heading, value)

# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns
numerical_cols = [
    'Age',
    'Nodule_Size',
    'TSH_Result',
    'T4_Result',
    'T3_Result',
]
numerical_summary = df[numerical_cols].describe()
print("\nSummary Statistics for Numerical Columns:\n", numerical_summary)

# Frequency table for every categorical column.
# 'Cancer' is kept as the LAST entry: later code slices it off with [:-1].
categorical_cols = [
    'Gender',
    'Country',
    'Race',
    'Family_Background',
    'Radiation_History',
    'Iodine_Deficiency',
    'Smoke',
    'Weight_Risk',
    'Diabetes',
    'Cancer',
]
for col in categorical_cols:
    level_counts = df[col].value_counts()
    print(f"\nValue Counts for {col}:\n", level_counts)

# Correlation Analysis: annotated heatmap of the pairwise correlations between
# the numerical columns, written to disk and then closed (not shown inline).
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
corr_matrix = df[numerical_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Numerical Features')
corr_fig.tight_layout()
corr_fig.savefig('correlation_matrix.png')
plt.close(corr_fig)

# Distribution of Numerical Features
# One histogram with a KDE overlay per numerical column, laid out on a
# two-column grid. The number of rows is derived from the column list so the
# grid always has room for every column (5 columns -> 3 x 2, figsize 15 x 12).
dist_grid_cols = 2
dist_grid_rows = max(1, -(-len(numerical_cols) // dist_grid_cols))  # ceiling division
fig, axes = plt.subplots(dist_grid_rows, dist_grid_cols,
                         figsize=(15, 4 * dist_grid_rows))
axes = axes.ravel()
for ax, col in zip(axes, numerical_cols):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
# Hide only the panels that received no plot, instead of unconditionally
# hiding the last one (which would blank a real histogram whenever the
# number of columns fills the grid exactly).
for ax in axes[len(numerical_cols):]:
    ax.axis('off')
fig.tight_layout()
fig.savefig('numerical_distributions.png')
plt.close(fig)

# Cancer Distribution: row count per value of the 'Cancer' column, saved to disk
cancer_fig, cancer_ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Cancer', data=df, ax=cancer_ax)
cancer_ax.set_title('Distribution of Cancer Cases (0: No, 1: Yes)')
cancer_fig.savefig('cancer_distribution.png')
plt.close(cancer_fig)

# Age vs Cancer: box plot of 'Age' for each value of 'Cancer', saved to disk
age_fig, age_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Cancer', y='Age', data=df, ax=age_ax)
age_ax.set_title('Age Distribution by Cancer Status')
age_fig.savefig('age_vs_cancer.png')
plt.close(age_fig)

# Categorical Features vs Cancer
# One count plot per categorical predictor, with bars split by the 'Cancer'
# value. The target is removed by name (not by list position), so reordering
# categorical_cols cannot silently plot 'Cancer' against itself or drop a
# real feature.
cat_features = [name for name in categorical_cols if name != 'Cancer']
cat_grid_cols = 3
cat_grid_rows = max(1, -(-len(cat_features) // cat_grid_cols))  # ceiling division
# 9 features -> 3 x 3 grid with figsize 18 x 12
fig, axes = plt.subplots(cat_grid_rows, cat_grid_cols,
                         figsize=(18, 4 * cat_grid_rows))
axes = axes.ravel()
for ax, col in zip(axes, cat_features):
    sns.countplot(x=col, hue='Cancer', data=df, ax=ax)
    ax.set_title(f'{col} vs Cancer')
    ax.tick_params(axis='x', rotation=45)  # category labels can be long
# Blank any panels left over after the last feature. Slicing by count avoids
# depending on a loop variable that is undefined when the feature list is empty.
for ax in axes[len(cat_features):]:
    ax.axis('off')
fig.tight_layout()
fig.savefig('categorical_vs_cancer.png')
plt.close(fig)

# Nodule Size vs Cancer: box plot of 'Nodule_Size' for each value of 'Cancer'
nodule_fig, nodule_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Cancer', y='Nodule_Size', data=df, ax=nodule_ax)
nodule_ax.set_title('Nodule Size by Cancer Status')
nodule_fig.savefig('nodule_size_vs_cancer.png')
plt.close(nodule_fig)

# Thyroid Function Tests vs Cancer
# Three side-by-side box plots, one per lab-result column, each grouped by 'Cancer'.
hormone_cols = ['TSH_Result', 'T4_Result', 'T3_Result']
fig, axes = plt.subplots(1, len(hormone_cols), figsize=(18, 5))
for panel, col in zip(axes, hormone_cols):
    sns.boxplot(x='Cancer', y=col, data=df, ax=panel)
    panel.set_title(f'{col} by Cancer Status')
fig.tight_layout()
fig.savefig('thyroid_tests_vs_cancer.png')
plt.close(fig)

# Interesting Fact: Checking for potential age-related patterns
# Percentage of rows with Age < 30 whose 'Cancer' value equals 1.
# A single boolean mask is reused for both counts instead of filtering the
# frame twice.
is_young = df['Age'] < 30
total_young = int(is_young.sum())
young_cancer = int((is_young & (df['Cancer'] == 1)).sum())
if total_young:
    print(f"\nInteresting Fact: {young_cancer/total_young*100:.2f}% of patients under 30 have cancer.")
else:
    # No row satisfies Age < 30: report that instead of raising ZeroDivisionError
    print("\nInteresting Fact: no patients under 30 in the dataset.")

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Dataset Shape: (87159, 16)

Column Names: ['ID', 'Age', 'Gender', 'Country', 'Race', 'Family_Background', 'Radiation_History', 'Iodine_Deficiency', 'Smoke', 'Weight_Risk', 'Diabetes', 'Nodule_Size', 'TSH_Result', 'T4_Result', 'T3_Result', 'Cancer']

Data Types:
 ID                    object
Age                    int64
Gender                object
Country               object
Race                  object
Family_Background     object
Radiation_History     object
Iodine_Deficiency     object
Smoke                 object
Weight_Risk           object
Diabetes              object
Nodule_Size          float64
TSH_Result           float64
T4_Result            float64
T3_Result            float64
Cancer                 int64
dtype: object

Missing Values:
 ID                   0
Age                  0
Gender               0
Country           