### 2. 데이터프레임 / 시리즈

In [1]:
# Pandas DataFrame 예시
import pandas as pd

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Paris', 'Berlin']
})

# 각 속성 호출
print("Shape:", df.shape)
print("Data Types:\n", df.dtypes)
print("Index:", df.index)
print("Columns:", df.columns)

Shape: (3, 3)
Data Types:
 Name    object
Age      int64
City    object
dtype: object
Index: RangeIndex(start=0, stop=3, step=1)
Columns: Index(['Name', 'Age', 'City'], dtype='object')


In [2]:
# 실습
test = pd.read_csv("/content/sample_data/california_housing_train.csv")

In [3]:
# 각 속성 호출
print("Shape:", test.shape)
print("Data Types:\n", test.dtypes)
print("Index:", test.index)
print("Columns:", test.columns)

Shape: (17000, 9)
Data Types:
 longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
dtype: object
Index: RangeIndex(start=0, stop=17000, step=1)
Columns: Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')


In [4]:
import pandas as pd

# 예시 데이터프레임 생성
data = {
    'Name': ['Alice', 'Bob'],  # 문자열 타입
    'Age': ['25', '30'],       # 숫자 데이터를 문자열로 표현
    'Member': ['True', 'False'],  # 불린 데이터를 문자열로 표현
    'Join Date': ['2021-01-01', '2021-07-01']  # 날짜 데이터를 문자열로 표현
}

# DB -> 문자열! -> 메모리 / 저장용량 효율적!

df = pd.DataFrame(data)
# 결과 출력
print(df)
print(df.dtypes)

    Name Age Member   Join Date
0  Alice  25   True  2021-01-01
1    Bob  30  False  2021-07-01
Name         object
Age          object
Member       object
Join Date    object
dtype: object


In [5]:
# 데이터 타입 변환
df['Age'] = df['Age'].astype(int)

# 결과 출력
print(df)
print(df.dtypes)

    Name  Age Member   Join Date
0  Alice   25   True  2021-01-01
1    Bob   30  False  2021-07-01
Name         object
Age           int64
Member       object
Join Date    object
dtype: object


In [6]:
df['Member'] = df['Member'].astype(bool)  # 'Member' 열을 불린형으로 변환

# 결과 출력
print(df)
print(df.dtypes)

    Name  Age  Member   Join Date
0  Alice   25    True  2021-01-01
1    Bob   30    True  2021-07-01
Name         object
Age           int64
Member         bool
Join Date    object
dtype: object


In [7]:
# 데이터 타입 변환
df['Join Date'] = pd.to_datetime(df['Join Date'])  # 'Join Date' 열을 datetime으로 변환

# 결과 출력
print(df)
print(df.dtypes)

    Name  Age  Member  Join Date
0  Alice   25    True 2021-01-01
1    Bob   30    True 2021-07-01
Name                 object
Age                   int64
Member                 bool
Join Date    datetime64[ns]
dtype: object


In [8]:
df_joindate = df['Join Date']

In [9]:
df_joindate.info()

<class 'pandas.core.series.Series'>
RangeIndex: 2 entries, 0 to 1
Series name: Join Date
Non-Null Count  Dtype         
--------------  -----         
2 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 144.0 bytes


In [10]:
df_joindate_age = df[['Join Date', 'Age']]

In [11]:
df_joindate_age.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Join Date  2 non-null      datetime64[ns]
 1   Age        2 non-null      int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 160.0 bytes


In [12]:
import pandas as pd
import numpy as np

# 예시 데이터프레임 생성
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Paris', 'Berlin']
}
df = pd.DataFrame(data)

In [13]:
# 단일 열 선택 -> 결과는 Series
age_series = df['Age']
print(type(age_series))

<class 'pandas.core.series.Series'>


In [14]:
# 다중 열 선택 -> 결과는 DataFrame
subset_df = df[['Name', 'City']]
print(type(subset_df))

<class 'pandas.core.frame.DataFrame'>


In [15]:
# loc 사용 예시
print(df.loc[:, 'Age'])

0    25
1    30
2    35
Name: Age, dtype: int64


In [16]:
# iloc 사용 예시
print(df.iloc[:, 1])

0    25
1    30
2    35
Name: Age, dtype: int64


In [17]:
# 결측치 추가
# [행 / 열]
df.loc[1, 'Age'] = np.nan

In [18]:
print(df)

      Name   Age      City
0    Alice  25.0  New York
1      Bob   NaN     Paris
2  Charlie  35.0    Berlin


In [19]:
# 결측치 확인 isnull -> null이 아니면 false null이면 True
print(df['Age'].isnull())

0    False
1     True
2    False
Name: Age, dtype: bool


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    3 non-null      object 
 1   Age     2 non-null      float64
 2   City    3 non-null      object 
dtypes: float64(1), object(2)
memory usage: 200.0+ bytes


In [21]:
# series 메서드 활용
import pandas as pd

# Series 생성
data = pd.Series([10, 20, 20, 30, 30, 30, 40])

# sum(), mean(), value_counts() 사용 예시
print("Sum:", data.sum())           # 합계
print("Mean:", data.mean())         # 평균
print("Value Counts:\n", data.value_counts())  # 빈도 계산

Sum: 180
Mean: 25.714285714285715
Value Counts:
 30    3
20    2
10    1
40    1
dtype: int64


In [22]:
import pandas as pd

# Series 생성
data = pd.Series([10, 15, 20, 25, 30, 10, 15, 20])

# 다양한 메서드 활용
print("Minimum:", data.min())             # 최소값
print("Maximum:", data.max())             # 최대값
print("Standard Deviation:", data.std())  # 표준편차
print("Variance:", data.var())            # 분산
print("Number of Unique Values:", data.nunique())  # 고유값 개수

# 데이터프레임 요약
df = pd.DataFrame({
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 55000, 60000, 65000, 70000]
})
print("\nDataFrame Summary:\n", df.describe())
print("\nDataFrame Info:")
df.info()

Minimum: 10
Maximum: 30
Standard Deviation: 7.039429765866794
Variance: 49.55357142857143
Number of Unique Values: 5

DataFrame Summary:
              Age       Salary
count   5.000000      5.00000
mean   35.000000  60000.00000
std     7.905694   7905.69415
min    25.000000  50000.00000
25%    30.000000  55000.00000
50%    35.000000  60000.00000
75%    40.000000  65000.00000
max    45.000000  70000.00000

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Age     5 non-null      int64
 1   Salary  5 non-null      int64
dtypes: int64(2)
memory usage: 208.0 bytes


In [26]:
# 실습
test = pd.read_csv("/content/sample_data/california_housing_train.csv")
test
# 데이터프레임의 'housing_median_age' 열을 Series로 변환하고 숫자형으로 변환
test_series = test['housing_median_age']

In [27]:
print(test_series)

0        15.0
1        19.0
2        17.0
3        14.0
4        20.0
         ... 
16995    52.0
16996    36.0
16997    17.0
16998    19.0
16999    52.0
Name: housing_median_age, Length: 17000, dtype: float64


In [28]:
# 각 요소에 1 더하기
test_plus_one = test_series + 1
print("Stars Plus One:\n", test_plus_one.head())

Stars Plus One:
 0    16.0
1    20.0
2    18.0
3    15.0
4    21.0
Name: housing_median_age, dtype: float64


In [29]:
# 각 요소에 2 곱하기
test_times_two = test_series * 2
print("Stars Times Two:\n", test_times_two.head())

Stars Times Two:
 0    30.0
1    38.0
2    34.0
3    28.0
4    40.0
Name: housing_median_age, dtype: float64


In [31]:
# 불린 연산이 가능 -> 해당하는 값만 가져올 수 있음
high_age = test_series > 17.0
print("High age:\n", high_age.head())

High age:
 0    False
1     True
2    False
3    False
4     True
Name: housing_median_age, dtype: bool


In [32]:
# 조건을 만족하는 요소만 필터링
filtered_series = test_series[high_age]
print("Filtered Series:\n", filtered_series.head())

Filtered Series:
 1    19.0
4    20.0
5    29.0
6    25.0
7    41.0
Name: housing_median_age, dtype: float64


In [33]:
import pandas as pd

df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
})

# 열 이름 변경
# columns({변경하기전 : 변경한 후 , 변경하기전 : 변경한 후})
df = df.rename(columns={'A': 'X', 'B': 'Y', 'C': 'Z'})
print(df)

   X  Y  Z
0  1  4  7
1  2  5  8
2  3  6  9


In [34]:
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
})
# 모든 열 이름 변경
df.columns = ['X', 'Y', 'Z']
print(df)

   X  Y  Z
0  1  4  7
1  2  5  8
2  3  6  9
