# **빅데이터 분석과정**



1.   데이터 불러오기 -> read_csv()
2.   데이터 살펴보기 -> info(),describe(),head()
3.   데이터전처리 -> 결측치처리(fillna), 수치형변수 스케일링(StandardScaler,MinMaxScaler), str 컬럼은 인코딩(LabelEncoder)
4.   분석 데이터셋 분비 -> train_test_split
5.   데이터 분석 실행
6.   성능평가 및 시각화



---

**의사결정나무 분류모델을 위한 필요패키지**

*   from sklearn.tree import DecisionTreeClassifier

**학습 및 테스트 데이터셋 분리를 위한 필요패키지**
*   from sklearn.model_selection import train_test_split



In [None]:
# 데이터 불러오기
import numpy as np
import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv')
print(df.head())
df.info()

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [None]:
# 데이터전처리

df['species'].unique()  # (setosa, versicolor, virginica)
df['species'].replace({'setosa' : 0, 'versicolor':1, 'virginica':2},inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [None]:
# 분석 데이터셋 준비
# 전처리를 마친 데이터를 학습용, 테스트용 데이터셋으로 분리(일반적으로 8:2 비율)
# sklearn의 train_test_split()이용

from sklearn.model_selection import train_test_split

#x = df[['sepal_length','sepal_width','petal_length','petal_width']]
#y = df['species']

# x_train, x_test, y_test, y_train = train_test_split(x,y,test_size=0.2,stratify=df['species'])

# X (연속형, 범주형 숫자데이터), y(범주형 숫자데이터)
x_train, x_test, y_train, y_test = train_test_split(df,df['species'],test_size=0.2,stratify = df['species'])

#print(x_train['species'].value_counts())
x_train = x_train.drop(['species'],axis=1)
x_test = x_test.drop(['species'],axis=1)


<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 110 to 44
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  120 non-null    float64
 1   sepal_width   120 non-null    float64
 2   petal_length  120 non-null    float64
 3   petal_width   120 non-null    float64
dtypes: float64(4)
memory usage: 4.7 KB
None


In [None]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(120, 4)
(30, 4)
(120,)
(30,)


---

**해당 과제는 지도학습(분류)이므로, '의사결정나무'를 활용**

---

In [None]:
from sklearn.tree import DecisionTreeClassifier

df=DecisionTreeClassifier(random_state=11)
df.fit(x_train,y_train)      #학습 수행


In [None]:
pred = df.predict(x_test)    # 예측수행

# 모델 성능 - 정확도 측정
from sklearn.metrics import accuracy_score

acc=accuracy_score(y_test,pred)
print(acc)

0.9666666666666667


In [None]:
# 성능평가

# 모델 성능 - 오차행렬
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred)

# 모델 성능 평가 - 평가지표 계산

from sklearn.metrics import classification_report

rpt= classification_report(y_test,pred)
print(rpt)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



# 로지스틱 회귀(Logistic Regression)



**y = sigmoid( w * x + b)**
*   선형 회귀의 결과를 입력 값으로 받아 특정 레이블로 분류
*   확률을 0과 1 사이의 S자형 곡선으로 나타내는 시그모이드 함수를 사용 -> 입력 값이 클루록 1에 수렴, 작으면 0으로 수렴






In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv")
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
# 데이터 살펴보기
print(df.info())       #결측치 없음
print()
print(df.describe())   #데이터프레임의 기술통계를 보여줌

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None

       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.5000

In [None]:
#데이터 전처리 : 결측치 처리-> 스케일링-> 인코딩
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[['sepal_length']] = scaler.fit_transform(df[['sepal_length']])
df[['sepal_width']] = scaler.fit_transform(df[['sepal_width']])
df[['petal_length']] = scaler.fit_transform(df[['petal_length']])
df[['petal_width']] = scaler.fit_transform(df[['petal_width']])

# specied 는 텍스트 이므로 => 범주형 변수로 만들기 위해서 레이블 인코딩 진행
from sklearn.preprocessing import LabelEncoder
df['species'] = LabelEncoder().fit_transform(df[['species']])

  y = column_or_1d(y, warn=True)


In [None]:
# 분석 데이터셋 준비

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df,df['species'],test_size=0.2, stratify = df['species'])
x_train = x_train.drop(['species'],axis=1)
x_test = x_test.drop(['species'],axis=1)

In [None]:
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(120, 4) (30, 4) (120,) (30,)


In [None]:
# 데이터분석

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(x_train,y_train)

pred = lr.predict(x_test)

In [None]:
# 성능 평가

from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test,pred)
print(acc)

1.0


# 랜덤포레스트 분류

In [31]:
import numpy as np
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv")
print(df.head(5))
print()
print(df.info())
print()
print(df.describe())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  

<

In [32]:
# cabin 컬럼은 결측치가 너무 많아 분석에서 제외
# Embarked 컬럼은 최빈값으로 결측치를 대체
# Age 컬럼은 평균값으로 대체

df['Embarked']=df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Age'] = df['Age'].fillna(np.mean(df['Age']))
df.drop(['Cabin'],axis=1,inplace = True)

In [33]:
from sklearn.preprocessing import LabelEncoder

# 텍스트 컬럼을 범주형 변수로 레이블인코딩
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])

In [35]:
# SibSp, Parch의 값을 더해서 FamilySize 컬럼(파생변수)를 생성한다.

df["FamilySize"] = df["SibSp"]+df["Parch"]

# 분석 데이터셋 준비

x = df[["Pclass","Sex","Age","Fare","Embarked","FamilySize","Survived"]]
y = df["Survived"]

from sklearn.model_selection import train_test_split


x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,stratify = df['Survived'])
x_train = x_train.drop(['Survived'],axis =1)
x_test = x_test.drop(['Survived'],axis=1)

In [36]:
#print(x_train['Survived'].value_counts(), x_test['Survived'].value_counts())
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(712, 6) (179, 6) (712,) (179,)


In [41]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(x_train,y_train)

pred = rf.predict(x_test)

In [42]:
# 성능 측정

from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test,pred)
print(acc)

0.8435754189944135
