### pd.get_dummies() 예제

#### // 원-핫 인코딩의 대표적인 3가지 방법
1. OneHotEncoder: scikit-learn 제공
2. get_dummies(): pandas 제공
3. to_categorical(): keras 제공

In [1]:
import pandas as pd
import numpy as np

# Toy frame with two categorical columns; the last row is missing in both.
fruit_records = {
    'name': ['apple', 'banana', 'cherry', 'durain', np.nan],
    'color': ['red', 'yellow', 'red', 'green', np.nan],
}
fruit = pd.DataFrame(fruit_records)

# Encoding the whole frame names every indicator column '<column>_<value>'
# automatically, so the new column names never have to be built by hand.
pd.get_dummies(fruit)

Unnamed: 0,name_apple,name_banana,name_cherry,name_durain,color_green,color_red,color_yellow
0,1,0,0,0,0,1,0
1,0,1,0,0,0,0,1
2,0,0,1,0,0,1,0
3,0,0,0,1,1,0,0
4,0,0,0,0,0,0,0


In [2]:
# Show the source frame again: get_dummies() returned a new frame and left `fruit` unchanged.
fruit

Unnamed: 0,name,color
0,apple,red
1,banana,yellow
2,cherry,red
3,durain,green
4,,


In [3]:
# Encode only the 'name' column by passing it as a Series.
pd.get_dummies(fruit['name'])

# A single column can be encoded instead of the whole frame; note that the
# generated column names then follow a different pattern (the bare values,
# without the '<column>_' prefix).

Unnamed: 0,apple,banana,cherry,durain
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,0,0,0,0


In [4]:
# Same single-column encoding for 'color'; the all-missing row becomes all zeros.
pd.get_dummies(fruit['color'])

Unnamed: 0,green,red,yellow
0,0,1,0
1,0,0,1
2,0,1,0
3,1,0,0
4,0,0,0


In [5]:
# columns=[...] encodes only the listed column(s) and keeps the others ('color') as they are;
# the prefixed 'name_<value>' naming is used in this form.
pd.get_dummies(fruit, columns=['name'])

Unnamed: 0,color,name_apple,name_banana,name_cherry,name_durain
0,red,1,0,0,0
1,yellow,0,1,0,0
2,red,0,0,1,0
3,green,0,0,0,1
4,,0,0,0,0


In [6]:
# Encode only 'color'; 'name' is passed through untouched.
pd.get_dummies(fruit, columns=['color'])

Unnamed: 0,name,color_green,color_red,color_yellow
0,apple,0,1,0
1,banana,0,0,1
2,cherry,0,1,0
3,durain,1,0,0
4,,0,0,0


In [29]:
# Print the column dtypes and non-null counts of the toy frame.
fruit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    4 non-null      object
 1   color   4 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


---

## 인터넷 신규 가입 여부 예측 실습 예제

#### 1. 데이터 로드

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this hides all warning categories, including library deprecation warnings.
warnings.filterwarnings(action="ignore")

In [8]:
# Load the survey data; the bare `df` on the last line renders the frame as the cell output.
df = pd.read_csv("./datasets/Newbie.csv")
df

Unnamed: 0,who,Newbie,Age,Gender,Household Income,Sexual Preference,Country,Education Attainment,Major Occupation,Marital Status,Years on Internet
0,id74364,0,54.0,Male,$50-74,Gay male,Ontario,Some College,Computer,Other,4-6 yr
1,id84505,0,39.0,Female,Over $100,Heterosexual,Sweden,Professional,Other,Other,1-3 yr
2,id84509,1,49.0,Female,$40-49,Heterosexual,Washington,Some College,Management,Other,Under 6 mo
3,id87028,1,22.0,Female,$40-49,Heterosexual,Florida,Some College,Computer,Married,6-12 mo
4,id76087,0,20.0,Male,$30-39,Bisexual,New Jersey,Some College,Education,Single,1-3 yr
...,...,...,...,...,...,...,...,...,...,...,...
19578,id83400,0,22.0,Male,Over $100,Heterosexual,Texas,Some College,Education,Single,4-6 yr
19579,id72216,0,19.0,Male,,Heterosexual,New Jersey,Some College,Education,Single,4-6 yr
19580,id8654,0,49.0,Female,$50-74,Heterosexual,Missouri,Doctoral,Education,Married,1-3 yr
19581,id84503,1,42.0,Female,$50-74,Heterosexual,Kentucky,Some College,Other,Married,Under 6 mo


In [9]:
# List the column labels of the loaded frame.
df.columns

Index(['who', 'Newbie', 'Age', 'Gender', 'Household Income',
       'Sexual Preference', 'Country', 'Education Attainment',
       'Major Occupation', 'Marital Status', 'Years on Internet'],
      dtype='object')

In [10]:
# Print dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   who                   19583 non-null  object 
 1   Newbie                19583 non-null  int64  
 2   Age                   19022 non-null  float64
 3   Gender                19583 non-null  object 
 4   Household Income      16398 non-null  object 
 5   Sexual Preference     18291 non-null  object 
 6   Country               19583 non-null  object 
 7   Education Attainment  19583 non-null  object 
 8   Major Occupation      19583 non-null  object 
 9   Marital Status        19240 non-null  object 
 10  Years on Internet     19583 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 1.6+ MB


In [11]:
# Summary statistics for the numeric columns only (Newbie, Age).
df.describe()

Unnamed: 0,Newbie,Age
count,19583.0,19022.0
mean,0.253383,35.222269
std,0.434959,12.642339
min,0.0,5.0
25%,0.0,25.0
50%,0.0,33.0
75%,1.0,44.0
max,1.0,80.0


In [12]:
# include="all" adds the non-numeric columns (count / unique / top / freq) to the summary.
df.describe(include="all")

Unnamed: 0,who,Newbie,Age,Gender,Household Income,Sexual Preference,Country,Education Attainment,Major Occupation,Marital Status,Years on Internet
count,19583,19583.0,19022.0,19583,16398,18291,19583,19583,19583,19240,19583
unique,19583,,,2,8,6,142,9,5,6,5
top,id74364,,,Male,$50-74,Heterosexual,California,College,Computer,Married,1-3 yr
freq,1,,,13454,3935,16869,2312,6194,5921,8742,8615
mean,,0.253383,35.222269,,,,,,,,
std,,0.434959,12.642339,,,,,,,,
min,,0.0,5.0,,,,,,,,
25%,,0.0,25.0,,,,,,,,
50%,,0.0,33.0,,,,,,,,
75%,,1.0,44.0,,,,,,,,


#### 2. 불필요한 독립변수 제거

In [13]:
# Drop the identifier and the two features left out of this exercise,
# keeping the raw frame `df` intact.
unused_columns = ['who', 'Country', 'Years on Internet']
df2 = df.drop(columns=unused_columns)
df2.head()

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single


#### 3. 결측값 처리 (평균·최빈값으로 대체)

In [14]:
# Number of missing values per column before imputation.
df2.isna().sum()

Newbie                     0
Age                      561
Gender                     0
Household Income        3185
Sexual Preference       1292
Education Attainment       0
Major Occupation           0
Marital Status           343
dtype: int64

In [15]:
from sklearn.impute import SimpleImputer

# Fill missing ages with the column mean. The imputer is fitted on df2 (the
# frame being cleaned) rather than on the raw frame `df`, so the cell does not
# depend on `df` still holding the raw data when it is re-run.
age_mean = SimpleImputer(strategy="mean")
# fit_transform returns an (n_samples, 1) array; flatten it to one value per row.
df2['Age'] = age_mean.fit_transform(df2[['Age']]).ravel()
df2

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single
...,...,...,...,...,...,...,...,...
19578,0,22.0,Male,Over $100,Heterosexual,Some College,Education,Single
19579,0,19.0,Male,,Heterosexual,Some College,Education,Single
19580,0,49.0,Female,$50-74,Heterosexual,Doctoral,Education,Married
19581,1,42.0,Female,$50-74,Heterosexual,Some College,Other,Married


In [16]:
# Fill the remaining categorical gaps with each column's most frequent value.
# One imputer per column is kept under its original name. Each is fitted on
# df2 (the frame being cleaned) instead of the raw frame `df`, so the cell
# keeps working if `df` is rebound or transformed before a re-run.
income_freq = SimpleImputer(strategy="most_frequent")
sexual_freq = SimpleImputer(strategy="most_frequent")
marital_freq = SimpleImputer(strategy="most_frequent")

for column, imputer in (
    ('Household Income', income_freq),
    ('Sexual Preference', sexual_freq),
    ('Marital Status', marital_freq),
):
    # fit_transform returns an (n_samples, 1) array; flatten to one value per row.
    df2[column] = imputer.fit_transform(df2[[column]]).ravel()

df2

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single
...,...,...,...,...,...,...,...,...
19578,0,22.0,Male,Over $100,Heterosexual,Some College,Education,Single
19579,0,19.0,Male,$50-74,Heterosexual,Some College,Education,Single
19580,0,49.0,Female,$50-74,Heterosexual,Doctoral,Education,Married
19581,1,42.0,Female,$50-74,Heterosexual,Some College,Other,Married


In [17]:
# Re-count missing values to confirm the imputation left none behind.
df2.isna().sum()

Newbie                  0
Age                     0
Gender                  0
Household Income        0
Sexual Preference       0
Education Attainment    0
Major Occupation        0
Marital Status          0
dtype: int64

#### 4. get_dummies()

In [25]:
# Count how often each complete row (one combination of all column values) occurs.
df2.value_counts()

Newbie  Age        Gender  Household Income  Sexual Preference  Education Attainment  Major Occupation  Marital Status
0       20.000000  Male    $50-74            Heterosexual       Some College          Education         Single            59
        19.000000  Male    $50-74            Heterosexual       Some College          Education         Single            44
        21.000000  Male    $50-74            Heterosexual       Some College          Education         Single            37
        18.000000  Male    $50-74            Heterosexual       Some College          Education         Single            30
        35.222269  Male    $50-74            Heterosexual       College               Computer          Married           28
                                                                                                                          ..
        38.000000  Female  $10-19            Heterosexual       College               Education         Married            1
      

In [32]:
# Display the cleaned frame before encoding.
df2

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single
...,...,...,...,...,...,...,...,...
19578,0,22.0,Male,Over $100,Heterosexual,Some College,Education,Single
19579,0,19.0,Male,$50-74,Heterosexual,Some College,Education,Single
19580,0,49.0,Female,$50-74,Heterosexual,Doctoral,Education,Married
19581,1,42.0,Female,$50-74,Heterosexual,Some College,Other,Married


In [33]:
# One-hot encode the categorical columns. Positions 0-1 hold Newbie and Age,
# so everything from position 2 onward is selected; that is the same set as
# ['Gender', 'Household Income', 'Sexual Preference', 'Education Attainment',
#  'Major Occupation', 'Marital Status'].
dummy = df2.iloc[:, 2:]
dummy_all = pd.get_dummies(dummy)
dummy_all

Unnamed: 0,Gender_Female,Gender_Male,Household Income_$10-19,Household Income_$20-29,Household Income_$30-39,Household Income_$40-49,Household Income_$50-74,Household Income_$75-99,Household Income_Over $100,Household Income_Under $10,...,Major Occupation_Education,Major Occupation_Management,Major Occupation_Other,Major Occupation_Professional,Marital Status_Divorced,Marital Status_Married,Marital Status_Other,Marital Status_Separated,Marital Status_Single,Marital Status_Widowed
0,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
2,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,1,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19578,0,1,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
19579,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
19580,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
19581,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [35]:
# List the generated indicator column names ('<column>_<value>').
dummy_all.columns

Index(['Gender_Female', 'Gender_Male', 'Household Income_$10-19',
       'Household Income_$20-29', 'Household Income_$30-39',
       'Household Income_$40-49', 'Household Income_$50-74',
       'Household Income_$75-99', 'Household Income_Over $100',
       'Household Income_Under $10', 'Sexual Preference_Bisexual',
       'Sexual Preference_Gay male', 'Sexual Preference_Heterosexual',
       'Sexual Preference_Lesbian', 'Sexual Preference_Transgender',
       'Sexual Preference_na', 'Education Attainment_College',
       'Education Attainment_Doctoral', 'Education Attainment_Grammar',
       'Education Attainment_High School', 'Education Attainment_Masters',
       'Education Attainment_Other', 'Education Attainment_Professional',
       'Education Attainment_Some College', 'Education Attainment_Special',
       'Major Occupation_Computer', 'Major Occupation_Education',
       'Major Occupation_Management', 'Major Occupation_Other',
       'Major Occupation_Professional', 'Marital St

In [36]:
# Keep the two columns that were not encoded: the target and the numeric age.
df3 = df2[['Newbie', 'Age']]
df3

Unnamed: 0,Newbie,Age
0,0,54.0
1,0,39.0
2,1,49.0
3,1,22.0
4,0,20.0
...,...,...
19578,0,22.0
19579,0,19.0
19580,0,49.0
19581,1,42.0


In [39]:
# Put target/age and the indicator columns side by side, aligned on the row index.
frames_to_join = [df3, dummy_all]
newbie = pd.concat(frames_to_join, axis="columns")
newbie

Unnamed: 0,Newbie,Age,Gender_Female,Gender_Male,Household Income_$10-19,Household Income_$20-29,Household Income_$30-39,Household Income_$40-49,Household Income_$50-74,Household Income_$75-99,...,Major Occupation_Education,Major Occupation_Management,Major Occupation_Other,Major Occupation_Professional,Marital Status_Divorced,Marital Status_Married,Marital Status_Other,Marital Status_Separated,Marital Status_Single,Marital Status_Widowed
0,0,54.0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,0,39.0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,1,49.0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
3,1,22.0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,20.0,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19578,0,22.0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
19579,0,19.0,0,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
19580,0,49.0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
19581,1,42.0,1,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0


In [66]:
from sklearn.model_selection import train_test_split

# Target is the 'Newbie' flag; every other column is a feature.
newbie_y = newbie['Newbie']
newbie_X = newbie.drop(columns='Newbie')

# Default split proportions, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    newbie_X,
    newbie_y,
    random_state=1,
)
X_train.shape, X_test.shape

((14687, 37), (4896, 37))

In [67]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(solver="liblinear").fit(X_train, y_train)
# Mean accuracy on the held-out split.
model.score(X_test, y_test)

0.7638888888888888

In [68]:
# accuracy_score is not imported by any earlier cell in this notebook, so it
# is imported here to keep the cell runnable when executing top to bottom.
from sklearn.metrics import accuracy_score

# Predict on the held-out split and report the fraction of correct labels.
pred = model.predict(X_test)
score = accuracy_score(y_test, pred)
print("test score:", score)

test score: 0.7638888888888888


---

## Answer

In [69]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model, datasets

In [70]:
# Reload the raw survey data for the reference solution and show its shape.
# NOTE(review): the earlier load in this notebook reads "./datasets/Newbie.csv"
# (capital N); on a case-sensitive filesystem only one spelling can match the
# real file -- confirm the actual file name.
df = pd.read_csv('./datasets/newbie.csv')
print(df.shape)
df.head()

(19583, 11)


Unnamed: 0,who,Newbie,Age,Gender,Household Income,Sexual Preference,Country,Education Attainment,Major Occupation,Marital Status,Years on Internet
0,id74364,0,54.0,Male,$50-74,Gay male,Ontario,Some College,Computer,Other,4-6 yr
1,id84505,0,39.0,Female,Over $100,Heterosexual,Sweden,Professional,Other,Other,1-3 yr
2,id84509,1,49.0,Female,$40-49,Heterosexual,Washington,Some College,Management,Other,Under 6 mo
3,id87028,1,22.0,Female,$40-49,Heterosexual,Florida,Some College,Computer,Married,6-12 mo
4,id76087,0,20.0,Male,$30-39,Bisexual,New Jersey,Some College,Education,Single,1-3 yr


In [71]:
# Print dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   who                   19583 non-null  object 
 1   Newbie                19583 non-null  int64  
 2   Age                   19022 non-null  float64
 3   Gender                19583 non-null  object 
 4   Household Income      16398 non-null  object 
 5   Sexual Preference     18291 non-null  object 
 6   Country               19583 non-null  object 
 7   Education Attainment  19583 non-null  object 
 8   Major Occupation      19583 non-null  object 
 9   Marital Status        19240 non-null  object 
 10  Years on Internet     19583 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 1.6+ MB


// 불필요한 독립변수 제거

In [72]:
# Remove the identifier and the two unused features; `df` is rebound to the result.
df = df.drop(columns=['who', 'Country', 'Years on Internet'])
df.head()

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single


// 결측치 처리 (평균·최빈값으로 대체)

In [73]:
# Number of missing values per column before filling.
df.isna().sum()

Newbie                     0
Age                      561
Gender                     0
Household Income        3185
Sexual Preference       1292
Education Attainment       0
Major Occupation           0
Marital Status           343
dtype: int64

In [74]:
# Replace missing ages with the mean of the observed ages.
age_col = df['Age']
df['Age'] = age_col.fillna(age_col.mean())
df['Age']

0        54.0
1        39.0
2        49.0
3        22.0
4        20.0
         ... 
19578    22.0
19579    19.0
19580    49.0
19581    42.0
19582    24.0
Name: Age, Length: 19583, dtype: float64

In [75]:
# Preview the value that will be used to fill the gaps: mode() returns a Series,
# so the fill value itself is its first element.
df['Household Income'].mode()

# Used to obtain the most frequent value (the mode).

0    $50-74
dtype: object

In [76]:
# Fill each categorical column's gaps with that column's most frequent value.
mode_filled_cols = ['Household Income', 'Sexual Preference', 'Marital Status']
for col in mode_filled_cols:
    # mode() returns a Series; its first entry is the value to fill with.
    most_common = df[col].mode()[0]
    df[col] = df[col].fillna(most_common)
df[mode_filled_cols]

Unnamed: 0,Household Income,Sexual Preference,Marital Status
0,$50-74,Gay male,Other
1,Over $100,Heterosexual,Other
2,$40-49,Heterosexual,Other
3,$40-49,Heterosexual,Married
4,$30-39,Bisexual,Single
...,...,...,...
19578,Over $100,Heterosexual,Single
19579,$50-74,Heterosexual,Single
19580,$50-74,Heterosexual,Married
19581,$50-74,Heterosexual,Married


In [77]:
# Re-count missing values to confirm every column is now complete.
df.isnull().sum()

Newbie                  0
Age                     0
Gender                  0
Household Income        0
Sexual Preference       0
Education Attainment    0
Major Occupation        0
Marital Status          0
dtype: int64

// 데이터 타입을 category로 변환<br>
// (데이터를 한꺼번에 인코딩하기 위해)

In [78]:
# Cast every column after Newbie/Age (positions 2+) to the category dtype.
# Assigning through df.iloc[:, 2:] = ... writes the values into the existing
# object columns on pandas >= 2.0 and leaves their dtype unchanged, so the
# columns are replaced by label instead, which always adopts the new dtype.
categorical_cols = df.columns[2:]
df[categorical_cols] = df[categorical_cols].astype("category")

In [79]:
# Check the dtypes after the category cast.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Newbie                19583 non-null  int64   
 1   Age                   19583 non-null  float64 
 2   Gender                19583 non-null  category
 3   Household Income      19583 non-null  category
 4   Sexual Preference     19583 non-null  category
 5   Education Attainment  19583 non-null  category
 6   Major Occupation      19583 non-null  category
 7   Marital Status        19583 non-null  category
dtypes: category(6), float64(1), int64(1)
memory usage: 422.3 KB


In [80]:
# Display the cleaned frame; the values look the same as before the cast.
df

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single
...,...,...,...,...,...,...,...,...
19578,0,22.0,Male,Over $100,Heterosexual,Some College,Education,Single
19579,0,19.0,Male,$50-74,Heterosexual,Some College,Education,Single
19580,0,49.0,Female,$50-74,Heterosexual,Doctoral,Education,Married
19581,1,42.0,Female,$50-74,Heterosexual,Some College,Other,Married


// 더미 변수로 원-핫 인코딩 처리

In [81]:
# One-hot encode the whole frame in one call: the numeric Newbie and Age columns
# pass through unchanged and each categorical column expands to '<column>_<value>' indicators.
df = pd.get_dummies(df)
df.head()

Unnamed: 0,Newbie,Age,Gender_Female,Gender_Male,Household Income_$10-19,Household Income_$20-29,Household Income_$30-39,Household Income_$40-49,Household Income_$50-74,Household Income_$75-99,...,Major Occupation_Education,Major Occupation_Management,Major Occupation_Other,Major Occupation_Professional,Marital Status_Divorced,Marital Status_Married,Marital Status_Other,Marital Status_Separated,Marital Status_Single,Marital Status_Widowed
0,0,54.0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,0,39.0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,1,49.0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
3,1,22.0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,20.0,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [82]:
# 'Newbie' is the target; all remaining columns form the feature matrix.
y_data = df['Newbie']
x_data = df.loc[:, df.columns != 'Newbie']

In [83]:
# Inspect the feature matrix (Age plus the indicator columns).
x_data

Unnamed: 0,Age,Gender_Female,Gender_Male,Household Income_$10-19,Household Income_$20-29,Household Income_$30-39,Household Income_$40-49,Household Income_$50-74,Household Income_$75-99,Household Income_Over $100,...,Major Occupation_Education,Major Occupation_Management,Major Occupation_Other,Major Occupation_Professional,Marital Status_Divorced,Marital Status_Married,Marital Status_Other,Marital Status_Separated,Marital Status_Single,Marital Status_Widowed
0,54.0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,39.0,1,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
2,49.0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,22.0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,20.0,0,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19578,22.0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
19579,19.0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
19580,49.0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
19581,42.0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0


In [84]:
# Inspect the target vector (0/1 Newbie flag).
y_data

0        0
1        0
2        1
3        1
4        0
        ..
19578    0
19579    0
19580    0
19581    1
19582    0
Name: Newbie, Length: 19583, dtype: int64

// 학습 데이터와 테스트 데이터 분리

In [85]:
# Hold out 33% of the rows for testing; the seed makes the split reproducible.
split_parts = model_selection.train_test_split(
    x_data,
    y_data,
    test_size=0.33,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts
x_train.shape, x_test.shape

((13120, 37), (6463, 37))

In [64]:
# Train a logistic-regression classifier (liblinear solver) on the unscaled training split.
model = linear_model.LogisticRegression(solver="liblinear")
model.fit(x_train, y_train)

LogisticRegression(solver='liblinear')

In [65]:
# Predict labels for the held-out rows and report the fraction classified correctly.
pred = model.predict(x_test)
score = accuracy_score(y_test, pred)
print("test score:", score)

test score: 0.7494971375522204


#### cf) MinMaxScaler 변환

In [86]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max of each feature from the training split only, then apply
# the same scaling to both splits so no test information leaks into the scaler.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

In [88]:
# Refit a fresh logistic-regression model on the min-max-scaled training features.
model = linear_model.LogisticRegression(solver="liblinear")
model.fit(X_train_scaled, y_train)

LogisticRegression(solver='liblinear')

In [90]:
# Evaluate the scaled model on the scaled test features, for comparison with the unscaled run.
pred = model.predict(X_test_scaled)
score = accuracy_score(y_test, pred)
print("test score:", score)

test score: 0.7499613182732477
