In [1]:
# 데이터를 불러오고 살펴보기 위한 pandas 라이브러리
import pandas as pd

# train 데이터 불러오기
train = pd.read_csv('./train.csv')

# test 데이터 불러오기
test = pd.read_csv('./test.csv')

# sample_submission 불러오기
sample_submission = pd.read_csv('./sample_submission.csv')

In [2]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1955 entries, 0 to 1954
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        1955 non-null   int64  
 1   Age                       1861 non-null   float64
 2   TypeofContact             1945 non-null   object 
 3   CityTier                  1955 non-null   int64  
 4   DurationOfPitch           1853 non-null   float64
 5   Occupation                1955 non-null   object 
 6   Gender                    1955 non-null   object 
 7   NumberOfPersonVisiting    1955 non-null   int64  
 8   NumberOfFollowups         1942 non-null   float64
 9   ProductPitched            1955 non-null   object 
 10  PreferredPropertyStar     1945 non-null   float64
 11  MaritalStatus             1955 non-null   object 
 12  NumberOfTrips             1898 non-null   float64
 13  Passport                  1955 non-null   int64  
 14  PitchSat

In [3]:
# 먼저 결측치가 얼마나 있는지 확인합니다.
na_check = train.isna().sum()

na_check[na_check > 0]

Age                          94
TypeofContact                10
DurationOfPitch             102
NumberOfFollowups            13
PreferredPropertyStar        10
NumberOfTrips                57
NumberOfChildrenVisiting     27
MonthlyIncome               100
dtype: int64

In [4]:
# 해당 열들의 값을 살펴봅시다.
train[na_check[na_check > 0].keys()].head()

Unnamed: 0,Age,TypeofContact,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
0,28.0,Company Invited,10.0,4.0,3.0,3.0,1.0,20384.0
1,34.0,Self Enquiry,,4.0,4.0,1.0,0.0,19599.0
2,45.0,Company Invited,,3.0,4.0,2.0,0.0,
3,29.0,Company Invited,7.0,5.0,4.0,3.0,1.0,21274.0
4,42.0,Self Enquiry,6.0,3.0,3.0,2.0,0.0,19907.0


In [7]:
# pandas의 fillna 메소드를 활용하여 NAN 값을 채워니다.
train = train.copy()

# 0 으로 채우는 경우
train.DurationOfPitch = train.DurationOfPitch.fillna(0)

# mean 값으로 채우는 경우
mean_cols = ['Age','NumberOfFollowups','PreferredPropertyStar','NumberOfTrips','NumberOfChildrenVisiting','MonthlyIncome']
for col in mean_cols:
    train[col] = train[col].fillna(train[col].mean())

# "Unknown"으로 채우는 경우
train.TypeofContact = train.TypeofContact.fillna("Unknown")

# 결과를 확인합니다.
train.isna().sum()

id                          0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
ProdTaken                   0
dtype: int64

In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

#상관관계 분석도
plt.figure(figsize=(15,10))

heat_table = train.corr()
mask = np.zeros_like(heat_table)
mask[np.triu_indices_from(mask)] = True
heatmap_ax = sns.heatmap(heat_table, annot=True, mask = mask, cmap='coolwarm', vmin=-1, vmax=1)
heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), fontsize=15, rotation=90)
heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), fontsize=15)
plt.title('correlation between features', fontsize=40)
plt.show()

In [9]:

object_columns = train.columns[train.dtypes == 'object']
print('object 칼럼은 다음과 같습니다 : ', list(object_columns))

# 해당 칼럼만 보아서 봅시다
train[object_columns]

object 칼럼은 다음과 같습니다 :  ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation']


Unnamed: 0,TypeofContact,Occupation,Gender,ProductPitched,MaritalStatus,Designation
0,Company Invited,Small Business,Male,Basic,Married,Executive
1,Self Enquiry,Small Business,Female,Deluxe,Single,Manager
2,Company Invited,Salaried,Male,Deluxe,Married,Manager
3,Company Invited,Small Business,Male,Basic,Married,Executive
4,Self Enquiry,Salaried,Male,Deluxe,Divorced,Manager
...,...,...,...,...,...,...
1950,Self Enquiry,Small Business,Male,Basic,Single,Executive
1951,Self Enquiry,Salaried,Female,Super Deluxe,Divorced,AVP
1952,Company Invited,Small Business,Female,Basic,Divorced,Executive
1953,Self Enquiry,Small Business,Female,Deluxe,Married,Manager


In [10]:

from sklearn.preprocessing import LabelEncoder

train_enc = train.copy()

# 모든 문자형 변수에 대해 encoder를 적용합니다.
for o_col in object_columns:
    encoder = LabelEncoder()
    encoder.fit(train_enc[o_col])
    train_enc[o_col] = encoder.transform(train_enc[o_col])

# 결과를 확인합니다.
train_enc

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome,ProdTaken
0,1,28.0,0,1,10.0,3,2,3,4.0,0,3.0,1,3.0,0,1,0,1.0,1,20384.000000,0
1,2,34.0,1,3,0.0,3,1,2,4.0,1,4.0,2,1.0,1,5,1,0.0,2,19599.000000,1
2,3,45.0,0,1,0.0,2,2,2,3.0,1,4.0,1,2.0,0,4,1,0.0,2,23624.108895,0
3,4,29.0,0,1,7.0,3,2,3,5.0,0,4.0,1,3.0,0,4,0,1.0,1,21274.000000,1
4,5,42.0,1,3,6.0,2,2,2,3.0,1,3.0,0,2.0,0,3,1,0.0,2,19907.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1950,1951,28.0,1,1,10.0,3,2,3,5.0,0,3.0,2,2.0,0,1,1,2.0,1,20723.000000,0
1951,1952,41.0,1,3,8.0,2,1,3,3.0,4,5.0,0,1.0,0,5,1,1.0,0,31595.000000,0
1952,1953,38.0,0,3,28.0,3,1,3,4.0,0,3.0,0,7.0,0,2,1,2.0,1,21651.000000,0
1953,1954,28.0,1,3,30.0,3,1,3,5.0,1,3.0,1,3.0,0,1,1,2.0,2,22218.000000,0


In [11]:
# MinMaxScaler를 준비해줍니다.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

train_scale = train_enc.copy()

# MinMaxScaler는 학습하는 과정을 필요로 합니다.
scaler.fit(train_scale[['Age', 'DurationOfPitch', 'MonthlyIncome']])

# 학습된 scaler를 사용하여 변환해줍니다.
train_scale[['Age', 'DurationOfPitch', 'MonthlyIncome']] = scaler.transform(train_scale[['Age', 'DurationOfPitch', 'MonthlyIncome']])

# 결과를 확인합니다.
train_scale

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome,ProdTaken
0,1,0.232558,0,1,0.277778,3,2,3,4.0,0,3.0,1,3.0,0,1,0,1.0,1,0.198448,0
1,2,0.372093,1,3,0.000000,3,1,2,4.0,1,4.0,2,1.0,1,5,1,0.0,2,0.190411,1
2,3,0.627907,0,1,0.000000,2,2,2,3.0,1,4.0,1,2.0,0,4,1,0.0,2,0.231619,0
3,4,0.255814,0,1,0.194444,3,2,3,5.0,0,4.0,1,3.0,0,4,0,1.0,1,0.207560,1
4,5,0.558140,1,3,0.166667,2,2,2,3.0,1,3.0,0,2.0,0,3,1,0.0,2,0.193565,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1950,1951,0.232558,1,1,0.277778,3,2,3,5.0,0,3.0,2,2.0,0,1,1,2.0,1,0.201919,0
1951,1952,0.534884,1,3,0.222222,2,1,3,3.0,4,5.0,0,1.0,0,5,1,1.0,0,0.313223,0
1952,1953,0.465116,0,3,0.777778,3,1,3,4.0,0,3.0,0,7.0,0,2,1,2.0,1,0.211419,0
1953,1954,0.232558,1,3,0.833333,3,1,3,5.0,1,3.0,1,3.0,0,1,1,2.0,2,0.217224,0


In [13]:
# 결측치 처리
# 0 으로 채우는 경우
test.DurationOfPitch = test.DurationOfPitch.fillna(0)

# mean 값으로 채우는 경우
mean_cols = ['Age','NumberOfFollowups','PreferredPropertyStar','NumberOfTrips','NumberOfChildrenVisiting','MonthlyIncome']
for col in mean_cols:
    test[col] = test[col].fillna(test[col].mean())

# "Unknown"으로 채우는 경우
test.TypeofContact = test.TypeofContact.fillna("Unknown")

# 문자형 변수 전처리
for o_col in object_columns:
    encoder = LabelEncoder()
    
    # test 데이터를 이용해 encoder를 학습하는 것은 Data Leakage 입니다! 조심!
    encoder.fit(train[o_col])
    
    # test 데이터는 오로지 transform 에서만 사용되어야 합니다.
    test[o_col] = encoder.transform(test[o_col])

# 숫자형 변수 scaling
# 학습된 scaler를 사용하여 변환해줍니다.
test[['Age', 'DurationOfPitch', 'MonthlyIncome']] = scaler.transform(test[['Age', 'DurationOfPitch', 'MonthlyIncome']])

# 최종 확인
test

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,1,0.325581,0,3,0.000000,3,2,2,5.000000,1,3.0,1,1.0,0,2,0,1.0,2,0.191118
1,2,0.651163,1,2,0.305556,3,2,3,3.701827,1,4.0,1,1.0,1,5,0,1.0,2,0.194732
2,3,0.441860,1,3,0.611111,3,2,3,4.000000,1,3.0,1,5.0,0,5,1,0.0,2,0.208174
3,4,0.581395,1,1,1.000000,3,2,3,6.000000,1,3.0,3,6.0,0,3,1,2.0,2,0.224718
4,5,0.162791,1,3,0.194444,1,1,4,4.000000,0,4.0,3,3.0,1,4,1,3.0,1,0.213764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2928,2929,0.837209,1,1,0.166667,3,1,2,3.000000,4,3.0,2,7.0,0,4,1,1.0,0,0.320727
2929,2930,0.348837,1,1,0.250000,3,0,4,2.000000,1,3.0,3,2.0,0,3,0,1.0,2,0.232734
2930,2931,0.348837,0,1,0.861111,2,2,4,4.000000,1,3.0,0,3.0,0,4,1,1.0,2,0.235334
2931,2932,0.186047,1,1,0.250000,3,2,4,2.000000,0,5.0,3,2.0,0,2,1,3.0,1,0.216036


In [14]:
from sklearn.ensemble import RandomForestClassifier

# 모델 선언
model = RandomForestClassifier()

In [15]:
# 분석할 의미가 없는 칼럼을 제거합니다.
train = train_scale.drop(columns=['id'])
test = test.drop(columns=['id'])

# 학습에 사용할 정보와 예측하고자 하는 정보를 분리합니다.
x_train = train.drop(columns=['ProdTaken'])
y_train = train[['ProdTaken']]

In [16]:
# 모델 학습
model.fit(x_train,y_train)

  


RandomForestClassifier()

In [17]:
# 학습된 모델을 이용해 결과값 예측후 상위 10개의 값 확인
prediction = model.predict(test)
print('----------------------예측된 데이터의 상위 10개의 값 확인--------------------\n')
print(prediction[:10])

----------------------예측된 데이터의 상위 10개의 값 확인--------------------

[0 0 0 0 1 0 0 1 0 0]


In [18]:
# 예측된 값을 정답파일과 병합
sample_submission['ProdTaken'] = prediction

# 정답파일 데이터프레임 확인
sample_submission.head()

Unnamed: 0,id,ProdTaken
0,1,0
1,2,0
2,3,0
3,4,0
4,5,1


In [19]:
# submission을 csv 파일로 저장합니다.
# index=False란 추가적인 id를 부여할 필요가 없다는 뜻입니다. 
# 정확한 채점을 위해 꼭 index=False를 넣어주세요.
sample_submission.to_csv('submission.csv',index = False)