# 1. 데이터 업로드

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
train = pd.read_csv('./data/Pakistan.csv', encoding = 'unicode_escape')
print(train.shape)
train.head()

(51, 18)


Unnamed: 0,S#,Politician,Day,Date,Day Type,Time,City,Location,Location Category,Province,Latitude,Longititude,Target Category,Space (Open/Closed),Party,Target Status,Killed,Injured
0,1,Liaqat Ali Khan,Tuesday,16-Oct-51,Working Day,Evening,Rawalpindi,"Company Bagh (Company Gardens), Rawalpindi",Park/Ground,Punjab,33.6058,73.0437,Target,Open,Alll India Muslim League,Killed,1,1
1,2,Khan abdul Jabbar Khan,Friday,9-May-58,Working Day,Morning,Lahore,"Sadullah Khan's house at 16 Aikman Road, GOR",Residence,Punjab,31.482635,74.071272,Target,Open,Chief Minister of West Pakistan,Killed,1,0
2,3,Hayyat Sherpao,Friday,8-Feb-85,Working Day,Evening,Peshawar,Campus of University of Peshawar,University,KPK,33.999533,71.42502,Target,Open,PPPP,Killed,1,0
3,4,Najeeb Ahmed,Friday,6-Apr-90,Working Day,After Noon,Karachi,North Nazimabad,Residence,Sindh,24.9918,66.9911,Target,Closed,Student wing of PPP,Killed,1,0
4,5,Azeem Ahmed Tariq,Saturday,1-May-93,Working Day,Morning,Karachi,near his house,Residence,Sindh,24.9918,66.9911,Target,Closed,MQM,Killed,1,0


### 1.1) 타겟 분포 및 카테고리 변수 확인

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 18 columns):
S#                     51 non-null int64
Politician             51 non-null object
Day                    51 non-null object
Date                   51 non-null object
Day Type               51 non-null object
Time                   51 non-null object
City                   51 non-null object
Location               51 non-null object
Location Category      48 non-null object
Province               51 non-null object
Latitude               51 non-null float64
Longititude            51 non-null float64
Target Category        51 non-null object
Space (Open/Closed)    51 non-null object
Party                  51 non-null object
Target Status          51 non-null object
Killed                 51 non-null int64
Injured                51 non-null int64
dtypes: float64(2), int64(3), object(13)
memory usage: 7.3+ KB


In [4]:
train['Target Category'].value_counts()
# Imbalanced class는 아닌 듯 하다.

Suicide    27
Target     24
Name: Target Category, dtype: int64

In [5]:
object_column = train.dtypes[train.dtypes == 'object'].index.tolist()
for column in object_column:
    print("=================== " + column + " ===================")
    print(train[column].unique())
    print(train[column].value_counts())

['Liaqat Ali Khan' 'Khan abdul Jabbar Khan' 'Hayyat Sherpao'
 'Najeeb Ahmed' 'Azeem Ahmed Tariq' 'Hakim Muhammad Said'
 'Maulana Azam Tariq' 'Pervez Musharaf' 'Shaukat Aziz' 'Zil-e-Huma Usman'
 'Aftab Ahmad Khan Sherpao' 'Political Meeting' 'Abdul Razzaq Bugti'
 'Benazir Bhutto' 'Ahsan Iqbal' 'Miangul Asfandyar Amir Zeb'
 'Khursheed Khan' ' Rashid Akbar Nawani' 'Hussain Ali Yousafi'
 'Muhammed Khan Baba Jan' 'Malik Swab Khan' 'Habib Jalib Baloch'
 '\xa0Asim Ali Kurd' 'Mir Nooruddin Mengal' '\xa0Amjad Ali Khan'
 'Muhammad Aslam Khan Raisani' 'Salman Taseer' 'Shahbaz Bhatti'
 'Bashir Ahmed Bilour' 'Syed Manzar Imam' '\xa0Khalid Mumtaz Kundi'
 'Ghulam Ahmed Bilour' 'Dr Mohammad Ibrahim Jatoi\xa0' 'Syed Janan'
 'Zahra Shahid Hussain' 'Farid Khan' 'Imran Khan Mohmand'
 'Israr Ullah Khan Gandapur' 'Col Shuja Khanzada\xa0'
 'Muhammad Amjad Farooq' 'Soran Singh' 'Sherin malik' 'Haroon Bilour'
 'Akram Khan Durrani' 'Siraj Raisani']
Political Meeting              5
Pervez Musharaf               

데이터 확인을 하였으니, 필요없는 변수 빼기

1. Day 요일 별
2. Date -> 특정 기간에서 빼기
3. Day Type -> Label encoding
4. Time = Label encoding
5. City = One-hot
6. Location  제거
7. Location Category 대신 넣기
8. Province 
9. Space (Open/Closed) : label
10. Party = label
11. Target Status = label


#### Target Category = target 값

### 2. 결측값 확인

In [6]:
train.isnull().sum() 
# 결측값 최빈값으로 대체
train = train.fillna('Residence')

### 3. 레이블 인코딩

In [7]:
train.drop(['S#','Politician', 'Latitude', 'Longititude', 'Date','Location'], axis = 1, inplace = True)
object_columns = train.dtypes[train.dtypes == 'object'].index.tolist()

# LabelEncoder() 와 같은 pandas의 factorize
for column in object_columns:
    train[column] = pd.factorize(train[column])[0]

# 3. 모델링

In [8]:
# 레이블 인코딩용
Train = train.drop('Target Category', axis = 1, inplace = False)
target = train['Target Category']

print(Train.shape , target.shape)

(51, 11) (51,)


### 3.1 RandomForest

In [15]:
rf_clf = RandomForestClassifier(n_jobs = -1,random_state = 2015)                               
scores = cross_val_score(rf_clf, Train , target , scoring = 'f1', cv=10)

for iter_count, f1_score in enumerate(scores):
    print("fold {0} F1 score: {1:.4f}".format(iter_count+1, f1_score))

print("Total (Average) F1 score: {0:.4f}".format(np.mean(scores))) 

fold 1 F1 score: 1.0000
fold 2 F1 score: 1.0000
fold 3 F1 score: 0.8571
fold 4 F1 score: 1.0000
fold 5 F1 score: 1.0000
fold 6 F1 score: 1.0000
fold 7 F1 score: 1.0000
fold 8 F1 score: 1.0000
fold 9 F1 score: 0.0000
fold 10 F1 score: 0.6667
Total (Average) F1 score: 0.8524


In [10]:
rf_clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=-1, oob_score=False, random_state=2015, verbose=0,
                       warm_start=False)

### 3.2 최적화

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(Train, target, test_size=0.2, random_state=2015)

params = { 'n_estimators' : [80, 100, 120],
           'max_depth' : [2, 3, 4, 5],
           'min_samples_leaf' : [1, 2, 3],
           'min_samples_split' : [2, 3, 4]
            }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행하였습니다.
rf_clf = RandomForestClassifier(random_state = 2015, n_jobs = -1)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 3, scoring='f1', n_jobs = -1)
grid_cv.fit(X_train, Y_train)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 F1 score: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 120}
최고 예측 F1 score: 0.9594




## 4. Measures

In [13]:
rf_clf = RandomForestClassifier(n_estimators = 80,
                                max_depth = 2,
                                min_samples_leaf = 1,
                                min_samples_split = 2,
                                 n_jobs = -1,
                                random_state = 2015)
                                
scores = cross_val_score(rf_clf, Train , target , scoring = 'f1', cv=10)

for iter_count, f1_score in enumerate(scores):
    print("fold {0} F1 score: {1:.4f}".format(iter_count+1, f1_score))

print("Total (Average) F1 score: {0:.4f}".format(np.mean(scores))) 


fold 1 F1 score: 1.0000
fold 2 F1 score: 1.0000
fold 3 F1 score: 1.0000
fold 4 F1 score: 0.8000
fold 5 F1 score: 0.8000
fold 6 F1 score: 1.0000
fold 7 F1 score: 1.0000
fold 8 F1 score: 1.0000
fold 9 F1 score: 0.8000
fold 10 F1 score: 0.6667
Total (Average) F1 score: 0.9067
