In [11]:
import numpy as np
import pandas as pd

In [12]:
# Load the raw battles table from the CSV file (path is relative to the notebook's working directory).
battles = pd.read_csv('dataset/battles.csv')

In [13]:
# Preview the first rows of the raw data to check that it loaded as expected.
battles.head()

Unnamed: 0,name,year,battle_number,attacker_king,defender_king,attacker_1,attacker_2,attacker_3,attacker_4,defender_1,...,major_death,major_capture,attacker_size,defender_size,attacker_commander,defender_commander,summer,location,region,note
0,Battle of the Golden Tooth,298,1,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,...,1.0,0.0,15000.0,4000.0,Jaime Lannister,"Clement Piper, Vance",1.0,Golden Tooth,The Westerlands,
1,Battle at the Mummer's Ford,298,2,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Baratheon,...,1.0,0.0,,120.0,Gregor Clegane,Beric Dondarrion,1.0,Mummer's Ford,The Riverlands,
2,Battle of Riverrun,298,3,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,...,0.0,1.0,15000.0,10000.0,"Jaime Lannister, Andros Brax","Edmure Tully, Tytos Blackwood",1.0,Riverrun,The Riverlands,
3,Battle of the Green Fork,298,4,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,...,1.0,1.0,18000.0,20000.0,"Roose Bolton, Wylis Manderly, Medger Cerwyn, H...","Tywin Lannister, Gregor Clegane, Kevan Lannist...",1.0,Green Fork,The Riverlands,
4,Battle of the Whispering Wood,298,5,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,...,1.0,1.0,1875.0,6000.0,"Robb Stark, Brynden Tully",Jaime Lannister,1.0,Whispering Wood,The Riverlands,


In [14]:
# Work on a copy so the corrections below leave the raw `battles` frame untouched.
df = battles.copy()

### *battle_type* dan *attacker_outcome* pada baris dengan indeks 37

mengisi missing value pada kolom *battle_type* dan *attacker_outcome* untuk baris dengan indeks 37, sesuai dengan *name* battle pada baris tersebut

In [15]:
# Inspect the row labelled 37 before filling in its missing values.
df.loc[37,['name','battle_type','attacker_outcome']]

name                Siege of Winterfell
battle_type                         NaN
attacker_outcome                    NaN
Name: 37, dtype: object

In [16]:
# Fill the missing battle_type for row 37; the value follows the battle's
# name ("Siege of Winterfell").
df.loc[37, 'battle_type'] = 'siege'

In [17]:
# Fill the missing attacker_outcome for row 37.
# NOTE(review): the raw data has no outcome for this row, so 'win' is a
# manually imputed value -- confirm the source for it.
df.loc[37, 'attacker_outcome'] = 'win'

In [18]:
# Re-display row 37 to confirm that both values were written.
df.loc[37,['name','battle_type','attacker_outcome']]

name                Siege of Winterfell
battle_type                       siege
attacker_outcome                    win
Name: 37, dtype: object

### *attacker_king* dan *defender_king* pada baris dengan indeks 27
melakukan penukaran nama *attacker_king* dan *defender_king* karena terjadi kesalahan pada dataset, hasil temuan dari eksplorasi dataset

In [19]:
# Show both king columns for the row labelled 27 before correcting them.
df.loc[27, ['attacker_king', 'defender_king']]

attacker_king    Stannis Baratheon
defender_king         Mance Rayder
Name: 27, dtype: object

In [20]:
# Correct row 27, where the dataset lists the two kings on the wrong sides:
# write the attacker and defender in a single assignment. The names are
# spelled out (rather than swapped programmatically) so that re-running the
# cell does not flip them back.
df.loc[27, ['attacker_king', 'defender_king']] = ['Mance Rayder', 'Stannis Baratheon']

In [21]:
# Re-display both king columns for row 27 to confirm the correction.
df.loc[27, ['attacker_king', 'defender_king']]

attacker_king         Mance Rayder
defender_king    Stannis Baratheon
Name: 27, dtype: object

## Feature Extraction

### *attacker_count* dan *defender_count*
menghitung jumlah major house attacker dan major house defender

In [22]:
# Create the two house-count columns as all-NaN placeholders; they are
# filled with real values in a later cell.
for placeholder in ('attacker_count', 'defender_count'):
    df[placeholder] = np.nan

In [23]:
# Preview the two house-count columns.
df[['attacker_count','defender_count']].head()

Unnamed: 0,attacker_count,defender_count
0,,
1,,
2,,
3,,
4,,


In [24]:
# Count, for every battle, how many of the four house slots on each side are
# filled in. A vectorised notna().sum(axis=1) gives the same integer counts as
# iterating over the rows, without a Python-level loop or accumulator lists.
attacker_house_cols = ['attacker_1', 'attacker_2', 'attacker_3', 'attacker_4']
defender_house_cols = ['defender_1', 'defender_2', 'defender_3', 'defender_4']

df['attacker_count'] = df[attacker_house_cols].notna().sum(axis=1)
df['defender_count'] = df[defender_house_cols].notna().sum(axis=1)

In [25]:
# Preview the two house-count columns.
df[['attacker_count','defender_count']].head()

Unnamed: 0,attacker_count,defender_count
0,1,1
1,1,1
2,1,1
3,1,1
4,2,1


### attacker_commander_count dan defender_commander_count
menghitung jumlah attacker commander dan defender commander

In [26]:
# Create the two commander-count columns as all-NaN placeholders; they are
# filled with real values in a later cell.
for placeholder in ('attacker_commander_count', 'defender_commander_count'):
    df[placeholder] = np.nan

In [27]:
# Preview the two commander-count columns.
df[['attacker_commander_count','defender_commander_count']].head()

Unnamed: 0,attacker_commander_count,defender_commander_count
0,,
1,,
2,,
3,,
4,,


In [28]:
# Count the commanders on each side by splitting the comma-separated name
# list and taking its length.
#
# The whole computation is a single assigned chain: calling
# fillna(..., inplace=True) on a column selected with df[col] is chained
# assignment, which recent pandas versions warn about and which does not
# update `df` when Copy-on-Write is enabled.
for source_col, count_col in (
    ('attacker_commander', 'attacker_commander_count'),
    ('defender_commander', 'defender_commander_count'),
):
    df[count_col] = (
        df[source_col]
        .str.split(',')
        .str.len()
        # A missing commander list yields NaN; treat it as "no commander" (0).
        .fillna(0)
        .astype('int64')
    )

In [29]:
# Preview the two commander-count columns.
df[['attacker_commander_count','defender_commander_count']].head()

Unnamed: 0,attacker_commander_count,defender_commander_count
0,1,2
1,1,1
2,2,2
3,5,4
4,2,1


## Drop Column
menghapus kolom dengan persentase nilai null lebih dari 60%

In [30]:
# Summarise missing data per column: absolute count, percentage of rows, and
# the column dtype. The three pieces are aligned on the column name when
# concatenated side by side.
null_counts = df.isnull().sum()
missing_values_total = null_counts.sort_values(ascending=False)
missing_values_pct = (null_counts / len(df) * 100).sort_values(ascending=False)

missing_values = pd.concat(
    {
        'Total': missing_values_total,
        'Percentage': missing_values_pct,
        'Dtype': df.dtypes,
    },
    axis=1,
)
# Show the ten columns with the most missing values.
missing_values.head(10)

Unnamed: 0,Total,Percentage,Dtype
defender_3,38,100.0,float64
defender_4,38,100.0,float64
attacker_4,36,94.736842,object
defender_2,36,94.736842,object
attacker_3,35,92.105263,object
note,33,86.842105,object
attacker_2,28,73.684211,object
defender_size,19,50.0,float64
attacker_size,14,36.842105,float64
defender_commander,10,26.315789,object


In [31]:
# Keep only the columns whose share of missing values is at most the
# threshold; columns with more than 60% nulls are dropped. The explicit
# copy makes the result an independent frame rather than a selection of
# the previous one.
MAX_NULL_FRACTION = 0.6
df = df.loc[:, df.isnull().mean() <= MAX_NULL_FRACTION].copy()

In [32]:
# List the remaining columns with their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      38 non-null     object 
 1   year                      38 non-null     int64  
 2   battle_number             38 non-null     int64  
 3   attacker_king             36 non-null     object 
 4   defender_king             35 non-null     object 
 5   attacker_1                38 non-null     object 
 6   defender_1                37 non-null     object 
 7   attacker_outcome          38 non-null     object 
 8   battle_type               38 non-null     object 
 9   major_death               37 non-null     float64
 10  major_capture             37 non-null     float64
 11  attacker_size             24 non-null     float64
 12  defender_size             19 non-null     float64
 13  attacker_commander        37 non-null     object 
 14  defender_com

In [33]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv('dataset/battles_cleaned.csv',index=False)

In [36]:
# Read the saved file back in to check what was written.
# NOTE(review): this cell's execution count (36) is higher than the next
# cell's (35), so the two were run out of order -- re-run the notebook
# top to bottom before relying on the displayed output.
dx = pd.read_csv('dataset/battles_cleaned.csv')

In [35]:
# Show the last rows of the re-loaded file, which include the corrected row 37.
dx.tail()

Unnamed: 0,name,year,battle_number,attacker_king,defender_king,attacker_1,defender_1,attacker_outcome,battle_type,major_death,...,defender_size,attacker_commander,defender_commander,summer,location,region,attacker_count,defender_count,attacker_commander_count,defender_commander_count
33,Second Seige of Storm's End,300,34,Joffrey/Tommen Baratheon,Stannis Baratheon,Baratheon,Baratheon,win,siege,0.0,...,200.0,"Mace Tyrell, Mathis Rowan",Gilbert Farring,0.0,Storm's End,The Stormlands,1,1,2,1
34,Siege of Dragonstone,300,35,Joffrey/Tommen Baratheon,Stannis Baratheon,Baratheon,Baratheon,win,siege,0.0,...,,"Loras Tyrell, Raxter Redwyne",Rolland Storm,0.0,Dragonstone,The Stormlands,1,1,2,1
35,Siege of Riverrun,300,36,Joffrey/Tommen Baratheon,Robb Stark,Lannister,Tully,win,siege,0.0,...,,"Daven Lannister, Ryman Fey, Jaime Lannister",Brynden Tully,0.0,Riverrun,The Riverlands,2,1,3,1
36,Siege of Raventree,300,37,Joffrey/Tommen Baratheon,Robb Stark,Bracken,Blackwood,win,siege,0.0,...,,"Jonos Bracken, Jaime Lannister",Tytos Blackwood,0.0,Raventree,The Riverlands,2,1,2,1
37,Siege of Winterfell,300,38,Stannis Baratheon,Joffrey/Tommen Baratheon,Baratheon,Bolton,win,siege,,...,8000.0,Stannis Baratheon,Roose Bolton,0.0,Winterfell,The North,4,2,1,1
