In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
df = pd.read_csv('kyphosis.csv')

In [5]:
df.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15


In [3]:
from sklearn.model_selection import train_test_split

In [7]:
x = df.drop('Kyphosis',axis=1)
y = df['Kyphosis']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 101)

In [9]:
y_train.value_counts()

absent     47
present     9
Name: Kyphosis, dtype: int64

In [4]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
rfc = RandomForestClassifier()

In [12]:
rfc.fit(x_train, y_train)

RandomForestClassifier()

In [16]:
pred = rfc.predict(x_test)

In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [17]:
confusion_matrix(y_test, pred)

array([[17,  0],
       [ 6,  2]])

In [32]:
19/25

0.76

## Undersampling
- Digunakan untuk mengurangi jumlah kelas major (banyak)
- Sehingga kelas major (banyak) akan sama dengan kelas minor (dikit)
- Menggunakan sampling berulang kali

## Oversampling
- Digunakan untuk menambahkan jumlah kelas minor (dikit)
- Sehingga kelas minor (dikit) akan sama dengan kelas major (banyak)
- Menggunakan SMOTE
- Lebih sering digunakan

### 1. Synthetic Minority Oversampling Technique (SMOTE)
- Menggunakan pendekatan equilibrium distance

In [6]:
from imblearn.over_sampling import SMOTE

In [18]:
sm = SMOTE(random_state = 101)

In [19]:
oversampled_trainx, oversampled_trainy = sm.fit_sample(x_train, y_train)

In [21]:
os_train = pd.concat([pd.DataFrame(oversampled_trainy), pd.DataFrame(oversampled_trainx)], axis=1)

In [23]:
pd.Series(oversampled_trainy).value_counts()

present    47
absent     47
Name: Kyphosis, dtype: int64

In [25]:
os_train.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,11,3,15
1,absent,140,4,15
2,absent,158,3,14
3,absent,127,4,12
4,present,139,3,10


In [26]:
df.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15


In [27]:
rfc.fit(oversampled_trainx, oversampled_trainy)

RandomForestClassifier()

In [28]:
pred_os = rfc.predict(x_test)

In [29]:
pred_os

array(['absent', 'present', 'present', 'absent', 'absent', 'absent',
       'absent', 'absent', 'absent', 'absent', 'absent', 'present',
       'present', 'absent', 'absent', 'absent', 'absent', 'absent',
       'present', 'absent', 'absent', 'present', 'absent', 'present',
       'present'], dtype=object)

In [30]:
confusion_matrix(y_test, pred_os)

array([[15,  2],
       [ 2,  6]])

In [31]:
21/25

0.84

## Tuning

### 1. GridSearchCV

In [7]:
from sklearn.model_selection import GridSearchCV

In [53]:
rfc2 = RandomForestClassifier()

In [54]:
cari = GridSearchCV(estimator = rfc2, 
                   param_grid = 
                   {'n_estimators' :[100,200,300,500],
                   'bootstrap' :[True, False],
                   'min_samples_split' :[2,5,10,20]},
                   scoring = 'accuracy',
                   cv = 5,
                   n_jobs = -1)

In [55]:
cari.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'min_samples_split': [2, 5, 10, 20],
                         'n_estimators': [100, 200, 300, 500]},
             scoring='accuracy')

In [56]:
cari.best_params_

{'bootstrap': True, 'min_samples_split': 10, 'n_estimators': 100}

In [46]:
rfc3 = RandomForestClassifier(bootstrap = True, min_samples_split = 10, n_estimators = 100)

In [57]:
rfc3.fit(x_train, y_train)

RandomForestClassifier(min_samples_split=10)

In [58]:
prd_new = rfc3.predict(x_test)

In [59]:
confusion_matrix(y_test, prd_new)

array([[17,  0],
       [ 8,  0]])

In [60]:
17/25

0.68

## Latihan

In [8]:
data = pd.read_csv('dataanime.csv')

In [9]:
data.head()

Unnamed: 0,Title,Type,Episodes,Status,Start airing,End airing,Starting season,Broadcast time,Producers,Licensors,Studios,Sources,Genres,Duration,Rating,Score,Scored by,Members,Favorites,Description
0,Fullmetal Alchemist: Brotherhood,TV,64,Finished Airing,2009-4-5,2010-7-4,Spring,Sundays at 17:00 (JST),"Aniplex,Square Enix,Mainichi Broadcasting Syst...","Funimation,Aniplex of America",Bones,Manga,"Action,Military,Adventure,Comedy,Drama,Magic,F...",24 min. per ep.,R,9.25,719706,1176368,105387,"""In order for something to be obtained, someth..."
1,Kimi no Na wa.,Movie,1,Finished Airing,2016-8-26,-,-,-,"Kadokawa Shoten,Toho,Sound Team Don Juan,Lawso...","Funimation,NYAV Post",CoMix Wave Films,Original,"Supernatural,Drama,Romance,School",1 hr. 46 min.,PG-13,9.19,454969,705186,33936,"Mitsuha Miyamizu, a high school girl, yearns t..."
2,Gintama°,TV,51,Finished Airing,2015-4-8,2016-3-30,Spring,Wednesdays at 18:00 (JST),"TV Tokyo,Aniplex,Dentsu","Funimation,Crunchyroll",Bandai Namco Pictures,Manga,"Action,Comedy,Historical,Parody,Samurai,Sci-Fi...",24 min. per ep.,R,9.16,70279,194359,5597,"Gintoki, Shinpachi, and Kagura return as the f..."
3,Steins;Gate 0,TV,23,Currently Airing,2018-4-12,-,Spring,Thursdays at 01:35 (JST),Nitroplus,Funimation,White Fox,Visual novel,"Sci-Fi,Thriller",23 min. per ep.,PG-13,9.16,12609,186331,1117,The dark untold story of Steins;Gate that lead...
4,Steins;Gate,TV,24,Finished Airing,2011-4-6,2011-9-14,Spring,Wednesdays at 02:05 (JST),"Frontier Works,Media Factory,Movic,AT-X,Kadoka...",Funimation,White Fox,Visual novel,"Sci-Fi,Thriller",24 min. per ep.,PG-13,9.14,552791,990419,90365,The self-proclaimed mad scientist Rintarou Oka...


In [10]:
data.drop(['Title','Episodes','Start airing','End airing','Broadcast time','Producers','Licensors','Studios','Sources','Genres','Duration','Description','Starting season','Type','Rating'],axis=1,inplace=True)

In [11]:
data.head()

Unnamed: 0,Status,Score,Scored by,Members,Favorites
0,Finished Airing,9.25,719706,1176368,105387
1,Finished Airing,9.19,454969,705186,33936
2,Finished Airing,9.16,70279,194359,5597
3,Currently Airing,9.16,12609,186331,1117
4,Finished Airing,9.14,552791,990419,90365


In [12]:
x = data.drop('Status', axis=1)
y = data['Status']

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 101)

In [14]:
y_train.value_counts()

Finished Airing     1067
Currently Airing      27
Name: Status, dtype: int64

In [15]:
sm = SMOTE(random_state = 101)

In [16]:
oversampled_trainx, oversampled_trainy = sm.fit_sample(x_train, y_train)

In [17]:
os_train = pd.concat([pd.DataFrame(oversampled_trainy), pd.DataFrame(oversampled_trainx)], axis=1)

In [18]:
os_train.head()

Unnamed: 0,Status,Score,Scored by,Members,Favorites
0,Finished Airing,7.81,29337,64191,142
1,Finished Airing,7.8,176465,302074,1760
2,Finished Airing,7.84,24861,68933,1285
3,Finished Airing,7.7,22877,64870,685
4,Finished Airing,7.55,3817,10222,130


In [19]:
pd.Series(oversampled_trainy).value_counts()

Currently Airing    1067
Finished Airing     1067
Name: Status, dtype: int64

In [22]:
rfc.fit(oversampled_trainx, oversampled_trainy)

RandomForestClassifier()

In [23]:
import random
idx_sample = random.sample(range(len(data)),200)
sample_x = x.iloc[idx_sample]
sample_y = y.iloc[idx_sample]

In [24]:
pred = rfc.predict(sample_x)

In [25]:
confusion_matrix(sample_y, pred)

array([[  4,   0],
       [  1, 195]])

In [30]:
199/200

0.995

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
logisticmodel = LogisticRegression()
logisticmodel.fit (x_train, y_train)

LogisticRegression()

In [33]:
pred_lr = logisticmodel.predict(sample_x)

In [34]:
confusion_matrix(sample_y, pred_lr)

array([[  0,   4],
       [  2, 194]])

In [35]:
194/200

0.97