In [22]:
# import important libraries
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.utils
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
from xgboost import XGBClassifier

In [23]:
df = pd.read_csv('Data_for_UCI_named.csv')
df

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.959060,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.781760,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.277210,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.669600,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.797110,0.455450,0.656947,0.820923,0.049860,unstable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,0.023892,unstable
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,-0.025803,stable
9997,2.364034,2.842030,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,-0.031810,stable
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,0.037789,unstable


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [25]:
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [26]:
df['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [27]:
# shuffle and assign inputs and outputs for the model
df1 = shuffle(df)
df1 = df1.reset_index(drop=True)

inputs = df1.drop(columns=['stabf', 'stab'])
outputs = df1['stabf']

In [28]:
# split data
x_train, x_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.2, random_state=1)

In [29]:
x_test.reset_index(drop=True)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,9.097629,7.895145,8.501274,7.837818,3.366299,-0.659356,-1.856660,-0.850283,0.633081,0.144480,0.776211,0.501516
1,4.624190,5.436351,6.305130,2.990331,2.610646,-0.806854,-0.599730,-1.204062,0.886401,0.502993,0.385810,0.851986
2,5.996614,4.907569,0.568923,6.234950,2.823046,-0.716716,-0.559305,-1.547024,0.685726,0.361178,0.505085,0.192988
3,5.562805,6.487545,5.783878,9.602979,3.635305,-1.948495,-0.506887,-1.179923,0.168576,0.866844,0.841695,0.493653
4,0.675248,9.504198,3.416111,3.706283,3.619465,-0.842804,-0.802796,-1.973865,0.066769,0.107896,0.827136,0.770769
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,6.713878,3.528662,2.561357,1.543185,3.111843,-0.845011,-1.734586,-0.532246,0.573419,0.349372,0.766504,0.067430
1996,6.235470,5.840562,3.856312,9.600154,5.174934,-1.938965,-1.645219,-1.590750,0.648965,0.159546,0.640387,0.441351
1997,5.660502,3.894512,6.654106,7.302559,4.101001,-1.909865,-0.524942,-1.666194,0.971766,0.221357,0.393227,0.693821
1998,2.957239,7.989375,3.803450,3.538377,4.561326,-1.558625,-1.620463,-1.382237,0.701205,0.646816,0.280255,0.850290


In [30]:
# scale dataset
scaler = StandardScaler()

X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

RANDOM FOREST CLASSIFICATION

In [31]:
# train model with random forest
rf_class = RandomForestClassifier(random_state=1)
rf_class.fit(X_train, y_train)

rf_class_p = rf_class.predict(X_test)

In [32]:
# confusion_matrix
conf_matrix = confusion_matrix(y_true=y_test, y_pred=rf_class_p, labels=['stable', 'unstable'])
conf_matrix

array([[ 610,  114],
       [  58, 1218]], dtype=int64)

In [33]:
accuracy = accuracy_score(y_true=y_test, y_pred=rf_class_p)
print(f'accuracy: {accuracy*100}')

accuracy: 91.4


In [34]:
precision = precision_score(y_true=y_test, y_pred=rf_class_p, pos_label='stable')
print(f'precision: {round((precision*100), 1)}')

precision: 91.3


In [35]:
recall = recall_score(y_true=y_test, y_pred=rf_class_p, pos_label='stable')
print(f'recall: {round((recall*100), 1)}')

recall: 84.3


In [37]:
et_class = ExtraTreesClassifier(random_state=1)
et_class.fit(X_train, y_train)

et_class_p = et_class.predict(X_test)

In [38]:
accuracy = accuracy_score(y_true=y_test, y_pred=et_class_p)
print(f'accuracy: {accuracy*100}')

accuracy: 92.9


In [39]:
precision = precision_score(y_true=y_test, y_pred=et_class_p, pos_label='stable')
print(f'precision: {round((precision*100), 1)}')

precision: 95.9


In [40]:
recall = recall_score(y_true=y_test, y_pred=et_class_p, pos_label='stable')
print(f'recall: {round((recall*100), 1)}')

recall: 84.0


 RANDOMIZED CROSS VALIDATION SEARCH (RandomizedSearchCV)

In [42]:
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameters_grid = {'n_estimators': n_estimators,
                        'min_samples_leaf': min_samples_leaf,
                        'min_samples_split': min_samples_split,
                        'max_features': max_features}

search_cv = RandomizedSearchCV(et_class, hyperparameters_grid, random_state=1)
search = search_cv.fit(X_train, y_train)

In [43]:
search.best_params_ # Get the best parameters for the model

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

XGBOOST CLASSIFICATION

In [44]:
et_class1 = ExtraTreesClassifier(n_estimators=1000, min_samples_split=2, min_samples_leaf=8, max_features=None, random_state=1)
et_class1.fit(X_train, y_train)

et_class1 = et_class1.predict(X_test)

In [45]:
accuracy1 = accuracy_score(y_true=y_test, y_pred=et_class1)
print(f'accuracy: {accuracy*100}')

accuracy: 92.9


In [None]:
# importance_levels = pd.DataFrame({'feature':X_train.columns+,'importance':np.round(et_class1.feature_importances_,3)})
# importance_levels = importance_levels.sort_values('importance',ascending = False).set_index('feature')
# importance_levels

In [46]:
xgb = XGBClassifier(random_state=1)
xgb.fit(X_train, y_train)

xgb_p = xgb.predict(X_test)

In [47]:
conf_matrix = confusion_matrix(y_true=y_test, y_pred=xgb_p, labels=['stable', 'unstable'])
conf_matrix

array([[ 655,   69],
       [  41, 1235]], dtype=int64)

In [48]:
accuracy = accuracy_score(y_true=y_test, y_pred=xgb_p)
print(f'Accuracy: {round((accuracy * 100), 1)}')

Accuracy: 94.5


In [49]:
recall = recall_score(y_true=y_test, y_pred=xgb_p, pos_label='stable')
print(f'recall_score: {round((recall * 100), 1)}')

recall_score: 90.5


In [50]:
precision = precision_score(y_true=y_test, y_pred=xgb_p, pos_label='stable')
print(f'precision: {round((precision * 100), 1)}')

precision: 94.1
