In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import recall_score, classification_report, accuracy_score, precision_score, f1_score, confusion_matrix

In [None]:
# Reading dataset
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [None]:
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [None]:
df.shape

(10000, 14)

In [None]:
#Because of the direct relationship between 'stab' and 'stabf' ('stabf' = 'stable' if 'stab' <= 0, 'unstable' otherwise),
#'stab' should be dropped and 'stabf' will remain as the sole dependent variable (binary classification).
df = df.drop('stab', axis=1)
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [None]:
x = df.drop('stabf', axis=1)
y = df['stabf']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [None]:
scaler = StandardScaler()

x_train_norm = pd.DataFrame(scaler.fit_transform(x_train), columns = x_train.columns)
x_test_norm = pd.DataFrame(scaler.transform(x_test), columns = x_test.columns)

# Question 4

In [None]:
forest = RandomForestClassifier(random_state = 1)
forest.fit(x_train_norm, y_train)
pred_test = forest.predict(x_test_norm)
print('Accuracy: {}'.format(round(accuracy_score(y_test, pred_test), 4)))

Accuracy: 0.929


# Question 5

In [None]:
xgb = XGBClassifier(random_state = 1, learning_rate = 0.1, max_depth = 3)
xgb.fit(x_train_norm, y_train)
pred_test2 = xgb.predict(x_test_norm)
print('Accuracy: {}'.format(round(accuracy_score(y_test, pred_test2), 4)))

Accuracy: 0.9195


#Question 9 and 1

In [None]:
tree = ExtraTreesClassifier(random_state = 1)
tree.fit(x_train_norm, y_train)
pred_test3 = tree.predict(x_test_norm)
print('Accuracy: {}'.format(accuracy_score(y_test, pred_test3)))

Accuracy: 0.928


In [None]:
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

In [None]:
from sklearn.model_selection import RandomizedSearchCV
rsv = RandomizedSearchCV(tree, hyperparameter_grid, cv=5, n_iter = 10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)
search = rsv.fit(x_train_norm, y_train)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [None]:
##using the best parameters to train the dataset using tree model
tree = ExtraTreesClassifier(**search.best_params_, random_state = 1)
tree.fit(x_train_norm, y_train)
pred_test4 = tree.predict(x_test_norm)
print('Accuracy: {}'.format(accuracy_score(y_test, pred_test4)))

Accuracy: 0.927


#Question 20

In [None]:
lgbm = LGBMClassifier(random_state = 1)
lgbm.fit(x_train_norm, y_train)
pred_test5 = lgbm.predict(x_test_norm)
print('Accuracy: {}'.format(round(accuracy_score(y_test, pred_test5), 4)))

Accuracy: 0.9375


#Question 8

In [21]:
importance = pd.Series(tree.feature_importances_,  x_train_norm.columns).sort_values()
importance

p1      0.003683
p4      0.004962
p2      0.005337
p3      0.005429
g1      0.102562
g2      0.107578
g4      0.109541
g3      0.113063
tau3    0.134680
tau4    0.135417
tau1    0.137240
tau2    0.140508
dtype: float64