In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.utils import compute_class_weight
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [None]:
# Load the glass dataset from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer="../input/glass/glass.csv")

In [None]:
# Preview the first five rows of the raw table.
data.head(n=5)

In [None]:
# Split the table by column position: the first nine columns become the
# predictors and the tenth column becomes the target
# (presumably the `Type` column used further down -- TODO confirm).
feature_positions = np.arange(9)
features = data.iloc[:, feature_positions]
labels = data.iloc[:, 9]

In [None]:
# Preview the first five rows of the predictor columns.
features.head(n=5)

In [None]:
# Distinct target values, in order of first appearance.
pd.unique(labels)

In [None]:
# Number of rows per glass Type -- shows how imbalanced the classes are.
data['Type'].value_counts()

In [None]:
# Distinct values present in the Type column.
data.Type.unique()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=20,
)

In [None]:
# 'balanced' class weights, computed over the Type column of the full dataset
# (not only the training split).
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(data['Type']),
    y=data['Type'],
)
# Key each weight by its position in the weight array (0, 1, 2, ...),
# not by the class label itself.
class_weights_dict = {position: weight for position, weight in enumerate(class_weights)}
class_weights_dict

In [None]:
# Re-key the balanced weights by the glass Type each one belongs to.
# compute_class_weight returns one weight per entry of the `classes` array it
# was given (np.unique(data['Type'])), in that same order. The labels are
# therefore read from the data instead of a hand-typed list, which would
# silently pair weights with the wrong class if the set of Types ever changed.
old_keys = list(range(len(class_weights)))   # positional keys, as used by class_weights_dict
new_keys = np.unique(data['Type']).tolist()  # actual class labels, sorted ascending
weights_final = dict(zip(new_keys, class_weights))
weights_final

In [None]:
# Base estimator for the search. A fixed seed makes the search repeatable across
# kernel restarts; 20 is the same seed the train/test split uses.
rf_classifier = RandomForestClassifier(random_state=20)

# Hyper-parameters shared by every candidate.
# 'auto' is intentionally absent from max_features: for a classifier it is an
# alias of 'sqrt' (and newer scikit-learn releases reject it), so it only
# duplicated a third of the fits.
common_grid = {
    'n_estimators': [10, 20, 30, 50, 70, 80, 100, 120, 150, 180, 200, 220, 280, 320],
    'max_features': ['sqrt', 'log2'],
    'class_weight': [None, 'balanced'],
    'criterion': ['gini', 'entropy'],
}

# oob_score=True is only valid together with bootstrap=True. The combination
# bootstrap=False / oob_score=True raises ValueError at fit time, which made a
# quarter of the original grid fail and be scored as NaN. Splitting the grid in
# two generates only the valid combinations.
param_grid = [
    {**common_grid, 'bootstrap': [True], 'oob_score': [True, False]},
    {**common_grid, 'bootstrap': [False], 'oob_score': [False]},
]  # a list of dictionaries

# 5-fold cross-validated exhaustive search using all available cores;
# train scores are kept so over-fitting can be inspected in cv_results_.
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, return_train_score=True, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Mean cross-validated score of the best hyper-parameter combination.
grid_search.best_score_

____

In [None]:
# Random forest with a fixed, hand-entered hyper-parameter set
# (presumably copied from grid_search.best_params_ of an earlier run -- TODO confirm),
# trained on the training split.
best_param_classifier_1 = RandomForestClassifier(
    n_estimators=150,
    criterion='entropy',
    max_features='log2',
    bootstrap=False,
    oob_score=False,
    class_weight='balanced',
)
best_param_classifier_1.fit(X_train, y_train)

In [None]:
# Predict the held-out test rows and report the fraction classified correctly.
rf_prediction_test = best_param_classifier_1.predict(X_test)
accuracy_score(y_true=y_test, y_pred=rf_prediction_test)

In [None]:
# Same accuracy check on the training rows, for comparison with the test accuracy.
rf_prediction_train = best_param_classifier_1.predict(X_train)
accuracy_score(y_true=y_train, y_pred=rf_prediction_train)

In [None]:
# Per-class precision on the test split (average=None returns one value per class).
precision_score(y_true=y_test, y_pred=rf_prediction_test, average=None)

In [None]:
# Per-class recall on the test split (average=None returns one value per class).
recall_score(y_true=y_test, y_pred=rf_prediction_test, average=None)

In [None]:
# Per-class F1 on the test split; the entries for the rare classes show how well
# the model copes with the minority classes.
f1_score(y_true=y_test, y_pred=rf_prediction_test, average=None)

In [None]:
# Confusion matrix of the fitted forest on the test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions the replacement is ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(estimator=best_param_classifier_1, X=X_test, y_true=y_test)

In [None]:
# Confusion matrix of the fitted forest on the training split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions the replacement is ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(estimator=best_param_classifier_1, X=X_train, y_true=y_train)

In [None]:
from sklearn.metrics import classification_report

# Text table of per-class precision, recall, F1 and support on the test split.
# It is printed rather than displayed so the line breaks in the report render.
test_report = classification_report(y_true=y_test, y_pred=rf_prediction_test)
print(test_report)

In [None]:
# Importance of each input feature according to the fitted forest, one value per
# column, in the column order of X_train (the frame the model was fitted on).
best_param_classifier_1.feature_importances_

_____