In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import xgboost
import cv2
import imblearn

# Data Loading

In [None]:
# Load the glass dataset from the relative input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/glass/glass.csv')

In [None]:
# Display the loaded DataFrame as the cell output.
df

In [None]:
# List the column labels of the loaded DataFrame.
df.columns

In [None]:
# Summary statistics for the DataFrame's columns.
df.describe()

# Identifying Missing Values

In [None]:
# Count missing values per column.
# The DataFrame method is used instead of np.sum(DataFrame): the NumPy call
# reduces with axis=None, which recent pandas releases deprecate for
# DataFrame.sum (it is slated to return a single scalar instead of a
# per-column Series). The method form reduces per column explicitly.
df.isnull().sum()

# Identifying the Number of Classes

In [None]:
# Distinct class labels found in the 'Type' target column.
pd.unique(df['Type'])

In [None]:
# Feature matrix: every column of `df` except the 'Type' target.
X = df.drop(columns=['Type'])
# Disabled experiment: additionally dropping the 'RI' column.
#X = X.drop(columns=['RI'])
X

In [None]:
# Target vector: the 'Type' column of `df`.
Y = df.loc[:, 'Type']
Y

# Handling imbalanced data using SMOTETomek

In [None]:
from imblearn.combine import SMOTETomek

In [None]:
# Rebalance the classes with SMOTETomek (seeded for reproducibility).
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample`
# alias was deprecated and then removed, so calling it fails on current
# releases.
# NOTE(review): the resampling is applied to the full (X, Y) before the
# train/test split that follows, so resampled rows can land on both sides of
# the split and inflate the test scores. Consider splitting first and
# resampling only the training portion.
smt = SMOTETomek(random_state=42)
x_res, y_res = smt.fit_resample(X, Y)

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the resampled data (fixed seed), followed by the
# size of each partition: x_train, x_test, y_train, y_test.
x_train, x_test, y_train, y_test = train_test_split(
    x_res,
    y_res,
    test_size=0.2,
    random_state=42,
    stratify=y_res,
)
for part in (x_train, x_test, y_train, y_test):
    print(len(part))

In [None]:
# Print "<column>....<count>" for every feature column of X.
# The previous loop compared the whole frame against each column *label*
# (`X == x`) and indexed X with that boolean frame; that keeps every row, so
# it printed the total row count for each column regardless of its contents.
# Report the number of non-null values per column instead, iterating in
# column order rather than the arbitrary order of a set.
# NOTE(review): the original intent of this cell is unclear - confirm that a
# per-column non-null count is what is wanted here.
for col in X.columns:
    print('{}....{}'.format(col, X[col].count()))

In [None]:
# Print "<label>....<count>" for each distinct label in Y.
# NOTE(review): this inspects Y (the target before resampling), not y_res.
for label in set(Y):
    n_rows = (Y == label).sum()
    print('{}....{}'.format(label, n_rows))

In [None]:
# Hyper-parameter grid handed to GridSearchCV for the XGBoost classifier.
# Single-element lists pin a parameter; multi-element lists are searched.
parms = dict(
    n_estimators=[100],
    max_depth=[7],
    learning_rate=[1],
    gamma=[0.1, 1, 0.5, 0],
    subsample=[0.7],
    colsample_bylevel=[0.1, 0.3],
    colsample_bytree=[0.3, 0.5, 0.7],
    min_child_weight=[0.1, 0.3],
    reg_lambda=[0, 1, 0.5],
)

# Recorded parameter combination (presumably the best result of an earlier
# search run - TODO confirm):
#   colsample_bylevel = 0.1
#   colsample_bytree  = 0.3
#   gamma             = 0.1
#   learning_rate     = 1
#   max_depth         = 7
#   min_child_weight  = 0.1
#   n_estimators      = 100
#   reg_lambda        = 1
#   subsample         = 0.7

In [None]:
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_predict

# Search `parms` with 10-fold cross-validation on the training split, scoring
# each combination by macro-averaged F1 and using all available cores.
# NOTE(review): recent xgboost releases reportedly require class labels to be
# 0..n_classes-1; verify the 'Type' labels against the installed version.
xg = xgboost.XGBClassifier()
f1 = make_scorer(f1_score, average='macro')
grids = GridSearchCV(
    estimator=xg,
    param_grid=parms,
    scoring=f1,
    cv=10,
    n_jobs=-1,
)
grids.fit(x_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Evaluate the fitted grid search on the training split: confusion matrix,
# two blank lines, then the per-class precision/recall/F1 report.
y_train_pred = grids.predict(x_train)
print(confusion_matrix(y_train, y_train_pred))
print('\n')
print(classification_report(y_train, y_train_pred))

In [None]:
from sklearn.metrics import classification_report

# Evaluate the fitted grid search on the held-out split: confusion matrix,
# two blank lines, then the per-class classification report.
y_test_pred = grids.predict(x_test)
print(confusion_matrix(y_test, y_test_pred))
print('\n')
print(classification_report(y_test, y_test_pred))

In [None]:
# Cross-validated score (macro F1, per the scorer passed to GridSearchCV) of the best parameter combination.
grids.best_score_

In [None]:
# Parameter combination selected by the grid search.
grids.best_params_

# Handling imbalanced data using OVERSAMPLING

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Random over-sampler for the minority classes.
# A fixed random_state makes the duplicated rows - and therefore every score
# reported further down - reproducible across kernel restarts; the seed
# matches the one used for SMOTETomek and the train/test split.
rs = RandomOverSampler(random_state=42)

In [None]:
# Over-sample (X, Y) so the classes are balanced.
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample`
# alias was deprecated and then removed.
# NOTE(review): over-sampling before the train/test split means copies of the
# same original row can appear in both the training and the test partition,
# which inflates test scores. Consider splitting first and over-sampling only
# the training portion.
x_res, y_res = rs.fit_resample(X, Y)

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the over-sampled data (fixed seed), followed by
# the size of each partition: x_train, x_test, y_train, y_test.
x_train, x_test, y_train, y_test = train_test_split(
    x_res,
    y_res,
    test_size=0.2,
    random_state=42,
    stratify=y_res,
)
for part in (x_train, x_test, y_train, y_test):
    print(len(part))

In [None]:
# Print "<column>....<count>" for every feature column of X.
# The previous loop compared the whole frame against each column *label*
# (`X == x`) and indexed X with that boolean frame; that keeps every row, so
# it printed the total row count for each column regardless of its contents.
# Report the number of non-null values per column instead, iterating in
# column order rather than the arbitrary order of a set.
# NOTE(review): the original intent of this cell is unclear - confirm that a
# per-column non-null count is what is wanted here.
for col in X.columns:
    print('{}....{}'.format(col, X[col].count()))

In [None]:
# Print "<label>....<count>" for each distinct label in Y.
# NOTE(review): this inspects Y (the target before over-sampling), not y_res.
for label in set(Y):
    n_rows = (Y == label).sum()
    print('{}....{}'.format(label, n_rows))

In [None]:
# Hyper-parameter grid handed to GridSearchCV for the over-sampling run.
# Single-element lists pin a parameter; multi-element lists are searched.
parms = dict(
    n_estimators=[100],
    max_depth=[7],
    learning_rate=[1, 0.1, 0.5],
    gamma=[0.1, 1, 0.5],
    subsample=[0.7],
    colsample_bylevel=[0.1],
    colsample_bytree=[0.3, 0.7, 0.5],
    min_child_weight=[0.3, 0.7, 0.5, 1],
    reg_lambda=[0, 1, 0.5],
)

# Recorded parameter combination (presumably the best result of an earlier
# search run - TODO confirm):
#   colsample_bylevel = 0.1
#   colsample_bytree  = 0.3
#   gamma             = 0.1
#   learning_rate     = 1
#   max_depth         = 7
#   min_child_weight  = 0.3
#   n_estimators      = 100
#   reg_lambda        = 0
#   subsample         = 0.7

In [None]:
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_predict

# Search `parms` with 10-fold cross-validation on the training split, scoring
# each combination by macro-averaged F1 and using all available cores.
xg = xgboost.XGBClassifier()
f1 = make_scorer(f1_score, average='macro')
grids = GridSearchCV(
    estimator=xg,
    param_grid=parms,
    scoring=f1,
    cv=10,
    n_jobs=-1,
)
grids.fit(x_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Evaluate the fitted grid search on the training split: confusion matrix,
# two blank lines, then the per-class precision/recall/F1 report.
y_train_pred = grids.predict(x_train)
print(confusion_matrix(y_train, y_train_pred))
print('\n')
print(classification_report(y_train, y_train_pred))

In [None]:
from sklearn.metrics import classification_report

# Evaluate the fitted grid search on the held-out split: confusion matrix,
# two blank lines, then the per-class classification report.
y_test_pred = grids.predict(x_test)
print(confusion_matrix(y_test, y_test_pred))
print('\n')
print(classification_report(y_test, y_test_pred))

In [None]:
# Cross-validated score (macro F1, per the scorer passed to GridSearchCV) of the best parameter combination.
grids.best_score_

In [None]:
# Parameter combination selected by the grid search.
grids.best_params_

# Final Model

In [None]:
# Final classifier: fixed hyper-parameters, fitted on the current training
# split, then used to predict that same split.
final_params = {
    'colsample_bylevel': 0.1,
    'colsample_bytree': 0.3,
    'gamma': 0.1,
    'learning_rate': 0.1,
    'max_depth': 7,
    'min_child_weight': 0.3,
    'n_estimators': 100,
    'reg_lambda': 0,
    'subsample': 0.7,
}
xg = xgboost.XGBClassifier(**final_params)
xg.fit(x_train, y_train)
y_train_pred = xg.predict(x_train)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Training-split results for the final model: confusion matrix, two blank
# lines, then the per-class classification report.
print(confusion_matrix(y_train, y_train_pred))
print('\n')
print(classification_report(y_train, y_train_pred))

In [None]:
from sklearn.model_selection import cross_val_predict

# Replace the in-sample predictions with 5-fold cross-validated predictions
# for the training split.
y_train_pred = cross_val_predict(estimator=xg, X=x_train, y=y_train, cv=5)

In [None]:
from sklearn.metrics import classification_report

# Results for the cross-validated training predictions: confusion matrix,
# two blank lines, then the per-class classification report.
print(confusion_matrix(y_train, y_train_pred))
print('\n')
print(classification_report(y_train, y_train_pred))

In [None]:
from sklearn.metrics import classification_report

# Held-out results for the final model: confusion matrix, two blank lines,
# then the per-class classification report.
y_test_pred = xg.predict(x_test)
print(confusion_matrix(y_test, y_test_pred))
print('\n')
print(classification_report(y_test, y_test_pred))