## Imports

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict,GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier

## Data Preprocessing

In [None]:
# Read the brain-tumor feature table from the Kaggle input directory.
data = pd.read_csv(filepath_or_buffer="/kaggle/input/brain-tumor/Brain Tumor.csv")

# Preview the first rows (rendered as the cell output).
data.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded table.
data.info()

In [None]:
# Descriptive statistics of the loaded table (rendered as the cell output).
data.describe()

In [None]:
# Shuffle the rows. DataFrame.sample returns a NEW frame, so the result must be
# assigned back -- calling it as a bare statement leaves `data` in file order.
# The index is reset so that the feature frame rebuilt below (which gets a
# fresh 0..n-1 index) and the label frame taken from `data` stay
# index-aligned; the plotting cell joins them with pd.concat, which aligns on
# the index. A fixed seed keeps the later positional train/test split
# reproducible between runs.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Feature matrix: every column after the first two.
dataX = data.iloc[:, 2:]

# Rescale each feature to the [0, 1] range and keep the column names.
# NOTE(review): the scaler is fitted on all rows, including the rows that are
# later held out as test data, so the hold-out set leaks into the scaling
# statistics. Consider fitting on the training rows only.
scaler = MinMaxScaler()
dxs = scaler.fit_transform(dataX)
dataX = pd.DataFrame(dxs, columns=dataX.columns)

dataX.describe()

In [None]:
# Label encoding of the "Class" column:
#   1 -> tumor
#   0 -> non-tumor

# Keep the target as a single-column DataFrame.
dataY = data.loc[:, ["Class"]]

# Show how many samples fall into each class.
classCounts = dataY.value_counts()
print("dataY\n", classCounts, "\n")

## Data Visualization

In [None]:
# Bar chart of the number of samples per class.
# value_counts() orders its result by frequency, not by class label, so
# hard-coded tick labels can end up on the wrong bar. Sort by label and derive
# the tick labels from the index so every bar is named after the class it
# actually counts.
countsByClass = dataY["Class"].value_counts().sort_index()
barLabels = [str(label) for label in countsByClass.index]

plt.bar(barLabels, countsByClass.values)
plt.title("Distribution")
plt.show()

In [None]:
# Correlation heatmap of the numeric columns (features and the class label).
plt.subplots(figsize=(12, 12))

# Restrict to numeric columns before correlating: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (older versions silently dropped them).
# NOTE(review): presumably the first column is a non-numeric image id, since
# it is excluded from the features -- confirm against the CSV.
numericData = data.select_dtypes(include="number")
sns.heatmap(numericData.corr(), linewidths=.01, cmap="coolwarm")
plt.show()

In [None]:
# Pairwise scatter plots of three selected features, coloured by class.
colors = {0: "#00b8e6", 1: "#e62e00"}
colms = ["Entropy", "Energy", "Homogeneity"]

fig, axes = plt.subplots(nrows=3, ncols=3)

# Join features and labels column-wise, then split the rows by class.
grouped = pd.concat([dataX, dataY], axis=1).groupby("Class")

# Row index selects the x feature, column index selects the y feature.
for row, xCol in enumerate(colms):
    for col, yCol in enumerate(colms):
        panel = axes[row, col]
        # Draw one scatter layer per class on the same axes.
        for key, group in grouped:
            group.plot(
                ax=panel,
                kind="scatter",
                x=xCol,
                y=yCol,
                label=key,
                color=colors[key],
                figsize=(12, 12),
            )
plt.show()

## Classification

In [None]:
# External test data: rows from position 3262 onwards are held out, and the
# rows before that position remain as the training/validation data.
splitRow = 3262

dataXTest, dataYTest = dataX.iloc[splitRow:], dataY.iloc[splitRow:]
dataX, dataY = dataX.iloc[:splitRow], dataY.iloc[:splitRow]

### Gradient Boost

In [None]:
# Hyper-parameter grid explored for the gradient-boosting classifier.
params = dict(
    n_estimators=[20, 50, 100],
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.05, 0.01, 0.005],
    subsample=[0.6, 0.8, 1.0],
)

gradientBoost = GradientBoostingClassifier()

# Exhaustive search over the grid with 10-fold cross-validation.
gridSCV = GridSearchCV(estimator=gradientBoost, param_grid=params, cv=10)
gridSCV.fit(dataX, dataY.values.ravel())

# Keep the best estimator found by the search for the evaluation cells.
best_gb = gridSCV.best_estimator_

gridSCV.best_params_

In [None]:
# Flatten the single-column label frame into a 1-D array once.
gbTrainTarget = dataY.values.ravel()

# Fit the tuned model on the training rows, then collect 10-fold
# cross-validated predictions for the same rows.
best_gb.fit(dataX.values, gbTrainTarget)
gboostPred = cross_val_predict(best_gb, X=dataX, y=gbTrainTarget, cv=10)

# Confusion matrix and accuracy of the cross-validated predictions.
gbTrainCM = confusion_matrix(gbTrainTarget, gboostPred)
sns.heatmap(gbTrainCM, annot=True, fmt="d")
plt.show()
print(accuracy_score(gbTrainTarget, gboostPred))

In [None]:
# Evaluate the trained gradient-boosting model on the held-out test rows.
# cross_val_predict would clone the estimator and re-fit it on folds of the
# test set itself, so it would never score the model trained on the training
# rows; call predict() on the fitted model instead.
# best_gb was last fitted on a plain array (dataX.values), so a plain array is
# passed here as well.
gbTestTarget = dataYTest.values.ravel()
gboostPredTest = best_gb.predict(dataXTest.values)

sns.heatmap(confusion_matrix(gbTestTarget, gboostPredTest), annot=True, fmt="d")
plt.show()
print(accuracy_score(gbTestTarget, gboostPredTest))

### XGBoost

In [None]:
# XGBoost: hyper-parameter grid explored for the classifier.
params = dict(
    min_child_weight=[0.1, 0.2, 0.5],
    max_depth=[2, 4, 6, 8],
    subsample=[0.1, 0.2, 0.5, 1],
    n_estimators=range(50, 450, 50),
    learning_rate=[0.1, 0.05, 0.01],
)

xgboost = XGBClassifier()

# Exhaustive search over the grid with 10-fold cross-validation.
gridSCV = GridSearchCV(estimator=xgboost, param_grid=params, cv=10)
gridSCV.fit(dataX.values, dataY.values.ravel())

# Keep the best estimator found by the search for the evaluation cells.
best_xgboost = gridSCV.best_estimator_
gridSCV.best_params_

In [None]:
# Flatten the single-column label frame into a 1-D array once.
xgbTrainTarget = dataY.values.ravel()

# Fit the tuned model on the training rows, then collect 10-fold
# cross-validated predictions for the same rows.
best_xgboost.fit(dataX.values, xgbTrainTarget)
xgboostPred = cross_val_predict(best_xgboost, X=dataX, y=xgbTrainTarget, cv=10)

# Confusion matrix and accuracy of the cross-validated predictions.
xgbTrainCM = confusion_matrix(xgbTrainTarget, xgboostPred)
sns.heatmap(xgbTrainCM, annot=True, fmt="d")
plt.show()
print(accuracy_score(xgbTrainTarget, xgboostPred))

In [None]:
# Evaluate the trained XGBoost model on the held-out test rows.
# cross_val_predict would clone the estimator and re-fit it on folds of the
# test set itself, so it would never score the model trained on the training
# rows; call predict() on the fitted model instead.
# best_xgboost was fitted on a plain array (dataX.values), so a plain array is
# passed here as well.
xgbTestTarget = dataYTest.values.ravel()
xgboostPredTest = best_xgboost.predict(dataXTest.values)

sns.heatmap(confusion_matrix(xgbTestTarget, xgboostPredTest), annot=True, fmt="d")
plt.show()
print(accuracy_score(xgbTestTarget, xgboostPredTest))