In [None]:
!pip install natsort
!pip install videofig
import os
from collections import OrderedDict

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from natsort import natsorted
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
%matplotlib inline


# Data Analysis

**First Order Features**


* **Mean** : 
*Gives the contribution of individual pixel intensity for the entire image*
* **Variance** : 
*Measures how widely the pixel intensities are spread around their mean*
* **Standard Deviation** : 
*Measures how far the pixel values deviate from their mean.*
* **Skewness** : 
*A measure of symmetry or, more precisely, of the lack of symmetry.*
* **Kurtosis** : 
*describes the peakedness of e.g. a frequency distribution*

**Second Order Features** 


* **Contrast** : 
*the difference in luminance or colour across the image*
* **Energy** : 
*It's the rate of change in the color/brightness/magnitude of the pixels over local areas.*
* **ASM (Angular second moment)** : 
*is a measure of textural Uniformity of an image*
* **Entropy** : 
*is a statistical measure of randomness that can be used to characterize the texture of the image*
* **Homogeneity** :
*Homogeneity expresses how similar certain elements (pixels) of the image are.*
* **Dissimilarity** : 
*is a numerical measure of how different two data objects are.*
* **Correlation** : 
*Correlation is the process of moving a filter mask, often referred to as a kernel, over the image and computing the sum of products at each location (as in a CNN).*
* **Coarseness** : 
*Describes the roughness/harshness of a texture*

**Labels**
* **Class** :  

**1 = Tumor**

**0 = Not Tumor**

In [None]:
# Read the feature table shipped with the Kaggle "brain-tumor" dataset.
csv_path = '/kaggle/input/brain-tumor/Brain Tumor.csv'
data = pd.read_csv(csv_path)
data

# Scaling

In [None]:
# Numeric feature columns to standardize (everything except 'Image' and 'Class').
scalable = [
    'Mean', 'Variance', 'Standard Deviation', 'Entropy',
    'Skewness', 'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Homogeneity',
    'Dissimilarity', 'Correlation', 'Coarseness',
]

# Standardize each listed column and write the result back into `data`.
scaler = StandardScaler()
data[scalable] = scaler.fit_transform(data[scalable])
data

# Data Correlations

In [None]:
# Homogeneity per class. The label column is indexed directly so that this
# cell runs on a fresh kernel without relying on variables from other cells.
sns.swarmplot(x=data['Class'], y=data['Homogeneity'])
plt.title("Distribution of image Homogenity, by Class")



In [None]:
# Boolean row masks for each class; later plotting cells reuse them.
class1 = data['Class'] == 1
class0 = data['Class'] == 0

# Feature table without the 'Image' column. `drop` already returns a new
# frame, so no separate copy is needed beforehand.
_data = data.drop('Image', axis=1)

# Overlay the Energy distribution of both classes on the same axes.
# NOTE(review): `distplot` is deprecated in seaborn >= 0.11; switch to
# `histplot(..., kde=True, stat="density")` when the environment is upgraded.
sns.distplot(a=_data.loc[class1, 'Energy'], label="Tumor")
sns.distplot(a=_data.loc[class0, 'Energy'], label="No tumor")

plt.title("Distribution of image Energy, by Class")
plt.legend()

In [None]:
# Overlay the Entropy distribution of both classes, tumor first.
for mask, label in ((class1, "Tumor"), (class0, "No tumor")):
    sns.distplot(a=_data[mask]['Entropy'], label=label)

plt.title("Distribution of image Entropy, by Class")
plt.legend()

In [None]:
fig = plt.figure()

# Collect every .jpg in the image folder and order the paths naturally
# (numeric parts compared as numbers), so list positions are stable.
folder = '/kaggle/input/brain-tumor/Brain Tumor/Brain Tumor/'
jpg_names = [name for name in os.listdir(folder) if name.endswith('.jpg')]
imgs = natsorted(os.path.join(folder, name) for name in jpg_names)

# Load one example as a single-channel image and show it on a fixed 0-255 grey scale.
img = cv2.imread(imgs[3760], cv2.IMREAD_GRAYSCALE)

im = plt.imshow(
    img,
    cmap='gray',
    vmin=0,
    vmax=255,
    aspect='auto',
    interpolation='none',
)
plt.title('No Tumor')

In [None]:
# Load the image at position 3 of the sorted path list as a single-channel
# image and show it on a fixed 0-255 grey scale.
img = cv2.imread(imgs[3], cv2.IMREAD_GRAYSCALE)

im = plt.imshow(
    img,
    cmap='gray',
    vmin=0,
    vmax=255,
    aspect='auto',
    interpolation='none',
)
plt.title('Tumor')

In [None]:
# Target vector: 1 = tumor, 0 = not tumor.
y = data.Class

# Feature matrix and hold-out split for the classifier cells below, which
# expect X_train / X_valid / y_train / y_valid to exist.
# The split is stratified so both parts keep the same class balance; the
# 80/20 ratio is a conventional choice, adjust as needed.
# NOTE(review): the features were standardized on the full table before this
# split, so validation scores may be slightly optimistic.
X = data.drop(['Image', 'Class'], axis=1)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=4, stratify=y
)
y

# Most impactful features

In [None]:
# Rank the features by how many times the boosted trees split on them.
# NOTE(review): a regressor is fitted on the 0/1 `Class` label; only the
# split counts are used here, not its predictions.
model = xgboost.XGBRegressor(
    colsample_bytree=0.4,
    gamma=0,
    learning_rate=0.07,
    max_depth=3,
    min_child_weight=1.5,
    n_estimators=10000,
    reg_alpha=0.75,
    reg_lambda=0.45,
    subsample=0.6,
    random_state=42,  # scikit-learn-API name for the deprecated `seed` argument
)
model.fit(data.drop(['Image', 'Class'], axis=1), y)

# Feature name -> split count, most-used feature first.
OrderedDict(sorted(model.get_booster().get_fscore().items(), key=lambda t: t[1], reverse=True))

As the **previous graphs** suggested, the more ***non-uniform*** and ***unstructured*** the pixel values are, the more likely the image is to show a tumor.

Thus, the best approach is probably some kind of [Anomaly Detection](https://en.wikipedia.org/wiki/Anomaly_detection) algorithm.


# Logistic Regression

In [None]:
# Fit a logistic-regression baseline, then report its mean accuracy on the
# validation split.
logr = LogisticRegression(
    dual=False,
    verbose=1,
    random_state=4,
).fit(X_train, y_train)
logr.score(X_valid, y_valid)

# Random Forest

In [None]:
# Fit a 2000-tree random forest, then report its mean accuracy on the
# validation split.
RFclf = RandomForestClassifier(
    n_estimators=2000,
    random_state=4,
    verbose=1,
).fit(X_train, y_train)
RFclf.score(X_valid, y_valid)

# KNN

In [None]:
# Fit a 5-nearest-neighbours classifier, then report its mean accuracy on the
# validation split.
knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='auto',
    leaf_size=50,
).fit(X_train, y_train)
knn.score(X_valid, y_valid)