In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Feature Importance
* Feature importance refers to techniques that assign a score to each input feature based on how useful it is for predicting a target variable.
* There are many types and sources of feature importance scores; popular examples include statistical correlation scores, coefficients calculated as part of linear models, decision trees, and permutation importance scores.
* Feature importance scores play an important role in a predictive modeling project, including providing insight into the data and insight into the model.
* More precisely, feature importance is a class of techniques that assign scores to the input features of a predictive model, indicating the relative importance of each feature when making a prediction.
* Feature importance scores can be calculated both for problems that involve predicting a numerical value (regression) and for problems that involve predicting a class label (classification).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pandas import plotting

#plotly 
import plotly.offline as py
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
from plotly import tools
init_notebook_mode(connected=True)
import plotly.figure_factory as ff
import plotly.express as px

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.metrics import confusion_matrix,classification_report,precision_score
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Use seaborn's "whitegrid" style for the plots drawn in the cells below.
sns.set(style="whitegrid")

In [None]:
# Read the breast-cancer CSV into `df` and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../input/breast-cancer-wisconsin-data/data.csv')
df.head(n=5)

In [None]:
# Remove the 'Unnamed: 32' column (presumably an empty artefact column from the
# CSV -- verify with df.info()). errors='ignore' keeps this cell safe to re-run:
# without it, a second execution raises KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 32', errors='ignore')


In [None]:
from scipy import stats


# Joint Plot with Pearson Coefficient

In [None]:
# Pearson correlation (r) and its p-value between the two concavity-related columns.
r, p = stats.pearsonr(df.loc[:, 'concavity_worst'], df.loc[:, 'concave points_worst'])
# Scatter + regression fit with marginal distributions.
# The data vectors are passed by keyword (x=, y=, data=): recent seaborn releases
# no longer accept them positionally, while the keyword form works on old and new versions.
graph = sns.jointplot(
    x='concavity_worst',
    y='concave points_worst',
    data=df,
    kind="reg",
    color="#ce1414",
)
# Invisible artist whose only purpose is to carry the r/p annotation in the legend.
phantom, = graph.ax_joint.plot([], [], linestyle="", alpha=0)
graph.ax_joint.legend([phantom], ['r={:f}, p={:f}'.format(r, p)])
plt.show()

In [None]:
# Encode the target labels as integers: 'M' -> 1, 'B' -> 0.
# Any label missing from the mapping raises KeyError instead of being silently dropped.
diagnosis = {'M': 1, 'B': 0}
df['diagnosis'] = df['diagnosis'].map(lambda label: diagnosis[label])

In [None]:
# Feature matrix: every column except the identifier and the target label.
col = ['id', 'diagnosis']
X = df.drop(columns=col)
# Target vector: the integer-encoded diagnosis.
y = df['diagnosis']
# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
X_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

In [None]:
# Display the dimensions of the feature matrix and of the target vector.
(X.shape, y.shape)

In [None]:
# Print a per-column summary of the feature matrix.
X.info()

In [None]:
# Display the dtype of the target after the 'M'/'B' -> 1/0 encoding.
y.dtype

# Logistic Regression

In [None]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Display the dimensions of the training features and training target.
(X_train.shape, y_train.shape)

In [None]:
# Logistic regression using the liblinear solver, fitted on every row of X
# (the train/test split is not used here). Its coefficients are read in a later cell.
lr = LogisticRegression(solver="liblinear")
lr.fit(X=X, y=y)


In [None]:
# Names of the 30 feature columns: ten measurements, each reported as a
# mean, a standard-error ("se") and a "worst" value, grouped by statistic.
col = [
    measurement + '_' + statistic
    for statistic in ('mean', 'se', 'worst')
    for measurement in (
        'radius',
        'texture',
        'perimeter',
        'area',
        'smoothness',
        'compactness',
        'concavity',
        'concave points',
        'symmetry',
        'fractal_dimension',
    )
]

In [None]:
# Coefficients of the fitted logistic regression, one per feature column.
importance = lr.coef_[0]
# Print each coefficient next to its positional feature index.
for position, coefficient in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, coefficient))
# Bar chart of the coefficients, labelled with the feature names.
plt.figure(figsize=(10, 10))
plt.bar(x=X.columns, height=importance)
plt.xticks(rotation=90)  # vertical labels so the long column names stay readable
plt.show()

# Feature Importance after Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. X is rebound to the transformed array returned by
# the scaler, so column names are no longer available from X after this cell.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Refit the same liblinear logistic regression, this time on the standardised X.
lr = LogisticRegression(solver="liblinear")
lr.fit(X=X, y=y)

In [None]:
# Coefficients of the logistic regression fitted on the standardised features.
importance = lr.coef_[0]
# Print each coefficient next to its positional feature index.
for position, coefficient in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, coefficient))
# Bar chart keyed by feature index (X is a plain array here, so no column names).
plt.figure(figsize=(10, 10))
plt.bar(x=list(range(len(importance))), height=importance)
plt.show()

# Conclusion
* Standardization affects the feature importance (coefficient) values obtained for this dataset.
* In the first graph (before standardization), a few features had strongly negative or strongly positive coefficients, while the rest were almost 0.
* After standardization, many more features have coefficients that are clearly distinguishable from 0.
* A negative coefficient indicates that the feature pushes the prediction towards the negative class (0, i.e. `B`).
* Likewise, a positive coefficient pushes the prediction towards the positive class (1, i.e. `M`).

# Decision Tree

In [None]:
# Rebuild the unscaled feature matrix and target from df (X was overwritten by
# the standardised array in the previous section).
col = ['id', 'diagnosis']
X = df.drop(columns=col)
y = df['diagnosis']
# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
X_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

In [None]:
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Fit a decision tree on the full X, y and plot its feature_importances_.
# random_state is pinned (same seed as the train/test split) so that the
# importances -- and the conclusions written about them -- are reproducible
# on every "Restart & Run All".
model = DecisionTreeClassifier(random_state=10)
# fit the model
model.fit(X, y)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %d, Score: %.5f' % (i,v))
# plot feature importance
plt.figure(figsize=(15,15))
plt.bar(X.columns, importance)
plt.xticks(rotation=90)  # vertical labels so the long column names stay readable

plt.show()

# Conclusion
* The most important feature is "radius_worst".
* The important features include radius_worst, texture_worst, concave points_worst, texture_mean, the concavity features, etc.
* Features whose importance is almost equal to 0 are the least important features.
* These least important features can be removed from the dataset, and the reduced dataset can then be used for prediction.

# Feature Importance after Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. X is rebound to the transformed array returned by
# the scaler, so column names are no longer available from X after this cell.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Refit the decision tree on the standardised X and plot its feature_importances_.
# random_state is pinned so the result is reproducible; without a fixed seed any
# difference from the unscaled run could come from the tree's own randomness
# rather than from the scaling.
model = DecisionTreeClassifier(random_state=10)
# fit the model
model.fit(X, y)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %d, Score: %.5f' % (i,v))
# plot feature importance (by feature index: X is a plain array here)
plt.figure(figsize=(15,15))
plt.bar([x for x in range(len(importance))], importance)
plt.xticks(rotation=90)

plt.show()

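# Conclusion
* After standardization we still get the same number of important features, which suggests that standardization has little effect on the decision tree's feature importance.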


# Random Forest

In [None]:
# Rebuild the unscaled feature matrix and target from df (X was overwritten by
# the standardised array in the previous section).
col = ['id', 'diagnosis']
X = df.drop(columns=col)
y = df['diagnosis']
# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
X_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Fit a random forest on the full X, y and plot its feature_importances_.
# random_state is pinned (same seed as the train/test split): the forest is
# built from random draws, so without a seed the ranking of features
# changes from run to run and the written conclusions cannot be reproduced.
model = RandomForestClassifier(random_state=10)
# fit the model
model.fit(X, y)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %d, Score: %.5f' % (i,v))
# plot feature importance
plt.figure(figsize=(15,15))
plt.bar(X.columns, importance)
plt.xticks(rotation=90)  # vertical labels so the long column names stay readable

plt.show()

# Conclusion
* The most important feature is "concave points_worst".
* The most important features obtained using the Random Forest classifier include perimeter_worst, concave points_worst, radius_worst, concave points_mean, area_worst, texture_mean, etc.
* We can see that a large number of important features are available.
* Features with the lowest importance values can be removed from the dataset.

# Feature Importance after Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. X is rebound to the transformed array returned by
# the scaler, so column names are no longer available from X after this cell.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Refit the random forest on the standardised X.
# random_state is pinned so the result is reproducible; without a fixed seed any
# difference from the unscaled run could come from the forest's own randomness
# rather than from the scaling.
model = RandomForestClassifier(random_state=10)
model.fit(X, y)

In [None]:
# Feature importances of the model fitted in the previous cell.
importance = model.feature_importances_
# Print each importance next to its positional feature index.
for position, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, score))
# Bar chart keyed by feature index (X is a plain array here, so no column names).
plt.figure(figsize=(10, 10))
plt.bar(x=list(range(len(importance))), height=importance)
plt.show()

# Conclusion
* The result is the same as the one obtained above without standardization.
* Standardization has little or almost no impact on the Random Forest feature importance.

# XGBoost Feature Importance

In [None]:
# Rebuild the unscaled feature matrix and target from df (X was overwritten by
# the standardised array in the previous section).
col = ['id', 'diagnosis']
X = df.drop(columns=col)
y = df['diagnosis']
# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
X_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

In [None]:
from xgboost import XGBClassifier


In [None]:
# Fit an XGBoost classifier with default settings on the full X, y.
model = XGBClassifier()
model.fit(X, y)
# Per-feature importance scores exposed by the fitted model.
importance = model.feature_importances_
# Print each importance next to its positional feature index.
for position, score in enumerate(importance):
    print('Feature: %d, Score: %.5f' % (position, score))
# Bar chart of the importances, labelled with the feature names.
plt.figure(figsize=(15, 15))
plt.bar(x=X.columns, height=importance)
plt.xticks(rotation=90)  # vertical labels so the long column names stay readable

plt.show()

# Conclusion
* The most important feature is radius_worst.
* The number of important features decreases.
* perimeter_worst is the 2nd most important feature.

# Permutation Feature Importance
* Permutation feature importance is a technique for calculating relative importance scores that is independent of the model used.
* First, a model is fit on the dataset; this can be a model that does not support native feature importance scores. Then the values of each feature are shuffled in turn, and the resulting drop in the model's score is taken as that feature's importance.
* This approach can be used for regression or classification and requires that a performance metric be chosen as the basis of the importance score, such as the mean squared error for regression and accuracy for classification.


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance

In [None]:
# Rebuild the feature matrix and target from df for the permutation-importance section.
col = ['id', 'diagnosis']
X = df.drop(columns=col)
y = df['diagnosis']
# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
X_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

In [None]:
# k-nearest-neighbours has no native importance scores, so permutation
# importance is used: each column is shuffled and the drop in score is measured.
# NOTE(review): KNN is distance-based and X is not standardised in this section,
# so large-magnitude columns may dominate the result -- consider scaling first.
model = KNeighborsClassifier()
# fit the model
model.fit(X, y)
# perform permutation importance
# * scoring='accuracy' is the classification metric; the target is encoded as
#   0/1, so the previous 'neg_mean_squared_error' equals accuracy - 1 and the
#   score drops (the importances) are numerically the same.
# * random_state pins the shuffles so the importances are reproducible.
results = permutation_importance(model, X, y, scoring='accuracy', random_state=10)
# get importance
importance = results.importances_mean
for i,v in enumerate(importance):
    print('Feature: %d, Score: %.5f' % (i,v))
# plot feature importance
plt.figure(figsize=(15,15))
plt.bar(X.columns, importance)
plt.xticks(rotation=90)  # vertical labels so the long column names stay readable

plt.show()

# Conclusion
* The most important feature is area_worst.
* The 2nd most important feature is area_mean.
* As we can see, only these 2 features are important.