In [None]:
import pandas as pd
import numpy as np

## Exploring dataset

In [None]:
# Load the dataset CSV from the relative input directory into a DataFrame.
df = pd.read_csv(
    "../input/breast-cancer-wisconsin-data/data.csv",
)

## Accessing the first five rows with the `df.head()` method

In [None]:
# Preview the first five rows of the DataFrame.
df.head(n=5)

In [None]:
# Display pandas' summary statistics for the DataFrame's columns.
df.describe()

In [None]:
# Print pandas' concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

## Checking for any missing data

In [None]:
# Count the missing entries in every column.
df.isna().sum()

## Removing unwanted data

In [None]:
# Remove the "id" column and the "Unnamed: 32" column in a single call.
df = df.drop(columns=["id", "Unnamed: 32"])

## Histogram representation of data
##### Histograms tell us about the skewness and outliers in the data

In [None]:
import matplotlib.pyplot as plt
# Draw one histogram per column, printing the column name above each figure.
for column_name, column_values in df.items():
    print(column_name)
    column_values.hist()
    plt.show()
    

# Checking for skewness

In [None]:
# Skewness of every numeric column.
# `diagnosis` still holds the string labels 'B'/'M' at this point, and
# pandas >= 2.0 no longer drops non-numeric columns silently (it raises
# TypeError instead), so restrict the reduction to numeric columns explicitly.
df.skew(numeric_only=True)

## Checking for the variance in the data

In [None]:
# Variance of every numeric column.
# `diagnosis` still holds the string labels 'B'/'M' at this point, and
# pandas >= 2.0 no longer drops non-numeric columns silently (it raises
# TypeError instead), so restrict the reduction to numeric columns explicitly.
df.var(numeric_only=True)

## Encoding data

In [None]:
# Encode the target labels as integers: 'B' -> 0, 'M' -> 1.
#
# The result is assigned back to the column instead of calling an in-place
# method on the temporary Series returned by `df.diagnosis`: that chained
# in-place pattern emits a warning on recent pandas and leaves `df` unchanged
# when Copy-on-Write is enabled.
diagnosis_codes = {"B": 0, "M": 1}

# Labels that are not keys of the mapping pass through unchanged, so re-running
# this cell on already-encoded data is a no-op. `astype(int)` then guarantees an
# integer target and fails loudly on any label that cannot be converted.
df["diagnosis"] = (
    df["diagnosis"]
    .map(lambda label: diagnosis_codes.get(label, label))
    .astype(int)
)

## Visualizing the target variable to check for class imbalance

In [None]:
import seaborn as sns
# Bar chart of the number of samples per diagnosis class.
# The Series is passed as `x=` because the first positional parameter of
# seaborn's categorical plots is `data` in seaborn >= 0.12; passing the Series
# positionally is deprecated in 0.11 and no longer plots per-class counts.
sns.countplot(x=df["diagnosis"])

## Correlation matrix

In [None]:
# Compute the pairwise correlation matrix, then render it as a heatmap
# on a wide figure with black cell borders and no y-axis tick labels.
correlation_matrix = df.corr()
plt.figure(figsize=(15, 8))
sns.heatmap(
    correlation_matrix,
    linewidths=0.9,
    linecolor="black",
    square=False,
    yticklabels=False,
)

## Model with no preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# Separate the target column from the predictor columns.
y = df["diagnosis"]
X = df.drop(columns="diagnosis")

## Train test split

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

## Random forest classifier

In [None]:
# Fit a random forest on the training split and report its test accuracy.
# The forest is seeded (same seed as the train/test split) because an unseeded
# forest gives a different score on every run, which would make the later
# before/after-preprocessing comparison unreliable.
rfc = RandomForestClassifier(random_state=40)
rfc.fit(x_train, y_train)
rfc.score(x_test, y_test)

## XGBoost classifier

In [None]:
# Train an XGBoost classifier with default settings and show its test accuracy.
xgb = XGBClassifier().fit(x_train, y_train)
xgb.score(x_test, y_test)

In [None]:
# Train a LightGBM classifier with default settings, keep its test-set
# predictions in `pred`, and print its test accuracy.
lgm = LGBMClassifier().fit(x_train, y_train)
pred = lgm.predict(x_test)
print(lgm.score(x_test, y_test))

## Model performance after preprocessing

### Applying a log transformation to high-variance variables

In [None]:
# Log-transform every feature whose variance exceeds the threshold and print
# the names of the transformed columns.
LOG_VARIANCE_THRESHOLD = 10

# Only features are candidates: the target column is excluded explicitly so it
# can never be transformed, whatever its variance.
feature_columns = df.columns.drop("diagnosis")
high_variance_columns = [
    name for name in feature_columns if df[name].var() > LOG_VARIANCE_THRESHOLD
]

# np.log yields -inf / NaN for values <= 0. Validate every selected column
# before touching the DataFrame so a failure never leaves it half-transformed.
non_positive_columns = [
    name for name in high_variance_columns if (df[name] <= 0).any()
]
if non_positive_columns:
    raise ValueError(
        f"cannot log-transform columns containing values <= 0: {non_positive_columns}"
    )

for name in high_variance_columns:
    df[name] = np.log(df[name])
    print(name)
    

In [None]:
# Re-check the per-column sample variance (ddof=1) after the log transformation.
df.var(ddof=1)

In [None]:
# Re-check the per-column skewness after the log transformation.
df.skew(axis=0)

In [None]:
# Rebuild the target and predictor sets from the current state of `df`.
y = df["diagnosis"]
X = df.drop(columns="diagnosis")


In [None]:
# Same 80/20 split and seed as the earlier train/test split in this notebook.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)


In [None]:
# Fit a random forest on the current training split and report its test accuracy.
# The forest is seeded (same seed as the train/test split) because an unseeded
# forest gives a different score on every run, which would make score
# comparisons across runs unreliable.
rfc = RandomForestClassifier(random_state=40)
rfc.fit(x_train, y_train)
rfc.score(x_test, y_test)

In [None]:
# Train an XGBoost classifier on the current training split and show its test accuracy.
xgb = XGBClassifier().fit(x_train, y_train)
xgb.score(x_test, y_test)

In [None]:
# Fit an AdaBoost classifier on the current training split and report its test accuracy.
# The ensemble is seeded (same seed as the train/test split) so the reported
# score does not change from one run to the next.
abc = AdaBoostClassifier(random_state=40)
abc.fit(x_train, y_train)
abc.score(x_test, y_test)

In [None]:
# Train a LightGBM classifier on the current training split, keep its test-set
# predictions in `pred`, and print its test accuracy.
lgm = LGBMClassifier().fit(x_train, y_train)
pred = lgm.predict(x_test)
print(lgm.score(x_test, y_test))

#### So, preprocessing (log transformation + standard scaling) increased the accuracy by ~2 percent

## Model evaluation

In [None]:
from sklearn.metrics import classification_report
# Build the per-class precision / recall / F1 summary for `pred`, then print it.
report = classification_report(y_true=y_test, y_pred=pred)
print(report)