> # **DATA MINING**

* Dwi Krisnawan
* Bandem Mahatma
* Gus Rai Surya Laksana

# **Data Visualization Section**

In [None]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Read the diabetes CSV once; `diabetes` and `diabetes_df` are two names for
# the SAME DataFrame object (no copy is made here).
diabetes = diabetes_df = pd.read_csv("../input/diabetes/diabetes.csv")

In [None]:
# (rows, columns) of the loaded frame.
print(diabetes.shape)

In [None]:
# Preview the first 20 rows.
diabetes.head(n=20)

In [None]:
# Column names first, then summary statistics as the cell's rich output.
print(diabetes.columns)
diabetes.describe()

In [None]:
# Pairwise correlation of all columns (pandas' default method).
cor = diabetes.corr()

plt.figure(figsize=(20, 10))
# Correlations live in [-1, 1]. The previous vmin=0 painted every negative
# coefficient with the same colour as 0, hiding inverse relationships.
sns.heatmap(cor, annot=True, cmap=plt.cm.YlGnBu, vmin=-1, vmax=1)
plt.title("Correlation matrix of the diabetes features")
plt.show()

In [None]:
# Build the grid (points coloured by Outcome), draw Pregnancies vs. Glucose
# on it, and attach the legend in one chained expression.
g = (
    sns.FacetGrid(diabetes, hue='Outcome', height=8)
    .map(plt.scatter, "Pregnancies", "Glucose")
    .add_legend()
)

plt.show()


In [None]:
# Lower-triangle pair plot of every column, coloured by Outcome.
sns.pairplot(data=diabetes, hue="Outcome", corner=True);

In [None]:
# Pair plot restricted to four columns, with KDE curves on the diagonal.
pairplot_columns = ["Pregnancies", "Glucose", "BloodPressure", "BMI"]
sns.pairplot(diabetes[pairplot_columns], diag_kind="kde");


In [None]:
# Box plot of Age for each Pregnancies value, split by Outcome.
sns.catplot(
    data=diabetes,
    kind="box",
    x="Pregnancies",
    y="Age",
    hue="Outcome",
);

In [None]:
import plotly.express as px
# Bubble chart for the rows with Outcome == 1: Glucose (log axis) against
# BloodPressure, bubble size from Insulin, colour from Pregnancies.
outcome_1_rows = diabetes.query("Outcome==1")
fig = px.scatter(
    outcome_1_rows,
    x="Glucose",
    y="BloodPressure",
    size="Insulin",
    color="Pregnancies",
    hover_name="Pregnancies",
    log_x=True,
    size_max=60,
    title="outcome = 1",
)

fig.show()


In [None]:
# Same bubble chart as for Outcome == 1, but for the rows with Outcome == 0.
outcome_0_rows = diabetes.query("Outcome==0")
fig = px.scatter(
    outcome_0_rows,
    x="Glucose",
    y="BloodPressure",
    size="Insulin",
    color="Pregnancies",
    hover_name="Pregnancies",
    log_x=True,
    size_max=60,
    title="outcome = 0",
)
fig.show()

In [None]:
# Parallel-categories diagram of the frame, ribbons coloured by Glucose.
fig = px.parallel_categories(
    diabetes,
    color="Glucose",
    color_continuous_scale=px.colors.sequential.Sunset,
)
fig.show()

In [None]:
# Parallel-categories diagram of the frame, ribbons coloured by BloodPressure.
fig = px.parallel_categories(
    diabetes,
    color="BloodPressure",
    color_continuous_scale=px.colors.sequential.deep,
)
fig.show()


In [None]:
# Still raises an error -- do not change.
# NOTE(review): the original author flagged this cell as failing; the cause is
# not visible from the code here -- confirm before relying on its output.
#
# Purpose: one FigureWidget holding a Glucose-vs-Outcome scatter (top) and a
# parallel-categories trace (bottom). Selecting points in the scatter, or
# clicking in the parcats trace, recolours the matching rows via `update_color`.
import plotly.graph_objects as go
from ipywidgets import widgets
import pandas as pd
import numpy as np

# Plain alias: `diabetes_df` and `diabetes` refer to the same DataFrame object.
diabetes_df = diabetes
# Build parcats dimensions
categorical_dimensions = ['Pregnancies', 'BloodPressure', 'Glucose'];

# One parcats dimension (column values + label) per selected column.
dimensions = [dict(values=diabetes_df[label], label=label) for label in categorical_dimensions]

# Build colorscale
# Per-row colour index: 0 maps to 'gray', 1 maps to 'firebrick'. All rows
# start at 0.
color = np.zeros(len(diabetes_df), dtype='uint8')
colorscale = [[0, 'gray'], [1, 'firebrick']]

# Build figure as FigureWidget
# data[0] is the scatter trace, data[1] is the parcats trace; the callback
# below addresses them by these positions.
fig = go.FigureWidget(
    data=[go.Scatter(x=diabetes_df.Glucose, y=diabetes_df['Outcome'],
    marker={'color': 'gray'}, mode='markers', selected={'marker': {'color': 'firebrick'}},
    unselected={'marker': {'opacity': 0.3}}), go.Parcats(
        domain={'y': [0, 0.4]}, dimensions=dimensions,
        line={'colorscale': colorscale, 'cmin': 0,
              'cmax': 1, 'color': color, 'shape': 'hspline'})
    ])

# Scatter occupies the upper part of the figure (y domain 0.6-1); the parcats
# trace was given the lower part (y domain 0-0.4) above.
fig.update_layout(
        height=800, xaxis={'title': 'Glucose'},
        yaxis={'title': 'Outcome', 'domain': [0.6, 1]},
        dragmode='lasso', hovermode='closest')

# Update color callback
def update_color(trace, points, state):
    """Mirror a selection/click onto both traces of ``fig``.

    Args:
        trace: The trace that fired the event (not used in the body).
        points: Event payload; only ``points.point_inds`` is read.
        state: Extra event state (not used in the body).
    """
    # Update scatter selection
    fig.data[0].selectedpoints = points.point_inds

    # Update parcats colors
    # Rebuild the colour index from scratch: 1 for the picked rows, 0 elsewhere.
    new_color = np.zeros(len(diabetes_df), dtype='uint8')
    new_color[points.point_inds] = 1
    fig.data[1].line.color = new_color

# Register callback on scatter selection...
fig.data[0].on_selection(update_color)
# and parcats click
fig.data[1].on_click(update_color)

# Bare expression so the widget is rendered as the cell output.
fig

In [None]:
import plotly.express as px
# Animated bubble chart: one frame per Age value, one panel per Outcome class.
# Axis ranges are pinned so the view stays fixed across frames.
fig = px.scatter(
    diabetes,
    x="Glucose",
    y="BloodPressure",
    animation_frame="Age",
    animation_group="BMI",
    size="Pregnancies",
    color="SkinThickness",
    hover_name="Insulin",
    facet_col="Outcome",
    log_x=True,
    size_max=45,
    range_x=[1, 400],
    range_y=[25, 90],
)
fig.show()

# **Deep Learning Section**

In [None]:
import sys
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import keras
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Print the interpreter and library versions used by this notebook so that
# results can be tied to a concrete environment.
# (The pandas version used to be printed twice; the duplicate line is removed.)
print('Python: {}'.format(sys.version))
print('Pandas: {}'.format(pd.__version__))
print('Numpy: {}'.format(np.__version__))
print('Sklearn: {}'.format(sklearn.__version__))
print('Matplotlib: {}'.format(matplotlib.__version__))
print('Keras :{}'.format(keras.__version__))




In [None]:
# Preview the first 10 rows of the frame used by the modelling sections.
diabetes_df.head(10)

In [None]:
# Summary statistics of every column.
diabetes_df.describe()

In [None]:
# Column dtypes and non-null counts.
diabetes_df.info()

In [None]:
# Raw NumPy view of the frame; print its (rows, columns) shape.
dataset = diabetes_df.values
n_rows, n_cols = dataset.shape
print((n_rows, n_cols))

In [None]:
# Columns 0-7 are the features; column 8 is the target, cast to int.
X, Y = dataset[:, :8], dataset[:, 8].astype(int)

In [None]:
# Sanity check: feature shape, target shape, first five target values.
for preview in (X.shape, Y.shape, Y[:5]):
    print(preview)

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the full feature matrix.
scaler = StandardScaler()
scaler.fit(X)

In [None]:
# Apply the fitted scaler to the features.
X_standardized = scaler.transform(X)

# Wrap the result in a DataFrame only to display its summary statistics.
data = pd.DataFrame(data=X_standardized)
data.describe()

In [None]:
# Fix the random seed. NumPy alone is not enough: the Keras layers draw their
# initial weights from TensorFlow's generator, so seed that as well.
seed = 6
np.random.seed(seed)
tf.random.set_seed(seed)


def create_model():
    """Build and compile the binary classifier used by the grid search.

    Architecture: 8 input features -> Dense(8, relu) -> Dense(4, relu)
    -> Dense(1, sigmoid). Compiled with binary cross-entropy, the Adam
    optimizer (learning rate 0.01) and the accuracy metric.

    Returns:
        A freshly built, compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(8, input_dim=8, kernel_initializer='normal', activation='relu'))
    # Only the first layer declares the input size; the stray `input_dim=8`
    # that used to sit on this hidden layer was misleading and is dropped.
    model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # `learning_rate` is the current argument name; `lr` is deprecated.
    adam = Adam(learning_rate=0.01)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model

# Wrap the model builder so scikit-learn can treat the network as an estimator.
# verbose=0: per-epoch Keras logs for every fold/parameter combination would
# flood the output; GridSearchCV's own verbose output still reports progress.
model = KerasClassifier(build_fn=create_model, verbose=0)

# Candidate values explored by the grid search.
batch_size = [10, 20, 40]
epochs = [10, 50, 100]

# GridSearchCV expects a mapping of parameter name -> list of candidates.
param_grid = dict(batch_size=batch_size, epochs=epochs)

# `random_state` only has an effect when the folds are shuffled. Recent
# scikit-learn versions raise a ValueError for KFold(random_state=...) with
# shuffle left at False, so shuffling is enabled explicitly.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=KFold(shuffle=True, random_state=seed),
    verbose=10,
)
grid_results = grid.fit(X_standardized, Y)

# Summarize: best configuration first, then every configuration's CV score.
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))

In [None]:
# TensorBoard setup: create a timestamped log directory, build a TensorBoard
# callback that writes to it, and open the TensorBoard UI inside the notebook.
from datetime import datetime
# NOTE(review): `version` is not used anywhere in the visible notebook.
from packaging import version
# NOTE: this rebinds the name `keras` (imported earlier via `import keras`)
# to `tensorflow.keras` for all following cells.
from tensorflow import keras
# Load the TensorBoard IPython extension (or reload it if already loaded).
%reload_ext tensorboard

# One sub-directory per run, named after the current timestamp.
logdir="logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
# NOTE(review): no visible cell passes `tensorboard_callback` to a `fit` call,
# so nothing appears to be written to `logdir` -- confirm.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
# Launch the TensorBoard UI on the parent `logs` directory.
%tensorboard --logdir logs

# **SECTION 2 MACHINE LEARNING (Data Split)**

In [None]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Preview the first rows before cleaning.
diabetes_df.head()

In [None]:
# Summary statistics before the zero values are replaced in the next cell.
diabetes_df.describe()

In [None]:
# Columns in which a value of 0 is treated as "missing" and replaced.
not_zero = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']

# Work on a private copy: `diabetes_df` was bound to the same object as
# `diabetes`, so cleaning it in place would silently change the frame that the
# visualization cells display.
diabetes_df = diabetes_df.copy()

for column in not_zero:
    # `np.nan` is the canonical spelling; the `np.NaN` alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    diabetes_df[column] = diabetes_df[column].replace(0, np.nan)
    # Mean of the remaining values, truncated to an integer (kept as in the
    # original analysis so the imputed values do not change).
    mean = int(diabetes_df[column].mean(skipna=True))
    diabetes_df[column] = diabetes_df[column].replace(np.nan, mean)

In [None]:
# Features are the first 8 columns; the 9th column (`Outcome`) is the target.
# The previous slice `0:7` dropped the 8th feature column, so the model was
# trained on 7 features while the single-sample prediction at the end of the
# notebook supplies 8 values, which fails with a feature-count mismatch.
X = diabetes_df.iloc[:, 0:8]
y = diabetes_df['Outcome']

# Change the test size as required by the assignment.
# A fixed random_state makes the split, and therefore the reported accuracy,
# reproducible between runs (6 is the seed value used in the deep-learning
# section).
X_train,X_test,y_train,y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=6
)

In [None]:
import math
# Square root of the number of test samples, shown as the cell output.
n_test_samples = len(y_test)
math.sqrt(n_test_samples)

In [None]:
# k-nearest-neighbours classifier with 27 neighbours and Euclidean distance,
# fitted on the training split.
knn = KNeighborsClassifier(
    n_neighbors=27,
    p=2,
    metric='euclidean',
)
knn.fit(X=X_train, y=y_train)

In [None]:
# Predicted classes for the held-out rows, displayed as the cell output.
y_pred = knn.predict(X=X_test)
y_pred

In [None]:
# scikit-learn's signature is accuracy_score(y_true, y_pred). Accuracy happens
# to be symmetric, but the conventional order keeps this cell correct if it is
# ever adapted to an asymmetric metric (precision, recall, ...).
accuracy_score(y_test, y_pred)

In [None]:
# Classify one hand-written sample (a single row of feature values) and
# report the predicted class in words.
sample = [[6, 148.0, 62.0, 35.0, 455.0, 33.6, 0.627, 30]]
prediction = knn.predict(sample)
message = "The person have Diabetes" if prediction == 1 else "The person is not have Diabetes"
print(message)
prediction