# Importing Libraries

In [None]:
# Importing necessary library and packages
# General Use
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# 3D Visualization
import plotly as py
import plotly.graph_objs as go

# Principal Component Analysis
from sklearn.decomposition import PCA

# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
# Set random seed
np.random.seed(0)

# Data Preparation

In [None]:
#Importing the data of block information
# Reads the workbook "blockhash.xlsx" into data1 and previews its first rows.
data1 = pd.read_excel("blockhash.xlsx")
data1.head()

In [None]:
#Importing the data of transaction
# Reads the workbook "tx.xlsx" into data2 and previews its first rows.
data2 = pd.read_excel("tx.xlsx")
data2.head()

In [None]:
#Importing the data of transaction inputs
# Reads the workbook "txin.xlsx" into data3 and previews its first rows.
data3 = pd.read_excel("txin.xlsx")
data3.head()

In [None]:
#Importing the data of transaction outputs
# Reads the workbook "txout.xlsx" into data4 and previews its first rows,
# consistent with how the other three input tables are previewed.
data4 = pd.read_excel("txout.xlsx")
data4.head()

In [None]:
#Joining the transactions (data2) with their inputs (data3) on the shared txID key
# Inner join: only txID values present in both tables are kept.
df1 = data2.merge(data3, on="txID")
df1.head()

In [None]:
#Joining the transaction/input rows (df1) with the outputs (data4) on the shared txID key
# Inner join: only txID values present in both tables are kept.
df2 = df1.merge(data4, on="txID")
df2.head()

In [None]:
#Creating pivot table to get the information of blocks (one row per blockID)
# NOTE(review): df2 joins each transaction to both its inputs and its outputs on
# txID, so a transaction with several inputs and several outputs may appear once
# per input/output pair and inflate these sums - confirm against the raw tables.
pivot = pd.pivot_table(df2, values=['n_inputs', 'n_outputs', 'value_x', 'value_y'], index=['blockID'],
                       aggfunc='sum', fill_value=0)
# reset_index() returns a new frame, so the result must be assigned for
# blockID to become a regular column again.
pivot = pivot.reset_index()
pivot.head()

In [None]:
#Merging the per-block sums (pivot) with the block information (data1) into the final dataset
# value_x / value_y are renamed to send_value / receive_value and the bhash column is removed.
final = (
    pd.merge(pivot, data1, on="blockID")
    .rename(columns={'value_x': 'send_value', 'value_y': 'receive_value'})
    .drop(columns=['bhash'])
)
# blockID is stored as text from here on
final['blockID'] = final['blockID'].astype(str)
final.head()

In [None]:
#Printing the column names, non-null counts and datatypes of the final dataset
final.info()

# Exploratory Data Analysis

## Understanding the data

In [None]:
#Showing the first rows of the final dataset
final.head()

In [None]:
#Showing the last rows of the final dataset
final.tail()

In [None]:
#Showing the shape (rows, columns) of the final dataset
final.shape

In [None]:
#Showing the summary statistics of the final dataset
final.describe()

In [None]:
#Listing the column names of the final dataset
final.columns

In [None]:
#Counting the number of distinct values in each column
final.nunique()

## Cleaning the data

In [None]:
#Counting the missing values in each column
final.isna().sum()

In [None]:
#Dropping the blockID column: it was cast to a string identifier earlier and is not used as a feature
final = final.drop(columns=['blockID'])
final.head()

## Relationship analysis

In [None]:
#Computing the pairwise correlation matrix of the columns of the final dataset
correlation = final.corr()

In [None]:
#Drawing the correlation matrix as an annotated heatmap, labelled with the column names on both axes
axis_labels = correlation.columns
sns.heatmap(correlation, annot=True, xticklabels=axis_labels, yticklabels=axis_labels)

In [None]:
#Using pairplot to show the pairwise relationships between the variables
sns.pairplot(final)

In [None]:
#Using a density histogram with a KDE overlay to illustrate the distribution of n_inputs
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented replacement.
sns.histplot(final["n_inputs"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
#Using a density histogram with a KDE overlay to illustrate the distribution of n_outputs
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented replacement.
sns.histplot(final["n_outputs"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
#Using a density histogram with a KDE overlay to illustrate the distribution of send_value
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented replacement.
sns.histplot(final["send_value"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
#Using a density histogram with a KDE overlay to illustrate the distribution of receive_value
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented replacement.
sns.histplot(final["receive_value"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
#Using a density histogram with a KDE overlay to illustrate the distribution of block creation time
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented replacement.
sns.histplot(final["btime"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
#Using a density histogram with a KDE overlay to illustrate the distribution of transactions
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented replacement.
sns.histplot(final["txs"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
#Box plot of n_inputs, to show its spread and outliers
sns.catplot(data=final, x='n_inputs', kind='box')

In [None]:
#Box plot of n_outputs, to show its spread and outliers
sns.catplot(data=final, x='n_outputs', kind='box')

In [None]:
#Box plot of send_value, to show its spread and outliers
sns.catplot(data=final, x='send_value', kind='box')

In [None]:
#Box plot of receive_value, to show its spread and outliers
sns.catplot(data=final, x='receive_value', kind='box')

In [None]:
#Box plot of the block creation time (btime), to show its spread and outliers
sns.catplot(data=final, x='btime', kind='box')

In [None]:
#Box plot of the transactions column (txs), to show its spread and outliers
sns.catplot(data=final, x='txs', kind='box')

# K-means Clustering

In [None]:
#Preparing the dataset for the K-means algorithm
# .copy() makes dataD an independent frame, so the columns added to it in later
# cells (e.g. 'Cluster') do not raise SettingWithCopyWarning.
dataD = final[["n_inputs", "n_outputs", "send_value", "receive_value", "btime", "txs"]].copy()
dataD

## Finding the Optimal Number of Clusters with the Elbow Method

In [None]:
#Fitting K-means for k = 1..9 and recording the inertia of each fit for the Elbow Method
# random_state and n_init are pinned so the curve is reproducible across re-runs
# and does not depend on the scikit-learn version's default number of restarts.
# NOTE(review): the features are clustered unscaled, so large-magnitude columns
# may dominate the distances - confirm this is intended.
sum_of_sqr_dist = {}

for k in range(1, 10):
    km = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=1000, random_state=0)
    km = km.fit(dataD)
    sum_of_sqr_dist[k] = km.inertia_

In [None]:
#Plotting the inertia recorded for every K to locate the elbow
cluster_counts = list(sum_of_sqr_dist)
inertias = list(sum_of_sqr_dist.values())
sns.pointplot(x=cluster_counts, y=inertias)
plt.title("Elbow Method for Optimal K")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Sum of Square Distances")
plt.show()

In [None]:
#Setting the KMeans algorithm for the number of clusters 2
# random_state is pinned because later cells interpret the label numbers
# (0 / 1) directly, and KMeans label numbering is otherwise arbitrary between runs.
# n_init is pinned so the number of restarts does not depend on the scikit-learn version.
Model2 = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=1000, random_state=0)
Model2.fit(dataD)

In [None]:
#Storing the cluster label of every block, taken from the already fitted model
# Model2 was fitted on these same rows in the previous cell; reading labels_
# avoids a second fit, which costs time and could number the clusters differently.
dataD['Cluster'] = Model2.labels_
dataD.head()

In [None]:
#Splitting the blocks by cluster label: 0 is treated as Normal, 1 as Suspicious
# NOTE(review): KMeans assigns label numbers arbitrarily - confirm that cluster 1
# really is the suspicious group for the fitted model.
cluster_label = dataD['Cluster']
clx0 = dataD.loc[cluster_label == 0]
clx1 = dataD.loc[cluster_label == 1]

In [None]:
#Showing the first rows of the blocks in cluster 1 (the Suspicious blocks)
clx1.head()

In [None]:
#Counting the number of blocks in two clusters
# Fit on the six feature columns only: dataD already carries the 'Cluster' label
# column at this point, and clustering on it would leak the earlier result.
feature_cols = ["n_inputs", "n_outputs", "send_value", "receive_value", "btime", "txs"]
kmeans_model = KMeans(n_clusters = 2, random_state = 1).fit(dataD[feature_cols])
dataD['kmean'] = kmeans_model.labels_
dataD['kmean'].value_counts()

## Evaluation with Silhouette Score

In [None]:
#Taking the cluster labels assigned by the fitted two-cluster model
labels = Model2.labels_

In [None]:
#the silhouette score for this dataset
# Score on the six feature columns only: dataD also holds the 'Cluster' and
# 'kmean' label columns by now, and including them would inflate the score.
silhouette_score(dataD[["n_inputs", "n_outputs", "send_value", "receive_value", "btime", "txs"]], labels)

In [None]:
#Computing the silhouette score of K-means for k = 2..7
# Cluster and score on the six feature columns only: dataD also holds the
# 'Cluster' and 'kmean' label columns by now, and using them as features would
# bias both the clustering and the score.
# random_state and n_init are pinned so the scores are reproducible.
silhouette = {}
silhouette_features = dataD[["n_inputs", "n_outputs", "send_value", "receive_value", "btime", "txs"]]

for k in range(2,8):
    km = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=1000, random_state=0)
    km.fit(silhouette_features)
    silhouette[k] = silhouette_score(silhouette_features, km.labels_)

In [None]:
#Plotting the silhouette score obtained for every K
cluster_counts = list(silhouette)
scores = list(silhouette.values())
sns.pointplot(x=cluster_counts, y=scores)
plt.title("Silhouette Scores for Each K")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Silhouette Scores")
plt.show()

# Principal Component Analysis

In [None]:
#Showing the first rows of the clustered dataset before Principal Component Analysis
dataD.head()

In [None]:
#Preparing the dataset for Principal Component Analysis
# dataD was sliced from `final` row for row, so the cluster labels already line
# up with the rows of `final` and can be taken directly. Merging the two frames
# on their six feature values would multiply rows whenever two blocks share
# identical features, misaligning labels and points.
PCAD = final.copy()
FPCAD = dataD['Cluster']
PCAD

In [None]:
#Listing the column names of the PCA input
PCAD.columns

In [None]:
#Standardizing the PCA input with StandardScaler (fit and transform in one step)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(PCAD)

In [None]:
#Fitting a two-component Principal Component Analysis on the scaled data
pca = PCA(n_components = 2)
pca.fit(scaled_data)

In [None]:
#Projecting the scaled data onto the fitted components and printing the shape before and after
x_pca = pca.transform(scaled_data)
for matrix in (scaled_data, x_pca):
    print(matrix.shape)

In [None]:
#Scatter plot of the two principal components, coloured by cluster label
fig, ax = plt.subplots(figsize=(8,6))
first_component = x_pca[:,0]
second_component = x_pca[:,1]
ax.scatter(first_component, second_component, c=FPCAD, cmap='plasma')
ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')

In [None]:
#Showing the loadings of the fitted principal components
pca.components_

In [None]:
#Heatmap of the component loadings, one row per component and one column per input variable
df_comp = pd.DataFrame(data=pca.components_, columns=PCAD.columns)
plt.figure(figsize=(12,6))
sns.heatmap(data=df_comp, cmap='plasma')

# Random Forest

In [None]:
#Preparing the dataset for Random Forest
# Removes the 'kmean' label column in place; the 'Cluster' column stays and is used as the target below.
dataD.drop(columns=['kmean'], inplace=True)
dataD

In [None]:
#Creating Test and Train Data
# Each row draws a uniform number in [0, 1); rows with a draw <= 0.75 are flagged as training rows.
# NOTE(review): the draw uses NumPy's global random state, so the split depends
# on whatever consumed that state in earlier cells - confirm reproducibility is acceptable.
dataD['is_train'] = np.random.uniform(0, 1, len(dataD)) <= .75

#the head of the dataset
dataD.head()

In [None]:
#Splitting the rows into a training frame and a test frame using the boolean is_train flag
in_train = dataD['is_train']
train = dataD[in_train]
test = dataD[~in_train]

#Reporting how many observations ended up in each frame
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

In [None]:
#Creating a list of the feature column's names
# Takes the first six columns of dataD by position, so this relies on the six
# feature columns coming before 'Cluster' and 'is_train'.
features = dataD.columns[:6]

#Viewing features
print(features)

In [None]:
#Viewing target
# The K-means cluster label of the training rows is the classification target.
y = train['Cluster']
y.head()

In [None]:
#Creating a random forest Classifier (fixed random_state, two parallel jobs).
clf = RandomForestClassifier(n_jobs=2, random_state=0)

#Training the Classifier on the feature columns of the training rows
clf.fit(train[features], y)

In [None]:
#Applying the trained classifier to the feature columns of the test rows
clf.predict(test[features])

In [None]:
#Showing the predicted class probabilities of the first 10 test observations
clf.predict_proba(test[features])[:10]

In [None]:
#Mapping the predicted cluster labels to readable class names (1 -> 'Suspicious', anything else -> 'Normal')
preds = clf.predict(test[features])
preds = pd.Series(np.where(preds == 1, 'Suspicious', 'Normal'))

#Viewing the first PREDICTED classes
preds.head()

In [None]:
#Viewing the ACTUAL classes for the first five test observations, mapped to the same names as the predictions
actual = test['Cluster']
actual = pd.Series(np.where(actual == 1, 'Suspicious', 'Normal'))
actual.head()

## Evaluation with Confusion Matrix

In [None]:
#Setting up the confusion matrix (rows: actual classes, columns: predicted classes)
# labels= pins the matrix to 2x2 in a fixed order even when one class is missing
# from the test rows or the predictions; otherwise the matrix would shrink and
# no longer match the two row/column names given to it in the next cell.
cm = confusion_matrix(actual, preds, labels=['Normal', 'Suspicious'])
cm

In [None]:
#Wrapping the confusion matrix array in a labelled dataframe so it is easy to plot
class_names = ['Normal', 'Suspicious']
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

In [None]:
#Plotting the confusion matrix
plt.figure(figsize=(5,4))
# fmt='d' prints the cell counts as plain integers; the default format switches
# to scientific notation for counts of three or more digits.
sns.heatmap(cm_df, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()