## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [2]:
# File location and type
file_location = "/FileStore/tables/creditcard.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14,_c15,_c16,_c17,_c18,_c19,_c20,_c21,_c22,_c23,_c24,_c25,_c26,_c27,_c28,_c29,_c30
Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.3598071336738,-0.0727811733098497,2.53634673796914,1.37815522427443,-0.338320769942518,0.462387777762292,0.239598554061257,0.0986979012610507,0.363786969611213,0.0907941719789316,-0.551599533260813,-0.617800855762348,-0.991389847235408,-0.311169353699879,1.46817697209427,-0.470400525259478,0.207971241929242,0.0257905801985591,0.403992960255733,0.251412098239705,-0.018306777944153,0.277837575558899,-0.110473910188767,0.0669280749146731,0.128539358273528,-0.189114843888824,0.133558376740387,-0.0210530534538215,149.62,0
0,1.19185711131486,0.26615071205963,0.16648011335321,0.448154078460911,0.0600176492822243,-0.0823608088155687,-0.0788029833323113,0.0851016549148104,-0.255425128109186,-0.166974414004614,1.61272666105479,1.06523531137287,0.48909501589608,-0.143772296441519,0.635558093258208,0.463917041022171,-0.114804663102346,-0.183361270123994,-0.145783041325259,-0.0690831352230203,-0.225775248033138,-0.638671952771851,0.101288021253234,-0.339846475529127,0.167170404418143,0.125894532368176,-0.00898309914322813,0.0147241691924927,2.69,0
1,-1.35835406159823,-1.34016307473609,1.77320934263119,0.379779593034328,-0.503198133318193,1.80049938079263,0.791460956450422,0.247675786588991,-1.51465432260583,0.207642865216696,0.624501459424895,0.066083685268831,0.717292731410831,-0.165945922763554,2.34586494901581,-2.89008319444231,1.10996937869599,-0.121359313195888,-2.26185709530414,0.524979725224404,0.247998153469754,0.771679401917229,0.909412262347719,-0.689280956490685,-0.327641833735251,-0.139096571514147,-0.0553527940384261,-0.0597518405929204,378.66,0
1,-0.966271711572087,-0.185226008082898,1.79299333957872,-0.863291275036453,-0.0103088796030823,1.24720316752486,0.23760893977178,0.377435874652262,-1.38702406270197,-0.0549519224713749,-0.226487263835401,0.178228225877303,0.507756869957169,-0.28792374549456,-0.631418117709045,-1.0596472454325,-0.684092786345479,1.96577500349538,-1.2326219700892,-0.208037781160366,-0.108300452035545,0.00527359678253453,-0.190320518742841,-1.17557533186321,0.647376034602038,-0.221928844458407,0.0627228487293033,0.0614576285006353,123.5,0
2,-1.15823309349523,0.877736754848451,1.548717846511,0.403033933955121,-0.407193377311653,0.0959214624684256,0.592940745385545,-0.270532677192282,0.817739308235294,0.753074431976354,-0.822842877946363,0.53819555014995,1.3458515932154,-1.11966983471731,0.175121130008994,-0.451449182813529,-0.237033239362776,-0.0381947870352842,0.803486924960175,0.408542360392758,-0.00943069713232919,0.79827849458971,-0.137458079619063,0.141266983824769,-0.206009587619756,0.502292224181569,0.219422229513348,0.215153147499206,69.99,0
2,-0.425965884412454,0.960523044882985,1.14110934232219,-0.168252079760302,0.42098688077219,-0.0297275516639742,0.476200948720027,0.260314333074874,-0.56867137571251,-0.371407196834471,1.34126198001957,0.359893837038039,-0.358090652573631,-0.137133700217612,0.517616806555742,0.401725895589603,-0.0581328233640131,0.0686531494425432,-0.0331937877876282,0.0849676720682049,-0.208253514656728,-0.559824796253248,-0.0263976679795373,-0.371426583174346,-0.232793816737034,0.105914779097957,0.253844224739337,0.0810802569229443,3.67,0
4,1.22965763450793,0.141003507049326,0.0453707735899449,1.20261273673594,0.191880988597645,0.272708122899098,-0.00515900288250983,0.0812129398830894,0.464959994783886,-0.0992543211289237,-1.41690724314928,-0.153825826253651,-0.75106271556262,0.16737196252175,0.0501435942254188,-0.443586797916727,0.00282051247234708,-0.61198733994012,-0.0455750446637976,-0.21963255278686,-0.167716265815783,-0.270709726172363,-0.154103786809305,-0.780055415004671,0.75013693580659,-0.257236845917139,0.0345074297438413,0.00516776890624916,4.99,0
7,-0.644269442348146,1.41796354547385,1.0743803763556,-0.492199018495015,0.948934094764157,0.428118462833089,1.12063135838353,-3.80786423873589,0.615374730667027,1.24937617815176,-0.619467796121913,0.291474353088705,1.75796421396042,-1.32386521970526,0.686132504394383,-0.0761269994382006,-1.2221273453247,-0.358221569869078,0.324504731321494,-0.156741852488285,1.94346533978412,-1.01545470979971,0.057503529867291,-0.649709005559993,-0.415266566234811,-0.0516342969262494,-1.20692108094258,-1.08533918832377,40.8,0
7,-0.89428608220282,0.286157196276544,-0.113192212729871,-0.271526130088604,2.6695986595986,3.72181806112751,0.370145127676916,0.851084443200905,-0.392047586798604,-0.410430432848439,-0.705116586646536,-0.110452261733098,-0.286253632470583,0.0743553603016731,-0.328783050303565,-0.210077268148783,-0.499767968800267,0.118764861004217,0.57032816746536,0.0527356691149697,-0.0734251001059225,-0.268091632235551,-0.204232669947878,1.0115918018785,0.373204680146282,-0.384157307702294,0.0117473564581996,0.14240432992147,93.2,0


In [3]:
%python
import pyspark
from pyspark import SparkContext, SparkConf
con = SparkConf().setAppName("CreditCardFraudDetection").setMaster("local")

from pyspark.sql import SparkSession
from pyspark.sql import SparkSession

sparkses = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

from pyspark.sql.functions import explode
from pyspark.sql.functions import split
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import norm
import numpy as np
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import DenseVector
import seaborn as sns

In [4]:

%python
ccfd = spark.read.load("/FileStore/tables/creditcard.csv", format='csv', header='true', inferSchema='true')


type(ccfd)

In [5]:
def spark_shape(self):
    return (self.count(), len(self.columns))
pyspark.sql.dataframe.DataFrame.shape = spark_shape

In [6]:
ccfd.shape()

In [7]:
ccfd.printSchema()

In [8]:
%python
classquantity = ccfd.groupBy("Class").count()
classquantity.show()

In [9]:
ccfd

In [10]:
ccfd['Amount']

In [11]:
# #3D scatter plot
# FilteredData = ccfd[['Time','Amount', 'Class']]
# FilteredData

In [12]:
# plt.close()
# sns.set_style("whitegrid")
# g =sns.pairplot(ccfd)
# display()

In [13]:

ccfd= ccfd.toPandas()
ccfd= ccfd.sample(frac=1)

# amount of fraud classes 492 rows.
fraud_df_1 = ccfd.loc[ccfd['Class'] == 1]
non_fraud_df_1 = ccfd.loc[ccfd['Class'] == 0][:492]

normal_distributed_df_1 = pd.concat([fraud_df_1, non_fraud_df_1])

# Shuffle dataframe rows
new_df_0 = normal_distributed_df_1.sample(frac=1, random_state=42)

new_df_0.shape

In [14]:
print('Distribution of the Classes in the subsample dataset')
print(new_df_0['Class'].value_counts()/len(new_df_0))

sns.countplot('Class', data=new_df_0)
plt.title('Equally Distributed Classes', fontsize=14)
display()


In [15]:
# subsample in our correlation
f, (ax1, ax2) = plt.subplots(2, 1, figsize=(24,20))

# Entire DataFrame
corr = ccfd.corr()
sns.heatmap(corr, cmap='coolwarm_r', annot_kws={'size':20}, ax=ax1)
ax1.set_title("Imbalanced Correlation Matrix", fontsize=15)

sub_sample_corr = new_df_0.corr()
sns.heatmap(sub_sample_corr, cmap='coolwarm_r', annot_kws={'size':18}, ax=ax2)
ax2.set_title('SubSample Correlation Matrix', fontsize=15)
display()

In [16]:
f, axes = plt.subplots(ncols=4, figsize=(20,4))

# Negative Correlations with our Class (The lower our feature's value the more likely it will be a fraud transaction)
sns.boxplot(x="Class", y="V17", data=new_df_0,  ax=axes[0])
axes[0].set_title('V17 vs. Class Negative Correlation')

sns.boxplot(x="Class", y="V14", data=new_df_0,  ax=axes[1])
axes[1].set_title('V14 vs. Class Negative Correlation')

sns.boxplot(x="Class", y="V12", data=new_df_0, ax=axes[2])
axes[2].set_title('V12 vs. Class Negative Correlation')

sns.boxplot(x="Class", y="V10", data=new_df_0, ax=axes[3])
axes[3].set_title('V10 vs. Class Negative Correlation')

display()

In [17]:
f, axes = plt.subplots(ncols=4, figsize=(20,4))

# Positive correlations (The higher the feature the probability increases that it will be a fraud transaction)
sns.boxplot(x="Class", y="V11", data=new_df_0, color='#FB8861',  ax=axes[0])
axes[0].set_title('V11 vs. Class Positive Correlation')

sns.boxplot(x="Class", y="V4", data=new_df_0, color='#FB8861', ax=axes[1])
axes[1].set_title('V4 vs. Class Positive Correlation')

sns.boxplot(x="Class", y="V2", data=new_df_0, color='#C5B3F9', ax=axes[2])
axes[2].set_title('V2 vs. Class Positive Correlation')

sns.boxplot(x="Class", y="V19", data=new_df_0, color='#FA1062', ax=axes[3])
axes[3].set_title('V19 vs. Class Positive Correlation')

display()

In [18]:
f, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(20, 6))

v14_fraud_dist_1 = new_df_0['V14'].loc[new_df_0['Class'] == 1].values
sns.distplot(v14_fraud_dist_1, ax=ax1, fit=norm, color='#FB8861')
ax1.set_title('V14 Distribution \n (Fraud Transactions)', fontsize=15)

v12_fraud_dist_1 = new_df_0['V12'].loc[new_df_0['Class'] == 1].values
sns.distplot(v12_fraud_dist_1,ax=ax2, fit=norm, color='#56F9BB')
ax2.set_title('V12 Distribution \n (Fraud Transactions)', fontsize=15)


v10_fraud_dist_1 = new_df_0['V10'].loc[new_df_0['Class'] == 1].values
sns.distplot(v10_fraud_dist_1, ax=ax3, fit=norm, color='#4B8BBE')
ax3.set_title('V10 Distribution \n (Fraud Transactions)', fontsize=15)

display()

In [19]:
# # -----> V14 Removing Outliers (Highest Negative Correlated with Labels)
v14_fraud_1 = new_df_0['V14'].loc[new_df_0['Class'] == 1].values
q25, q75 = np.percentile(v14_fraud_1, 25), np.percentile(v14_fraud_1, 75)
print('Quartile 25: {} | Quartile 75: {}'.format(q25, q75))
v14_iqr = q75 - q25
print('iqr: {}'.format(v14_iqr))

v14_cut_off_1 = v14_iqr * 1.5
v14_lower_1, v14_upper_1 = q25 - v14_cut_off_1, q75 + v14_cut_off_1
print('Cut Off: {}'.format(v14_cut_off_1))
print('V14 Lower: {}'.format(v14_lower_1))
print('V14 Upper: {}'.format(v14_upper_1))

outliers = [x for x in v14_fraud_1 if x < v14_lower_1 or x > v14_upper_1]
print('Feature V14 Outliers for Fraud Cases: {}'.format(len(outliers)))
print('V10 outliers:{}'.format(outliers))

new_df_0 = new_df_0.drop(new_df_0[(new_df_0['V14'] > v14_upper_1) | (new_df_0['V14'] < v14_lower_1)].index)
print('----' * 44)


# -----> V12 removing outliers from fraud transactions
v12_fraud_1 = new_df_0['V12'].loc[new_df_0['Class'] == 1].values
q25, q75 = np.percentile(v12_fraud_1, 25), np.percentile(v12_fraud_1, 75)
v12_iqr = q75 - q25

v12_cut_off_1 = v12_iqr * 1.5
v12_lower_1, v12_upper_1 = q25 - v12_cut_off_1, q75 + v12_cut_off_1
print('V12 Lower: {}'.format(v12_lower_1))
print('V12 Upper: {}'.format(v12_upper_1))
outliers = [x for x in v12_fraud_1 if x < v12_lower_1 or x > v12_upper_1]
print('V12 outliers: {}'.format(outliers))
print('Feature V12 Outliers for Fraud Cases: {}'.format(len(outliers)))
new_df_0 = new_df_0.drop(new_df_0[(new_df_0['V12'] > v12_upper_1) | (new_df_0['V12'] < v12_lower_1)].index)
print('Number of Instances after outliers removal: {}'.format(len(new_df_0)))
print('----' * 44)


# Removing outliers V10 Feature
v10_fraud_1 = new_df_0['V10'].loc[new_df_0['Class'] == 1].values
q25, q75 = np.percentile(v10_fraud_1, 25), np.percentile(v10_fraud_1, 75)
v10_iqr = q75 - q25


v10_cut_off_1 = v10_iqr * 1.5
v10_lower_1, v10_upper_1 = q25 - v10_cut_off_1, q75 + v10_cut_off_1
print('V10 Lower: {}'.format(v10_lower_1))
print('V10 Upper: {}'.format(v10_upper_1))
outliers = [x for x in v10_fraud_1 if x < v10_lower_1 or x > v10_upper_1]
print('V10 outliers: {}'.format(outliers))
print('Feature V10 Outliers for Fraud Cases: {}'.format(len(outliers)))
new_df_0 = new_df_0.drop(new_df_0[(new_df_0['V10'] > v10_upper_1) | (new_df_0['V10'] < v10_lower_1)].index)
print('Number of Instances after outliers removal: {}'.format(len(new_df_0)))


type(new_df_0)

In [20]:
f,(ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20,6))

colors_0 = ['#B3F9C5', '#f9c5b3']
# Boxplots with outliers removed
# Feature V14
sns.boxplot(x="Class", y="V14", data=new_df_0, ax=ax1, palette=colors_0)
ax1.set_title("V14 Feature \n Reduction of outliers", fontsize=15)
ax1.annotate('Fewer extreme \n outliers', xy=(0.98, -17.5), xytext=(0, -12),
            arrowprops=dict(facecolor='black'),
            fontsize=14)

# Feature 12
sns.boxplot(x="Class", y="V12", data=new_df_0, ax=ax2, palette=colors_0)
ax2.set_title("V12 Feature \n Reduction of outliers", fontsize=15)
ax2.annotate('Fewer extreme \n outliers', xy=(0.98, -17.3), xytext=(0, -12),
            arrowprops=dict(facecolor='black'),
            fontsize=15)

# Feature V10
sns.boxplot(x="Class", y="V10", data=new_df_0, ax=ax3, palette=colors_0)
ax3.set_title("V10 Feature \n Reduction of outliers", fontsize=15)
ax3.annotate('Fewer extreme \n outliers', xy=(0.95, -16.5), xytext=(0, -12),
            arrowprops=dict(facecolor='black'),
            fontsize=15)


display()

In [21]:
dff = spark.createDataFrame(new_df_0)


In [22]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
win_1 = Window().orderBy('Time')
dff = dff.withColumn("idx", row_number().over(win_1))

In [23]:
training_df_1 = dff.rdd.map(lambda x: (DenseVector(x[0:29]),x[30],x[31]))

training_df_1 = spark.createDataFrame(training_df_1,["features","label","index"])

training_df_1.head()

In [24]:
training_df_1 = training_df_1.select("index","features","label")

train_data_1, test_data_1 = training_df_1.randomSplit([.8,.2],seed=1234)

In [25]:
train_data_1.groupBy("label").count().show()

In [26]:
test_data_1.groupBy("label").count().show()

In [27]:
# gbt = GBTClassifier(featuresCol="features", maxIter=100,maxDepth=8)
# model = gbt.fit(train_data_1)


model = GBTClassifier(maxIter = 10, labelCol = 'label').fit(train_data_1)


In [28]:
predictions = model.transform(test_data_1)
predictions.groupBy("prediction").count().show()


In [29]:
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

In [30]:
predictions = predictions.withColumn("fraudPrediction",when((predictions.label==1)&(predictions.prediction==1),1).otherwise(0))
predictions.groupBy("fraudPrediction").count().show()

In [31]:
predictions.groupBy("label").count().show()

In [32]:
from pyspark.sql.functions import col
accurateFraud = predictions.groupBy("fraudPrediction").count().where(predictions.fraudPrediction==1).head()[1]
totalFraud = predictions.groupBy("label").count().where(predictions.label==1).head()[1]
FraudPredictionAccuracy = (accurateFraud/totalFraud)*100
FraudPredictionAccuracy

In [33]:
tp = predictions[(predictions.label == 1) & (predictions.prediction == 1)].count()
tn = predictions[(predictions.label == 0) & (predictions.prediction == 0)].count()
fp = predictions[(predictions.label == 0) & (predictions.prediction == 1)].count()
fn = predictions[(predictions.label == 1) & (predictions.prediction == 0)].count()



In [34]:
#For GBT

print("True Positive: ",tp,"\nTrue Negative: ",tn,"\nFalse Positive: ",fp,"\nFalse Negative: ",fn)
print("Recall: ",tp/(tp+fn))
print("Precision: ", tp/(tp+fp))

In [35]:
# # replace None values
def get_column(df, colname):
    coldata = df.rdd.map(lambda r: r[colname]).collect()
    coldata = ['None' if v is None else v for v in coldata] 
    return(coldata)
def model_acc(model, validate=test_data_1):
    pred = model.transform(validate)
    eval_vec = np.array(get_column(pred, "label")) == np.array(get_column(pred, "prediction")) 
    return(eval_vec.sum()/len(eval_vec))
  
# model_acc(gbt)

In [36]:
from pyspark.ml.classification import LogisticRegression

# Create initial LogisticRegression model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

# Train model with Training Data
lrModel = lr.fit(train_data_1)

# Make predictions on test data using the transform() method.
# LogisticRegression.transform() will only use the 'features' column.
lr_prediction_1 = lrModel.transform(test_data_1)
# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(lr_prediction_1)

In [37]:

predictions_1 = lr_prediction_1.withColumn("fraudPrediction",when((lr_prediction_1.label==1)&(lr_prediction_1.prediction==1),1).otherwise(0))

predictions_1.groupBy("fraudPrediction").count().show()

In [38]:
predictions_1.groupBy("label").count().show()

In [39]:
# fraud prediction accuracy using Logistic Regression
accurateFraud_1 = predictions_1.groupBy("fraudPrediction").count().where(predictions_1.fraudPrediction==1).head()[1]
totalFraud_1 = predictions_1.groupBy("label").count().where(predictions_1.label==1).head()[1]
FraudPredictionAccuracy_1 = (accurateFraud_1/totalFraud_1)*100
FraudPredictionAccuracy_1

In [40]:

tp_1 = predictions_1[(predictions_1.label == 1) & (predictions_1.prediction == 1)].count()

In [41]:
tn_1 = predictions_1[(predictions_1.label == 0) & (predictions_1.prediction == 0)].count()

In [42]:
fp_1 = predictions_1[(predictions_1.label == 0) & (predictions_1.prediction == 1)].count()
fn_1 = predictions_1[(predictions_1.label == 1) & (predictions_1.prediction == 0)].count()

In [43]:
#Logistic Regression 

print("True Positive: ",tp_1,"\nTrue Negative: ",tn_1,"\nFalse Positive: ",fp_1,"\nFalse Negative: ",fn_1)
print("Recall: ",tp_1/(tp_1+fn_1))
print("Precision: ", tp_1/(tp_1+fp_1))

In [44]:
from sklearn.model_selection import train_test_split

y = ccfd['Class']
X = ccfd.drop('Class',axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y,random_state=8125)
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=0.333, stratify=y_train,random_state=8125)
train = pd.concat([X_train, y_train],axis=1)
validation = pd.concat([X_validate, y_validate],axis=1)
test = pd.concat([X_test, y_test],axis=1)
print("Percentage of fraud transactions in train is: ",train.Class.mean())
print("Percentage of fraud transactions in test is: ",test.Class.mean())


In [45]:
#Apply KNN on the dataset, find out the best k using 5-Folds CV.
#taking first 20000 samples
data_20000_1 = ccfd[:20000]

In [46]:
data_20000_1.shape

In [47]:
data_20000_1["Class"].value_counts()

In [48]:
#Data is heavily imbalanced here.
data20000_1 = data_20000_1.drop(['Class'], axis=1)
data20000_1.shape

In [49]:
data20000_1_labels = data_20000_1["Class"]
data20000_1_labels.shape

In [50]:
from sklearn.preprocessing import StandardScaler
data20000_1_Std = StandardScaler().fit_transform(data20000_1)
print(data20000_1_Std.shape)
print(type(data20000_1_Std))

In [51]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score


X2 = data20000_1_Std[0:16000]
XTest_1 = data20000_1_Std[16000:20000]
Y2 = data20000_1_labels[0:16000]
YTest_1 = data20000_1_labels[16000:20000]

myList_1 = list(range(0,50))
neighbors_1 = list(filter(lambda x: x%2!=0, myList_1))  

CV_Scores_1 = []

for k in neighbors_1:
    KNN_1 = KNeighborsClassifier(n_neighbors = k, algorithm = 'kd_tree')
    scores_1 = cross_val_score(KNN_1, X2, Y2, cv = 5, scoring='recall')
    CV_Scores_1.append(scores_1.mean())

In [52]:
# CV_Scores_1


In [53]:
plt.figure(figsize = (14, 12))
plt.plot(neighbors_1, CV_Scores_1)
plt.title("Neighbors Vs Recall Score", fontsize=25)
plt.xlabel("Number of Neighbors", fontsize=25)
plt.ylabel("Recall Score", fontsize=25)
plt.grid(linestyle='-', linewidth=0.5)
display()

In [54]:
best_k_1=1
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
KNN_best_1 = KNeighborsClassifier(n_neighbors = best_k_1, algorithm = 'kd_tree')

KNN_best_1.fit(X2, Y2)

prediction_1 = KNN_best_1.predict(XTest_1)

cm_1 = confusion_matrix(YTest_1, prediction_1)

print(cm_1)

tn_k, fp_k, fn_k, tp_k = cm_1.ravel()



In [55]:
#For KNN
print("True Positive: ",tp_k,"\nTrue Negative: ",tn_k,"\nFalse Positive: ",fp_k,"\nFalse Negative: ",fn_k)
print("Recall: ",tp_k/(tp_k+fn_k))
print("Precision: ", tp_k/(tp_k+fp_k))

In [56]:
#applying T-SNE on our dataset
data_1 = ccfd.drop("Class", axis = 1)
cc = ccfd["Class"]


In [57]:
print("the shape of data = ", data_1.shape)
print("the shape of class = ", cc.shape)

In [58]:
#Data-preprocessing: Standardizing the data
from sklearn.preprocessing import StandardScaler
standardized_data_1 = StandardScaler().fit_transform(data_1)
print(standardized_data_1.shape)
print(type(standardized_data_1))
#here we have just standardized our data to col-std so that the mean = 0 and standard-deviation = 1.

In [59]:
mn = np.mean(standardized_data_1)
sds = np.std(standardized_data_1)
print(mn)
print(sds)

In [60]:
from sklearn.manifold import TSNE
data_25k = standardized_data_1[0:25000]
labels_25k = cc[0:25000]

model = TSNE(n_components=2, random_state=0, perplexity=50, n_iter=1000)

tsne_data = model.fit_transform(data_25k)

# creating a new data frame which help us in ploting the result data
tsne_data = np.vstack((tsne_data.T, labels_25k)).T
tsne_df = pd.DataFrame(data=tsne_data, columns=("Dimension1", "Dimension2", "Class"))

# Ploting the result of tsne
sns.FacetGrid(tsne_df, hue="Class", size=8).map(plt.scatter, 'Dimension1', 'Dimension2').add_legend()
plt.show()
display()

In [61]:

model = TSNE(n_components=2, random_state=0, perplexity=50, n_iter=3500)

tsne_data = model.fit_transform(data_25k)

# creating a new data frame which help us in ploting the result data
tsne_data = np.vstack((tsne_data.T, labels_25k)).T
tsne_df = pd.DataFrame(data=tsne_data, columns=("Dimension1", "Dimension2", "Class"))

# Ploting the result of tsne
sns.FacetGrid(tsne_df, hue="Class", size=8).map(plt.scatter, 'Dimension1', 'Dimension2').add_legend()
plt.show()
display()

In [62]:
model = TSNE(n_components=2, random_state=0, perplexity=100, n_iter=2000)

tsne_data = model.fit_transform(data_25k)

# creating a new data frame which help us in ploting the result data
tsne_data = np.vstack((tsne_data.T, labels_25k)).T
tsne_df = pd.DataFrame(data=tsne_data, columns=("Dimension1", "Dimension2", "Class"))

# Ploting the result of tsne
sns.FacetGrid(tsne_df, hue="Class", size=8).map(plt.scatter, 'Dimension1', 'Dimension2').add_legend()
plt.show()
display()

In [63]:
# model = TSNE(n_components=2, random_state=0, perplexity=500, n_iter=1200)

# tsne_data = model.fit_transform(data_25k)

# # creating a new data frame which help us in ploting the result data
# tsne_data = np.vstack((tsne_data.T, labels_25k)).T
# tsne_df = pd.DataFrame(data=tsne_data, columns=("Dimension1", "Dimension2", "Class"))

# # Ploting the result of tsne
# sns.FacetGrid(tsne_df, hue="Class", size=8).map(plt.scatter, 'Dimension1', 'Dimension2').add_legend()
# plt.show()
# display()

In [64]:
# model = TSNE(n_components=2, random_state=0, perplexity=10, n_iter=1200)

# tsne_data = model.fit_transform(data_25k)

# # creating a new data frame which help us in ploting the result data
# tsne_data = np.vstack((tsne_data.T, labels_25k)).T
# tsne_df = pd.DataFrame(data=tsne_data, columns=("Dimension1", "Dimension2", "Class"))

# # Ploting the result of tsne
# sns.FacetGrid(tsne_df, hue="Class", size=8).map(plt.scatter, 'Dimension1', 'Dimension2').add_legend()
# plt.show()