# Load Libraries

In [1]:
import os

In [3]:
# Make the SDG project folder the working directory so the relative
# './input/...' and './output/...' paths used by later cells resolve against it.
# NOTE(review): absolute path, presumably a mounted Google Drive on Colab - confirm
# the drive is mounted before this cell runs.
os.chdir('/content/drive/MyDrive/Colab_Notebooks/synthetic_data/SDG')

In [4]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,roc_auc_score
import matplotlib.pyplot as plt

# Predictive Analysis

In [5]:
def recall(preds, dtrain):
    """Return the pair ``('recall', score)`` for a set of predictions.

    The true labels come from ``dtrain.get_label()``; ``preds`` are rounded
    with ``np.round`` before being passed to ``recall_score``.
    NOTE(review): presumably used as a custom eval metric with an
    xgboost-style ``dtrain`` - confirm against the caller.
    """
    y_true = dtrain.get_label()
    y_hat = np.round(preds)
    score = recall_score(y_true, y_hat)
    return 'recall', score

def precision(preds, dtrain):
    """Return the pair ``('precision', score)`` for a set of predictions.

    The true labels come from ``dtrain.get_label()``; ``preds`` are rounded
    with ``np.round`` before being passed to ``precision_score``.
    NOTE(review): presumably used as a custom eval metric with an
    xgboost-style ``dtrain`` - confirm against the caller.
    """
    y_true = dtrain.get_label()
    y_hat = np.round(preds)
    score = precision_score(y_true, y_hat)
    return 'precision', score

def roc_auc(preds, dtrain):
    """Return the pair ``('roc_auc', score)`` for a set of predictions.

    Unlike ``recall``/``precision``, the raw (unrounded) ``preds`` are scored
    against ``dtrain.get_label()`` with ``roc_auc_score``.
    """
    y_true = dtrain.get_label()
    score = roc_auc_score(y_true, preds)
    return 'roc_auc', score

In [100]:
# Load the session table from the synthetic CSV (comma is read_csv's default separator).
# To load the other file instead, point this at './input/session2014_state.csv'.
session = pd.read_csv('./output/synthetic_session2014.csv')

In [None]:
# Display (rows, columns) of the loaded session table.
session.shape

In [102]:
# Remove the 'Unnamed: 0' column from session in place.
# errors='ignore' keeps this cell safe to re-run: without it a second execution
# raises KeyError because the column has already been removed.
session.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [57]:
# Load the KDD19 table from the input folder (comma is read_csv's default separator).
# To load the other file instead, point this at './output/synthetic_kdd.csv'.
kdd = pd.read_csv('./input/KDD19_state.csv')

In [None]:
# Display (rows, columns) of the loaded kdd table.
kdd.shape

In [59]:
# Remove the 'Unnamed: 0' column from kdd in place.
# errors='ignore' keeps this cell safe to re-run: without it a second execution
# raises KeyError because the column has already been removed.
kdd.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [12]:
# Load the intention table from the synthetic CSV (comma is read_csv's default separator).
# `df` and `intention` are two names for the same DataFrame object.
# NOTE(review): `df` looks like a leftover alias - confirm nothing relies on it.
# To load the other file instead, point this at './input/intention_state.csv'.
df = pd.read_csv('./output/synthetic_intention.csv')
intention = df

In [None]:
# Display (rows, columns) of the loaded intention table.
intention.shape

In [14]:
# Remove the 'Unnamed: 0' column from intention in place (kept in place so any
# other name bound to the same frame sees the change too).
# errors='ignore' keeps this cell safe to re-run: without it a second execution
# raises KeyError because the column has already been removed.
intention.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [None]:
# Display the column names of session; the feature list in the next cell is chosen from these.
session.columns

In [104]:
# Columns of `session` used as model inputs.
# NOTE(review): both 'ReciproalRank' and 'ReciprocalRank' are listed - confirm
# that both columns really exist and that neither is a misspelled duplicate.
features = [
    'QueryOrder',
    'QueryLen',
    'QueryDwellTime',
    'NumClickedPage',
    'NumClickedDoc',
    'NumKeyDoc',
    'NumRelevantDoc',
    'AveDwellTime',
    'TotalDwellTime',
    'AveClickRank',
    'ClickDepth',
    'SearchEngineDwellTime',
    'ClickPrecision',
    'TimetoFirstLastClick',
    'ReciproalRank',
    'ratiofirstclick',
    'ReformulationTime',
    'isSegmentStart',
    'isSegmentEnd',
    'NumRelevantDoc_real',
    'NumKeyDoc_real',
    'DCG3',
    'DCG5',
    'DCG10',
    'NDCG3',
    'NDCG5',
    'NDCG10',
    'Precision3',
    'Precision5',
    'Precision10',
    'ReciprocalRank',
    'CBF1',
    'CBF1_1',
    'CBF1_2',
    'averageRelevanceScore',
    'irrelevantDocs30Num',
    'new_unique_query_terms_num',
    'query_similarity',
    'NumOfClick3',
    'NumOfClick5',
    'NumOfClick6',
]

# Target column. It is kept as a one-element list, so `y` below is a
# single-column DataFrame rather than a Series.
label = ['state']

# Design matrix and target selected from the session table.
X = session[features]
y = session[label]

In [105]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split; the fixed seed makes the split repeatable.
# (A variant with random_state=42 and stratify=y was noted by the author but is not used.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [106]:
# Standardize the features: the scaler is fitted on the training split only,
# and the same fitted transformation is then applied to the test split.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
# svm
from sklearn.svm import SVC

# Fit a support-vector classifier with a sigmoid kernel.
# Kernel SVMs are sensitive to feature scale, so train and predict on the
# standardized matrices (X_train_std / X_test_std) rather than the raw features.
# np.ravel flattens the single-column label frame into the 1-D target that
# scikit-learn expects, which avoids the column-vector conversion warning.
svc = SVC(kernel='sigmoid', random_state=0)
svc.fit(X_train_std, np.ravel(y_train))

# Predict on the held-out split (scaled with the same fitted scaler).
y_pred = svc.predict(X_test_std)

# Rows of the confusion matrix are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the matrix as a shaded grid and write each count into its cell.
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

In [None]:
# Class-weighted precision, recall and F1, plus overall accuracy, for the
# predictions currently held in y_pred.
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred))
print('F1 Score: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))

In [None]:
# multilayer perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification  # not used in this cell; kept in case other cells rely on it

# Fit the MLP classifier.
# Neural networks train poorly on features with very different ranges, so the
# standardized matrices (X_train_std / X_test_std) are used instead of the raw ones.
# np.ravel flattens the single-column label frame into the 1-D target that
# scikit-learn expects, which avoids the column-vector conversion warning.
clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train_std, np.ravel(y_train))

# Predict on the held-out split (scaled with the same fitted scaler).
y_pred = clf.predict(X_test_std)

# Rows of the confusion matrix are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the matrix as a shaded grid and write each count into its cell.
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

In [None]:
# Class-weighted precision, recall and F1, plus overall accuracy, for the
# predictions currently held in y_pred.
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred))
print('F1 Score: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))

In [None]:
#random forest
from sklearn.ensemble import RandomForestClassifier

# Fit the random-forest classifier on the raw features.
# random_state pins the bootstrap samples and feature sub-sampling so the
# scores below are reproducible from run to run.
# np.ravel flattens the single-column label frame into the 1-D target that
# scikit-learn expects, which avoids the column-vector conversion warning.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, np.ravel(y_train))

# Predict on the held-out split.
y_pred = clf.predict(X_test)

# Rows of the confusion matrix are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the matrix as a shaded grid and write each count into its cell.
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

In [None]:
# Class-weighted precision, recall and F1, plus overall accuracy, for the
# predictions currently held in y_pred.
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred))
print('F1 Score: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))

In [None]:
#knn
from sklearn.neighbors import KNeighborsClassifier

# Fit the k-nearest-neighbours classifier (k = 4).
# KNN ranks neighbours by distance, so features with large numeric ranges would
# dominate on raw data; the standardized matrices are used instead.
# np.ravel flattens the single-column label frame into the 1-D target that
# scikit-learn expects, which avoids the column-vector conversion warning.
clf = KNeighborsClassifier(n_neighbors=4)
clf.fit(X_train_std, np.ravel(y_train))

# Predict on the held-out split (scaled with the same fitted scaler).
y_pred = clf.predict(X_test_std)

# Rows of the confusion matrix are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the matrix as a shaded grid and write each count into its cell.
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

In [None]:
# Class-weighted precision, recall and F1, plus overall accuracy, for the
# predictions currently held in y_pred.
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred))
print('F1 Score: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))

In [None]:
#dt
from sklearn.tree import DecisionTreeClassifier  # Import Decision Tree Classifier

# Fit the decision-tree classifier on the raw features.
# random_state pins the tie-breaking between equally good splits so the scores
# below are reproducible from run to run.
# np.ravel flattens the single-column label frame into a 1-D target.
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, np.ravel(y_train))

# Predict on the held-out split.
y_pred = clf.predict(X_test)

# Rows of the confusion matrix are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the matrix as a shaded grid and write each count into its cell.
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

In [None]:
# Class-weighted precision, recall and F1, plus overall accuracy, for the
# predictions currently held in y_pred.
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred))
print('F1 Score: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))

In [None]:
#nb
from sklearn.naive_bayes import GaussianNB

# Fit the Gaussian naive-Bayes classifier on the raw features.
# np.ravel flattens the single-column label frame into the 1-D target that
# scikit-learn expects, which avoids the column-vector conversion warning.
clf = GaussianNB()
clf.fit(X_train, np.ravel(y_train))

# Predict on the held-out split.
y_pred = clf.predict(X_test)

# Rows of the confusion matrix are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the matrix as a shaded grid and write each count into its cell.
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

In [None]:
# Class-weighted precision, recall and F1, plus overall accuracy, for the
# predictions currently held in y_pred.
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred))
print('F1 Score: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))

In [None]:
#lr
# LinearRegression stays imported in case other cells rely on the name; this
# cell no longer uses it.
from sklearn.linear_model import LinearRegression, LogisticRegression

# Fit a logistic-regression classifier.
# The previous version fitted LinearRegression (a regressor, whose continuous
# outputs cannot be fed to the classification metrics) and then predicted with
# the earlier `svc` model, so the matrix and scores shown were the SVM's.
# The standardized matrices are used to help the solver converge; max_iter is
# raised above the default for the same reason.
# np.ravel flattens the single-column label frame into the 1-D target that
# scikit-learn expects, which avoids the column-vector conversion warning.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_std, np.ravel(y_train))

# Predict with the model fitted in this cell (scaled with the same fitted scaler).
y_pred = clf.predict(X_test_std)

# Rows of the confusion matrix are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the matrix as a shaded grid and write each count into its cell.
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

In [None]:
# Class-weighted precision, recall and F1, plus overall accuracy, for the
# predictions currently held in y_pred.
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred))
print('F1 Score: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))