In [12]:
import os
import sys

# Switch to True when the notebook runs inside Google Colab.
colab = False

if colab:
    # Mount Google Drive so files stored there become reachable from the notebook.
    from google.colab import drive
    drive.mount('/content/drive/')

# Let Python resolve modules that live one directory above the notebook.
sys.path.append("..")



 <a id="1"></a>
# <p style="background-color:#E598D8;font-family:newtimeroman;color:#E1F16B;font-size:150%;text-align:center;border-radius:20px 60px;">IMPORTING LIBRARIES</p>

**The code primarily uses the NLTK and scikit-learn libraries for preprocessing, training, and testing.**

In [13]:
#Importing all the libraries to be used
from python import Lem #importing python file for Filipino lemmatization
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import pickle
import nltk

# nltk library for preprocessing and data cleaning
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

#training, testing, and evaluation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, classification_report, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay


In [None]:
# NLTK resources that Colab cannot find unless they are downloaded first
if colab:
    for resource in ('punkt', 'stopwords', 'wordnet'):
        nltk.download(resource)


In [None]:
# Load the pre-split training/test features (.npy) and labels (.csv).
if colab:
    # In Colab the feature arrays are read from the mounted Google Drive folder.
    directory = '/content/drive/MyDrive/Thesis/'
else:
    # Fix: `directory` used to be undefined outside Colab, so np.load raised
    # NameError. An empty prefix resolves the files against the working directory.
    directory = ''

# NOTE(review): the label CSVs are always read from the working directory, while
# the feature arrays are read from `directory` -- confirm this is intended in Colab.
y_train = pd.read_csv('train_labels.csv')
X_train = np.load(directory + 'train_data.npy')
y_test = pd.read_csv('test_labels.csv')
X_test = np.load(directory + 'test_data.npy')

# Turn each label frame into a Series: column 0 supplies the index and
# column 1 supplies the values.
labelset = [
    pd.Series(frame.iloc[:, 1].tolist(), frame.iloc[:, 0].tolist())
    for frame in (y_train, y_test)
]

y_train, y_test = labelset


<a id="6"></a>
# <p style="background-color:#E598D8;font-family:newtimeroman;font-size:150%;color:#E1F16B;text-align:center;border-radius:20px 60px;">MODEL BUILDING</p>

**Steps involved in the Model Building**
* Setting up features and target as X and y
* Splitting the testing and training sets
* Build a model for each classifier under test (currently one):
  1. Support Vector Machines
* Fit all the models on training data
* Compute the 10-fold cross-validated accuracy on the training set for every model

In [None]:
# Classifiers under test, plus their display names keyed by position in the list
classifiers = [SVC()]
pipe_dict = {0: "SVC"}

# Fit every classifier on the training data
for clf in classifiers:
    clf.fit(X_train, y_train)

In [None]:
# Cross-validation: print the mean accuracy over 10 folds of the training data,
# followed by the estimator itself, for each classifier.
for i, model in enumerate(classifiers):
    fold_scores = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=10)
    mean_accuracy = fold_scores.mean()
    print("%s: %f " % (pipe_dict[i], mean_accuracy))
    print(model)

<a id="7"></a>
# <p style="background-color:#E598D8;font-family:newtimeroman;font-size:150%;color:#E1F16B;text-align:center;border-radius:20px 60px;">EVALUATING MODELS</p>
**Testing the models on the test set**
* Accuracy Report
* Confusion Matrix

In [None]:
# Model Evaluation
# Lists of the various scores; entry k belongs to classifiers[k].
precision = []
recall = []
# NOTE(review): this list rebinds the name `f1_score` imported from
# sklearn.metrics, which is why the metric is called as metrics.f1_score below.
f1_score = []
trainset_accuracy = []
testset_accuracy = []

for clf in classifiers:
    # Predict once per split and reuse the predictions for every metric.
    pred_train = clf.predict(X_train)
    pred_test = clf.predict(X_test)

    # NOTE(review): these calls use scikit-learn's default averaging, which
    # presumably expects binary labels -- confirm against the label files.
    prec = metrics.precision_score(y_test, pred_test)
    recal = metrics.recall_score(y_test, pred_test)
    f1_s = metrics.f1_score(y_test, pred_test)

    # Fix: accuracy used to come from `model`, a variable left over from another
    # cell's loop, rather than from the classifier evaluated in this iteration.
    # Scoring the predictions above also avoids predicting a second time.
    train_accuracy = metrics.accuracy_score(y_train, pred_train)
    test_accuracy = metrics.accuracy_score(y_test, pred_test)

    # Appending scores
    precision.append(prec)
    recall.append(recal)
    f1_score.append(f1_s)
    trainset_accuracy.append(train_accuracy)
    testset_accuracy.append(test_accuracy)


In [None]:
# Map each report column name to its list of scores (column order is preserved).
data = dict([
    ('Precision', precision),
    ('Recall', recall),
    ('F1score', f1_score),
    ('Accuracy on Testset', testset_accuracy),
    ('Accuracy on Trainset', trainset_accuracy),
])

# Build the report table, one row per classifier.
Results = pd.DataFrame(data, index=["SVC"])

In [None]:
# Two-colour map used to shade the cells of the results table.
cmap2 = ListedColormap(colors=["#E2CCFF", "#E598D8"])

# Last expression of the cell: the styled table is rendered as the cell output.
Results.style.background_gradient(cmap=cmap2)

In [None]:
# NOTE(review): `cmap` is created here but never passed to a plotting call;
# presumably it was meant for disp.plot(cmap=cmap) -- confirm before wiring it in.
cmap = ListedColormap(["#E1F16B", "#E598D8"])

# Draw one confusion matrix per classifier, evaluated on the test set.
for cls in classifiers:
    predictions = cls.predict(X_test)
    cm = confusion_matrix(y_test, predictions, labels=cls.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cls.classes_)

    # Fix: plotting used to happen after the loop, so only the last
    # classifier's matrix would ever be drawn.
    disp.plot()
    # Title each figure with the estimator class so it can be read on its own.
    disp.ax_.set_title(type(cls).__name__)
    plt.show()

In [None]:
# Persist the trained classifier to disk with pickle.
filename = 'finalized_model.sav'

# Fix: the saved object used to be `model`, a variable left over from another
# cell's loop. Taking the last entry of `classifiers` selects the same estimator
# without depending on that cell having been run.
final_model = classifiers[-1]

# Fix: open the file in a `with` block so the handle is closed (and the data
# flushed) even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(final_model, model_file)