Part Eight:

---
Machine Learning Techniques
*   Make the .csv File Usable
*   Encode the Labels
*   Split Data into Training & Testing Sets
*   Classify using Machine Learning Classifiers

**Import Libraries**

In [0]:
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn import metrics
from matplotlib import pyplot as plt
import seaborn as sns

**Make the .csv File Usable**

In [0]:
# Read the CSV and drop the 'filename' column in one chained expression;
# the cell treats that column as redundant for what follows.
data = pd.read_csv(
    '/content/drive/My Drive/Project/Data/Working_Data/csv_files/data_50.csv'
).drop(columns=['filename'])
print("The shape of the data after removing redundant columns:", data.shape)

**Encode the Labels**

In [0]:
# The last column of `data` is used as the class label. Learn the set of
# distinct labels, then map every row's label to its integer code.
encoder = LabelEncoder()
genre_list = data.iloc[:, -1]
y = encoder.fit(genre_list).transform(genre_list)

**Calculate Number of Components for PCA**

In [0]:
# Standardise every feature column (all columns except the trailing label)
# before PCA, so that no single feature dominates because of its scale.
scaler = StandardScaler()
x = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype = float))

# Fit PCA with all components kept, purely to inspect how much variance the
# leading components capture.
pca = PCA().fit(x)
# explained_variance_ratio_ holds fractions; convert to percent so the values
# agree with the "(%)" axis label.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_) * 100
# The k-th plotted point corresponds to keeping the first k components.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure()
plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance (%)')
plt.title('VARIANCE OF DATA')
plt.show()

**Apply PCA and Split Data into Train & Test Data**

In [0]:
# Project the standardised features onto the leading principal components.
# NOTE(review): n_components=800 cannot exceed min(n_samples, n_features) of
# `x`; presumably it was read off the cumulative-variance plot — confirm.
pca = PCA(n_components=800)
pca_x = pca.fit_transform(x)

# Hold out 20% of the samples for testing. The split is taken directly on the
# PCA projection; `x` and `scaler` are left untouched so this cell can be
# re-run without feeding already-projected data back into PCA. A fixed
# random_state makes the split, and every score derived from it, repeatable.
x_train, x_test, y_train, y_test = train_test_split(pca_x, y, test_size=0.2, random_state=0)
print("The shape of x_train is:",x_train.shape)
print("The shape of y_train is:",y_train.shape)
print("The shape of x_test is:",x_test.shape)
print("The shape of y_test is:",y_test.shape)

**Classify using Machine Learning Classifiers**

*Import Classifiers*

In [0]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neural_network import MLPClassifier
# Import from the public package path; the `nearest_centroid` submodule path
# is not importable on current scikit-learn releases.
from sklearn.neighbors import NearestCentroid
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import NuSVC
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import linear_model
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier

*Function to Draw Confusion Matrix*

In [0]:
def draw_confusion_matrix(y_test,y_pred,title):
  """Plot the confusion matrix of a set of predictions as an annotated heatmap.

  Args:
    y_test: True class labels.
    y_pred: Predicted class labels, aligned element-wise with ``y_test``.
    title: Text shown above the heatmap.

  Returns:
    None. The figure is rendered with ``plt.show()``.
  """
  cm = metrics.confusion_matrix(y_test, y_pred)
  plt.figure(figsize=(9,9))
  # Each cell is an integer sample count, so annotate with an integer format
  # rather than three decimal places.
  sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Greens')
  plt.ylabel('Actual label')
  plt.xlabel('Predicted label')
  plt.title(title, size = 15)
  plt.show()

*Classification*

In [0]:
# (display name, unfitted estimator) pairs, evaluated in the order listed.
# Keeping each name next to its estimator removes the need for a manual index
# that has to stay in step with a separate list of names.
# MLP, Gradient Boosting and SGD are seeded with random_state=0, like the
# other stochastic estimators here, so repeated runs give the same scores.
classifiers = [
    ("Bernoulli Naive Bayes", BernoulliNB()),
    ("Decision Tree", DecisionTreeClassifier(random_state=0)),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("K Nearest Neighbour", KNeighborsClassifier(n_neighbors=3)),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Linear Support Vector", LinearSVC(random_state=0, tol=1e-5)),
    ("Logistic Regression", LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')),
    ("Logistic Regression with Cross Validation", LogisticRegressionCV(cv=5, random_state=0, multi_class='multinomial')),
    ("MLP", MLPClassifier(alpha=1, max_iter=1000, random_state=0)),
    ("Nearest Centroid", NearestCentroid()),
    ("Quadratic Discriminant Analysis", QuadraticDiscriminantAnalysis()),
    ("Radius Neighbour", RadiusNeighborsClassifier(radius=120.0)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, max_depth=9,random_state=0)),
    ("Ridge", RidgeClassifier()),
    ("Ridge Classifier with Cross Validation", RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1])),
    ("NuSVC", NuSVC(gamma='scale')),
    ("SVC", SVC(gamma='auto')),
    ("Gaussian Process", GaussianProcessClassifier(kernel=1.0 * RBF(),random_state=0)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=0)),
    ("SGD", linear_model.SGDClassifier(max_iter=1000, tol=1e-3, random_state=0)),
    ("Perceptron", Perceptron(tol=1e-3, random_state=0)),
    ("Passive Agressive", PassiveAggressiveClassifier(max_iter=1000, random_state=0,tol=1e-3)),
]

# Kept as a plain list of names for any code that still refers to it.
name_models = [name for name, _ in classifiers]

for name, model in classifiers:
  model.fit(x_train,y_train)
  y_pred = model.predict(x_test)
  # Accuracy is computed from the predictions already in hand instead of
  # calling model.score, which would run prediction on x_test a second time.
  score = metrics.accuracy_score(y_test, y_pred) * 100
  draw_confusion_matrix(y_test,y_pred,name)
  print(name,"Accuracy Score:",str(score),"%")

print("Part Eight Successful!")