In [58]:
# Importing libraries
import pandas as pd
import csv
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import seaborn as sns

In [None]:
# Mount Google Drive so the raw data file is reachable from the Colab runtime

from google.colab import drive
drive.mount('/content/drive')

dp = "/content/drive/MyDrive/ML/personalProject/data.txt"

# Parse the comma-separated text file into one list of string fields per record.
# Whitespace-only lines (for example a trailing blank line at the end of the
# file) are skipped so they do not turn into bogus one-field records.
with open(dp, 'r') as f:
    textLine = [line.strip().split(',') for line in f if line.strip()]

# 'textLine' now holds the parsed records: one list of fields per data line


In [3]:
# Persist the parsed records as a CSV file in the local (Colab) working directory

# Column headers, listed in the same order as the fields of each record
attributes_names = [
    'class', 'age', 'menopause', 'tumor_size', 'inv_nodes',
    'node_caps', 'deg_malig', 'breast', 'breast_quad', 'irradiat',
]

# Path of the CSV file to generate
outPut = 'data.csv'

# The header row goes first, followed by every parsed data row
with open(outPut, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows([attributes_names, *textLine])


In [None]:
# Load the CSV into a DataFrame and discard incomplete records:
# cells equal to '?' are treated as missing, so they are converted to NaN,
# and every row that contains a NaN is then dropped
dataFrame = (
    pd.read_csv(outPut)
    .replace('?', np.nan)
    .dropna()
)

# Print every remaining row as plain text, without the index column
print(dataFrame.to_string(index=False))

In [None]:
# Encoding: turn every categorical column into integer codes

def _range_aware_key(label):
    """Sort key used to decide the order in which labels receive integer codes.

    Numbers, and strings that start with an integer (such as '5-9' or '10-14'),
    are ordered by that numeric value; every other label is ordered
    alphabetically after them. Plain string sorting would instead place
    '10-14' before '5-9'.
    """
    if isinstance(label, (int, float, np.integer, np.floating)):
        return (0, label, '')
    text = str(label)
    head = text.split('-', 1)[0]
    if head.isdecimal():
        return (0, int(head), text)
    return (1, 0, text)


# Work on a copy so the cleaned DataFrame keeps its original labels
encoded_df = dataFrame.copy()

# The first attribute is the target. It gets its own LabelEncoder, which stays
# fitted on the target after this cell, so predictions can be mapped back to
# class names with label_encoder.inverse_transform(...). (Refitting one shared
# encoder on every column would leave it fitted on the last attribute only.)
target_column = attributes_names[0]
label_encoder = LabelEncoder()
encoded_df[target_column] = label_encoder.fit_transform(dataFrame[target_column])

# Feature columns: LabelEncoder numbers labels in sorted order, which for
# strings is lexicographic and scrambles range-like labels. Build the codes
# from a range-aware ordering instead and keep each mapping for inspection.
category_codes = {}
for column in attributes_names[1:]:
    ordered_labels = sorted(dataFrame[column].unique(), key=_range_aware_key)
    category_codes[column] = {label: code for code, label in enumerate(ordered_labels)}
    encoded_df[column] = dataFrame[column].map(category_codes[column])

In [33]:
# Splitting data into train and test.
# X --> independent variables: every feature column, i.e. all columns except the target
# y --> dependent variable: the target, stored in the first column ('class')

# Column 0 is the target, so the features start at column 1. Slicing from
# column 2 would silently leave the first feature ('age') out of the model.
X = encoded_df.iloc[:, 1:]

# Take the target as a 1-D array, which is the shape the classifiers expect
y = encoded_df.iloc[:, 0].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [None]:
# SVM classifier: train, predict and evaluate

# Fit a support vector classifier with the Gaussian RBF kernel
# (other kernel choices include linear, polynomial and sigmoid)
svm = SVC(kernel='rbf', C=10, gamma='auto').fit(X_train, y_train)

# Predicted labels for the held-out test features
y_prediction = svm.predict(X_test)

# Overall accuracy on the test split
accuracy = accuracy_score(y_test, y_prediction)
print(f'Accuracy: {accuracy:.2f}')

# Confusion matrix of true vs. predicted labels (kept for the heatmap below)
print('Confusion Matrix:')
confusion = confusion_matrix(y_test, y_prediction)
print(confusion)

# Per-class precision / recall / F1 summary
print('Classification Report:')
svm_report = classification_report(y_test, y_prediction)
print(svm_report)

# Render the confusion matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# Logistic regression classifier: train, predict and evaluate

# Fit a logistic regression model with its default settings
model = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out test features
y_pred = model.predict(X_test)

# Overall accuracy on the test split (reuses the 'accuracy' name)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Confusion matrix of true vs. predicted labels
print('Confusion Matrix:')
logreg_confusion = confusion_matrix(y_test, y_pred)
print(logreg_confusion)

# Per-class precision / recall / F1 summary
print('Classification Report:')
logreg_report = classification_report(y_test, y_pred)
print(logreg_report)