In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)


In [None]:
# Read the dataset CSV into a DataFrame and display it.
data = pd.read_csv(
    '/kaggle/input/biomechanical-features-of-orthopedic-patients/column_3C_weka.csv'
)
data

# Configuring the Data


In [None]:
# Encode the target column as a binary flag:
#   1 = "Hernia"
#   0 = every other label. Only "Hernia" is tested for, so 0 is not strictly
#       "Normal" -- the 3-class file presumably also contains
#       Spondylolisthesis rows; verify against the CSV.
# NOTE: re-running this cell after encoding yields all zeros, because the
# column no longer holds the original strings.
data["class"] = (data["class"] == "Hernia").astype("int64")

In [None]:
# Display the frame again to check the encoded "class" column.
data

In [None]:
# Print the column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics, transposed so each feature is one row.
data.describe().T

In [None]:
# Scatter plot of sacral slope (y-axis) against pelvic radius (x-axis).
radius = data["pelvic_radius"]
slope = data["sacral_slope"]
plt.scatter(radius, slope)
plt.xlabel("pelvic radius")
plt.ylabel("sacral slope")
plt.show()

By examining the values above, we can better understand our data.

# Linear Regression
In statistics, linear regression is a linear approach to modeling the relationship between a scalar response (or dependent variable) and one or more explanatory variables (or independent variables). The case of one explanatory variable is called simple linear regression. <br>

* y = b0 + b1*x
* b0 = constant
* b1 = coeff
* x = value

In [None]:
from sklearn.linear_model import LinearRegression

# Estimator shared by the simple-regression cells below.
linear_reg = LinearRegression()

# sacral_slope VS pelvic_radius: show the Python type of each column object.
sacral_slope_type = type(data["sacral_slope"])
pelvic_radius_type = type(data["pelvic_radius"])
print("sacral_slope type: ", sacral_slope_type)
print("pelvic_radius type: ", pelvic_radius_type)

In [None]:
# Selecting with a one-element list yields an (n, 1) array directly, which is
# the 2-D shape the estimator expects for both feature and target here.
x = data[["sacral_slope"]].values
y = data[["pelvic_radius"]].values

# Fit pelvic_radius as a linear function of sacral_slope.
linear_reg.fit(x, y)

In [None]:
# Predicting at x = 0 gives the point where the fitted line crosses the
# y-axis; any other input value could be substituted for 0.
origin = [[0]]
b0 = linear_reg.predict(origin)
print("b0: ", b0)

In [None]:
# Another way: read the intercept directly from the fitted estimator
# instead of predicting at x = 0.
b0 = linear_reg.intercept_
print("b0: ", b0)

In [None]:
# Slope (coefficient) of the fitted line.
b1 = linear_reg.coef_
print("b1 = ", b1)

For example, if the sacral slope is 45, the model predicts the following pelvic radius.

In [None]:
# Predicted pelvic radius for a sacral slope of 45.
print(linear_reg.predict([[45]]))

We can use the following method to see how the predicted pelvic radius changes with the sacral slope.

In [None]:
# Evaluate the fitted line on an evenly spaced grid of sacral-slope values
# (5, 10, ..., 100) and draw it in red over the observed points.
slope_grid = np.arange(5, 101, 5).reshape(-1, 1)
y_head = linear_reg.predict(slope_grid)

plt.scatter(x, y)
plt.plot(slope_grid, y_head, color="red")
plt.show()

# Multiple Linear Regression
Multiple linear regression (MLR), also known simply as multiple regression, is a statistical technique that uses several explanatory variables to predict the outcome of a response variable. Multiple regression is an extension of linear (OLS) regression that uses just one explanatory variable.


In [None]:
# Target: the class label as an (n, 1) column.
y = data[["class"]].values
# Features: every remaining column.
x = data.drop(columns=["class"]).values

In [None]:
# fit() returns the estimator itself, so construction and fitting can be
# chained; the trailing expression keeps the estimator as the cell output.
multiple_linear_regression = LinearRegression().fit(x, y)
multiple_linear_regression

In [None]:
# Intercept: the predicted value when every feature is 0.
intercept = multiple_linear_regression.intercept_
# One coefficient (slope) per feature column.
coefficients = multiple_linear_regression.coef_

print("b0: ", intercept)
print("b1,b2,b3,b4,b5,b6: ", coefficients)

Now I have to enter the values. For example, you are a doctor and you have this data. When we enter this data in order, it will show us whether there is a hernia. <br><br>

* pelvic_incidence = 57.26
* pelvic_tilt = 19.98
* lumbar_lordosis_angle = 38.63
* sacral_slope = 31.43
* pelvic_radius = 115.098
* degree_spondylolisthesis = 4.4512

In [None]:
# One patient's six feature values as a single-row 2-D array, in the order
# listed in the accompanying text.
patient = np.array([[57.26, 19.98, 38.63, 31.43, 115.098, 4.4512]])
multiple_linear_regression.predict(patient)

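Linear regression returns a continuous value rather than a class label. Since the prediction is not 1 (the value that encodes hernia), we read the result as not a hernia.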

# Polynomial Regression
Polynomial Regression is a form of linear regression in which the relationship between the independent variable x and dependent variable y is modeled as an nth degree polynomial. Polynomial regression fits a nonlinear relationship between the value of x and the corresponding conditional mean of y, denoted E(y |x)


# Decision Tree Regression
Decision tree regression: a decision tree builds regression or classification models in the form of a tree structure. It breaks down a dataset into smaller and smaller subsets while at the same time an associated decision tree is incrementally developed. Decision trees can handle both categorical and numerical data.

In [None]:
# Feature: the binary class label; target: sacral_slope. Both are (n, 1) columns.
x = data[["class"]].values
y = data[["sacral_slope"]].values

In [None]:
from sklearn.tree import DecisionTreeRegressor

# fit() returns the estimator, so the trailing expression still shows it as
# the cell output.
tree_reg = DecisionTreeRegressor().fit(x, y)
tree_reg

In [None]:
# Predict sacral_slope for every training row from its class label.
y_head = tree_reg.predict(x)

# Observations in red, tree predictions in green.
plt.scatter(x,y,color="red")
plt.plot(x,y_head,color="green")
# x holds the encoded class and y holds sacral_slope; the previous labels
# ("Grandstand Level" / "Price") did not describe this data.
plt.xlabel("class (1 = hernia, 0 = other)")
plt.ylabel("sacral_slope")
plt.show()

**As can be seen, the chart here does not show a fully satisfactory result. This part is better described at the bottom.**

# KNN Algorithm
The k-nearest neighbors (KNN) algorithm is a simple, supervised machine learning algorithm that can be used to solve both classification and regression problems.

In [None]:
# Split the rows by encoded class once, then plot each group in its own colour.
hernia_rows = data[data["class"] == 1]
normal_rows = data[data["class"] == 0]

plt.scatter(hernia_rows.pelvic_radius, hernia_rows.sacral_slope,
            color="red", label="hernia", alpha=0.6)
plt.scatter(normal_rows.pelvic_radius, normal_rows.sacral_slope,
            color="green", label="normal", alpha=0.6)
plt.xlabel("pelvic_radius")
plt.ylabel("sacral_slope")
plt.legend()
plt.show()

- Choose a value for K
- Find the K nearest data points
- Count how many of those K nearest neighbors belong to each class
- Assign the tested point to the class with the most neighbors

In [None]:
# Labels as a flat array; features stay a DataFrame so they can be scaled per column.
y = data["class"].values
x_data = data.drop(columns=["class"])

### normalization

In [None]:
# Min-max scale each feature column to [0, 1].
# Use the DataFrame's own min()/max() (column-wise by default) rather than
# np.min/np.max: NumPy forwards axis=None, which pandas >= 2.0 treats as
# "reduce over both axes", so every column would be scaled by one global range.
x = (x_data - x_data.min())/(x_data.max() - x_data.min())

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

In [None]:
# KNN classifier with k = 3 (n_neighbors is the number of neighbours that vote).
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
prediction = knn.predict(x_test)

In [None]:
prediction # predicted class labels for the test rows

In [None]:
# Test-set accuracy of the k = 3 model, as a percentage
# (the original author recorded 79.5% for this split).
accuracy_pct = knn.score(x_test, y_test) * 100
print("{} knn score: {} ".format(3, accuracy_pct))

In [None]:
# Sweep k from 1 to 14, recording the test accuracy of each fitted model.
k_values = range(1, 15)
score_list = []

for k in k_values:
    candidate = KNeighborsClassifier(n_neighbors=k)
    candidate.fit(x_train, y_train)
    accuracy = candidate.score(x_test, y_test)
    score_list.append(accuracy)

# Accuracy as a function of k.
plt.plot(k_values, score_list)
plt.xlabel("k values")
plt.ylabel("accuracy")
plt.show()

When we examine the plot, we see that the most appropriate value is 13.

In [None]:
# Refit with k = 13 before scoring: the existing `knn` object was trained with
# n_neighbors=3, so scoring it here would repeat the k = 3 accuracy under a
# k = 13 label.
knn_13 = KNeighborsClassifier(n_neighbors=13)
knn_13.fit(x_train, y_train)
print("{} knn score: {} ".format(13, knn_13.score(x_test, y_test)*100))

# Support Vector Machine (SVM)
A support vector machine (SVM) is a supervised machine learning model that uses classification algorithms for two-group classification problems. After an SVM model is given sets of labeled training data for each category, it is able to categorize new examples. Here we use it to classify patients as hernia or not hernia.

In [None]:
from IPython.display import Image

# Embed a remote illustration (an SVM classification figure, judging by the URL).
Image(
    url="https://www.researchgate.net/publication/304611323/figure/fig8/AS:668377215406089@1536364954428/Classification-of-data-by-support-vector-machine-SVM.png"
)

In [None]:
# Labels as a flat array; features as a DataFrame with the label column removed.
y = data["class"].values
x_data = data.drop(["class"],axis=1)

# normalization: min-max scale each feature column to [0, 1].
# DataFrame.min()/max() reduce column-wise by default; np.min/np.max forward
# axis=None, which pandas >= 2.0 treats as a reduction over both axes.
x = (x_data - x_data.min())/(x_data.max() - x_data.min())

# train test split: 30% held out, fixed seed for a reproducible split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=1)

In [None]:
from sklearn.svm import SVC

# Train a support vector classifier and report its test accuracy in percent.
svm = SVC(random_state=42)
svm.fit(x_train, y_train)

svm_accuracy = svm.score(x_test, y_test) * 100
print("Accuracy of SVM algo: ", svm_accuracy)

# Naive Bayes Classification
Naive Bayes classifiers are a collection of classification algorithms based on Bayes' Theorem. It is not a single algorithm but a family of algorithms where all of them share a common principle, i.e. every pair of features being classified is independent of each other.

In [None]:
# Labels as a flat array; features as a DataFrame with the label column removed.
y = data["class"].values
x_data = data.drop(["class"],axis=1)

# normalization: min-max scale each feature column to [0, 1].
# DataFrame.min()/max() reduce column-wise by default; np.min/np.max forward
# axis=None, which pandas >= 2.0 treats as a reduction over both axes.
x = (x_data - x_data.min())/(x_data.max() - x_data.min())

# train test split: 30% held out, fixed seed for a reproducible split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=1)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian naive Bayes classifier and report its test accuracy in percent.
nb = GaussianNB()
nb.fit(x_train, y_train)

nb_accuracy = nb.score(x_test, y_test) * 100
print("Accuracy of naive_bayes algo:", nb_accuracy)

#  Decision Tree Classification

In [None]:
# Labels as a flat array; features as a DataFrame with the label column removed.
y = data["class"].values
x_data = data.drop(["class"],axis=1)

# normalization: min-max scale each feature column to [0, 1].
# DataFrame.min()/max() reduce column-wise by default; np.min/np.max forward
# axis=None, which pandas >= 2.0 treats as a reduction over both axes.
x = (x_data - x_data.min())/(x_data.max() - x_data.min())

# train test split: 30% held out, fixed seed for a reproducible split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=1)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree classifier (seeded) and report its test accuracy in percent.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)

dt_accuracy = dt.score(x_test, y_test) * 100
print("Score: ", dt_accuracy)

# Random Forest Classification
Random forests or random decision forests are an ensemble learning method for classification, regression and other tasks that operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees. Random decision forests correct for decision trees' habit of overfitting to their training set.

In [None]:
# Labels as a flat array; features as a DataFrame with the label column removed.
y = data["class"].values
x_data = data.drop(["class"],axis=1)

# normalization: min-max scale each feature column to [0, 1].
# DataFrame.min()/max() reduce column-wise by default; np.min/np.max forward
# axis=None, which pandas >= 2.0 treats as a reduction over both axes.
x = (x_data - x_data.min())/(x_data.max() - x_data.min())

# train test split: 30% held out, fixed seed for a reproducible split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees (n_estimators), seeded for reproducibility.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

rf_accuracy = rf.score(x_test, y_test) * 100
print("Score: ", rf_accuracy)

# Evaluation Regression Models
After building a number of different regression models, there is a wealth of criteria by which they can be evaluated and compared. RMSE is a popular formula to measure the error rate of a regression model; in this section we use the R² (coefficient of determination) score instead.

In [None]:
# Target as an (n, 1) column; features as a plain NumPy array.
y = data["class"].values.reshape(-1,1)
x_data = data.drop(["class"],axis=1).values

# normalization: min-max scale each feature column to [0, 1].
# x_data is an ndarray here, so the reduction axis must be given explicitly;
# without axis=0 the min/max are taken over the whole matrix and every column
# is scaled by the same global range, unlike the per-feature scaling used in
# the other sections.
col_min = x_data.min(axis=0)
col_max = x_data.max(axis=0)
x = (x_data - col_min)/(col_max - col_min)

# Refit the shared `linear_reg` estimator on all features and predict on the
# same rows it was trained on.
linear_reg.fit(x,y)
y_head = linear_reg.predict(x)

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination between the true labels and the predictions.
r_square = r2_score(y, y_head)
print("r_square score: ", r_square)